diff --git a/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ea80e4cd1b2e4fc6535f0c32ab3195b41079ba9f
--- /dev/null
+++ b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c42afe0d84d6312de0c4d2bb7302c5d9db244998ba1df02858f948affbce9249
+size 10261699974
diff --git a/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest
new file mode 100644
index 0000000000000000000000000000000000000000..02f8b21f7d75fb06ab0c850d040bab5a1b2af782
--- /dev/null
+++ b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest
@@ -0,0 +1 @@
+global_step7000
\ No newline at end of file
diff --git a/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/zero_to_fp32.py b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 or 3 DeepSpeed checkpoints. It gets
+# copied into the top-level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0  # falsy by default; the __main__ block rebinds it from the --debug flag to enable extra prints
+
+# every checkpoint file is loaded with map_location=device, i.e. onto the cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    # Copy the frozen parameters recorded in zero_model_states[0] into ``state_dict`` (in place); no-op if none.
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # the fragment is stored as-is: no narrowing or reshaping to ``shape`` happens here
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    # Fill ``state_dict`` (in place) with the trainable params rebuilt from the zero1/2 fp32 partitions.
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol: for each param group, concatenate the per-file flat fp32 partitions in
+    # file order into one vector, then slice consecutive params out of it in ``param_shapes`` order;
+    # whatever is left at the tail is treated as alignment padding and only range-checked below.
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes without a callable numel() (presumably plain sequences) go through math.prod instead
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    # Rebuild each frozen param from the fragments of all model states and store it in ``state_dict`` (in place).
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # join this param's fragment from every model state in order, drop the tail beyond numel, reshape
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # the partition info below is only consumed by the debug print
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    # Fill ``state_dict`` (in place) with the trainable params rebuilt from the per-file zero3 flat fp32 tensors.
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size  # takes the first flat tensor's size for all of them
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # offset counted elements within one flat tensor; scale it to compare against the total over all of them
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":  # command-line entry point: zero_to_fp32.py checkpoint_dir output_file [options]
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug  # rebinds the module-level flag that the helper functions read
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/log.txt b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/log.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d78e7fd79a6f475b2fa3aab2be22aaf7fdc26e15
--- /dev/null
+++ b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/log.txt
@@ -0,0 +1,32384 @@
+[2026-02-06 15:54:46,504] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:47,905] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
+[2026-02-06 15:54:47,906] [INFO] [runner.py:568:main] cmd = /usr/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None pretrain_demo_qwenaudio.py --num_worker 16 --save_path /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked --save_steps 7000 --logging_steps 1 --eval_steps 4000001 --train_batch_size 80 --micro_train_batch_size 10 --micro_eval_batch_size 8 --model_path /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-1227_4x3000h_multistage/ckpts/global_step34000_hf_merged_lora --tokenizer_path /fs/nlp/common/plms/qwen-audio/qwen3-1.7b-chat-audio-whisper-v3-convproj --max_epochs 3 --max_len 1024 --zero_stage 2 --max_ckpt_num 5 --learning_rate 2e-5 --flash_attn --use_custom_qwen3 --dataset_config /fs/nlp/common_intern/meiyuxiang/uniscale_multimodal/scripts/myx/data_config/train_config_12x1000h_mls.yaml --tensorboard-dir /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs --n_mels 128 --audio_subsampling_scale 4 --packing_samples --seed 421 --freeze_llm --freeze_audio --lora_rank 32 --lora_alpha 64 --specaug_policy SM --post_audio_ratio 1 --gradient_checkpointing --load_checkpoint --bf16 --use_zipper_lora --use_lid_router --zipper_lora_num_languages 12 --zipper_lora_r 32 --trainable_A --zipper_lora_scope audio --zipper_lora_use_independent
+[2026-02-06 15:54:50,123] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:51,405] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.19.3
+[2026-02-06 15:54:51,405] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth1
+[2026-02-06 15:54:51,405] [INFO] [launch.py:138:main] 0 NCCL_CUMEM_HOST_ENABLE=0
+[2026-02-06 15:54:51,405] [INFO] [launch.py:138:main] 0 NCCL_IB_HCA=^=mlx5_4
+[2026-02-06 15:54:51,405] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
+[2026-02-06 15:54:51,405] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0
+[2026-02-06 15:54:51,405] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
+[2026-02-06 15:54:51,405] [INFO] [launch.py:163:main] dist_world_size=4
+[2026-02-06 15:54:51,405] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
+[2026-02-06 15:54:51,406] [INFO] [launch.py:253:main] process 707055 spawned with command: ['/usr/bin/python', '-u', 'pretrain_demo_qwenaudio.py', '--local_rank=0', '--num_worker', '16', '--save_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked', '--save_steps', '7000', '--logging_steps', '1', '--eval_steps', '4000001', '--train_batch_size', '80', '--micro_train_batch_size', '10', '--micro_eval_batch_size', '8', '--model_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-1227_4x3000h_multistage/ckpts/global_step34000_hf_merged_lora', '--tokenizer_path', '/fs/nlp/common/plms/qwen-audio/qwen3-1.7b-chat-audio-whisper-v3-convproj', '--max_epochs', '3', '--max_len', '1024', '--zero_stage', '2', '--max_ckpt_num', '5', '--learning_rate', '2e-5', '--flash_attn', '--use_custom_qwen3', '--dataset_config', '/fs/nlp/common_intern/meiyuxiang/uniscale_multimodal/scripts/myx/data_config/train_config_12x1000h_mls.yaml', '--tensorboard-dir', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs', '--n_mels', '128', '--audio_subsampling_scale', '4', '--packing_samples', '--seed', '421', '--freeze_llm', '--freeze_audio', '--lora_rank', '32', '--lora_alpha', '64', '--specaug_policy', 'SM', '--post_audio_ratio', '1', '--gradient_checkpointing', '--load_checkpoint', '--bf16', '--use_zipper_lora', '--use_lid_router', '--zipper_lora_num_languages', '12', '--zipper_lora_r', '32', '--trainable_A', '--zipper_lora_scope', 'audio', '--zipper_lora_use_independent']
+[2026-02-06 15:54:51,407] [INFO] [launch.py:253:main] process 707056 spawned with command: ['/usr/bin/python', '-u', 'pretrain_demo_qwenaudio.py', '--local_rank=1', '--num_worker', '16', '--save_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked', '--save_steps', '7000', '--logging_steps', '1', '--eval_steps', '4000001', '--train_batch_size', '80', '--micro_train_batch_size', '10', '--micro_eval_batch_size', '8', '--model_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-1227_4x3000h_multistage/ckpts/global_step34000_hf_merged_lora', '--tokenizer_path', '/fs/nlp/common/plms/qwen-audio/qwen3-1.7b-chat-audio-whisper-v3-convproj', '--max_epochs', '3', '--max_len', '1024', '--zero_stage', '2', '--max_ckpt_num', '5', '--learning_rate', '2e-5', '--flash_attn', '--use_custom_qwen3', '--dataset_config', '/fs/nlp/common_intern/meiyuxiang/uniscale_multimodal/scripts/myx/data_config/train_config_12x1000h_mls.yaml', '--tensorboard-dir', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs', '--n_mels', '128', '--audio_subsampling_scale', '4', '--packing_samples', '--seed', '421', '--freeze_llm', '--freeze_audio', '--lora_rank', '32', '--lora_alpha', '64', '--specaug_policy', 'SM', '--post_audio_ratio', '1', '--gradient_checkpointing', '--load_checkpoint', '--bf16', '--use_zipper_lora', '--use_lid_router', '--zipper_lora_num_languages', '12', '--zipper_lora_r', '32', '--trainable_A', '--zipper_lora_scope', 'audio', '--zipper_lora_use_independent']
+[2026-02-06 15:54:51,407] [INFO] [launch.py:253:main] process 707057 spawned with command: ['/usr/bin/python', '-u', 'pretrain_demo_qwenaudio.py', '--local_rank=2', '--num_worker', '16', '--save_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked', '--save_steps', '7000', '--logging_steps', '1', '--eval_steps', '4000001', '--train_batch_size', '80', '--micro_train_batch_size', '10', '--micro_eval_batch_size', '8', '--model_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-1227_4x3000h_multistage/ckpts/global_step34000_hf_merged_lora', '--tokenizer_path', '/fs/nlp/common/plms/qwen-audio/qwen3-1.7b-chat-audio-whisper-v3-convproj', '--max_epochs', '3', '--max_len', '1024', '--zero_stage', '2', '--max_ckpt_num', '5', '--learning_rate', '2e-5', '--flash_attn', '--use_custom_qwen3', '--dataset_config', '/fs/nlp/common_intern/meiyuxiang/uniscale_multimodal/scripts/myx/data_config/train_config_12x1000h_mls.yaml', '--tensorboard-dir', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs', '--n_mels', '128', '--audio_subsampling_scale', '4', '--packing_samples', '--seed', '421', '--freeze_llm', '--freeze_audio', '--lora_rank', '32', '--lora_alpha', '64', '--specaug_policy', 'SM', '--post_audio_ratio', '1', '--gradient_checkpointing', '--load_checkpoint', '--bf16', '--use_zipper_lora', '--use_lid_router', '--zipper_lora_num_languages', '12', '--zipper_lora_r', '32', '--trainable_A', '--zipper_lora_scope', 'audio', '--zipper_lora_use_independent']
+[2026-02-06 15:54:51,408] [INFO] [launch.py:253:main] process 707058 spawned with command: ['/usr/bin/python', '-u', 'pretrain_demo_qwenaudio.py', '--local_rank=3', '--num_worker', '16', '--save_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked', '--save_steps', '7000', '--logging_steps', '1', '--eval_steps', '4000001', '--train_batch_size', '80', '--micro_train_batch_size', '10', '--micro_eval_batch_size', '8', '--model_path', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-1227_4x3000h_multistage/ckpts/global_step34000_hf_merged_lora', '--tokenizer_path', '/fs/nlp/common/plms/qwen-audio/qwen3-1.7b-chat-audio-whisper-v3-convproj', '--max_epochs', '3', '--max_len', '1024', '--zero_stage', '2', '--max_ckpt_num', '5', '--learning_rate', '2e-5', '--flash_attn', '--use_custom_qwen3', '--dataset_config', '/fs/nlp/common_intern/meiyuxiang/uniscale_multimodal/scripts/myx/data_config/train_config_12x1000h_mls.yaml', '--tensorboard-dir', '/fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs', '--n_mels', '128', '--audio_subsampling_scale', '4', '--packing_samples', '--seed', '421', '--freeze_llm', '--freeze_audio', '--lora_rank', '32', '--lora_alpha', '64', '--specaug_policy', 'SM', '--post_audio_ratio', '1', '--gradient_checkpointing', '--load_checkpoint', '--bf16', '--use_zipper_lora', '--use_lid_router', '--zipper_lora_num_languages', '12', '--zipper_lora_r', '32', '--trainable_A', '--zipper_lora_scope', 'audio', '--zipper_lora_use_independent']
+[2026-02-06 15:54:55,747] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:55,749] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:55,769] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:55,807] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2026-02-06 15:54:56,142] [INFO] [comm.py:637:init_distributed] cdb=None
+[2026-02-06 15:54:56,142] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2026-02-06 15:54:56,294] [INFO] [comm.py:637:init_distributed] cdb=None
+[2026-02-06 15:54:56,299] [INFO] [comm.py:637:init_distributed] cdb=None
+[W CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2026-02-06 15:54:56,313] [INFO] [comm.py:637:init_distributed] cdb=None
+[W CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+2026-02-06 15:54:56.826 | INFO     | model.unigpt_audio_models.tokenization_qwen_audio:__init__:147 - vocab_size: 155165, audio_start_id: 155163, audio_end_id: 155164, audio_pad_id: 151851.
+2026-02-06 15:54:56.837 | INFO     | model.unigpt_audio_models.tokenization_qwen_audio:__init__:147 - vocab_size: 155165, audio_start_id: 155163, audio_end_id: 155164, audio_pad_id: 151851.
+2026-02-06 15:54:56.852 | INFO     | model.unigpt_audio_models.tokenization_qwen_audio:__init__:147 - vocab_size: 155165, audio_start_id: 155163, audio_end_id: 155164, audio_pad_id: 151851.
+loading dataset com_voice_ar with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]loading dataset com_voice_ar with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]2026-02-06 15:54:56.882 | INFO     | model.unigpt_audio_models.tokenization_qwen_audio:__init__:147 - vocab_size: 155165, audio_start_id: 155163, audio_end_id: 155164, audio_pad_id: 151851.
+loading dataset com_voice_ar with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]loading dataset com_voice_ar with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]3842it [00:00, 38411.06it/s]4013it [00:00, 40125.73it/s]4113it [00:00, 41126.12it/s]4034it [00:00, 40331.96it/s]6363it [00:00, 39199.68it/s]
+2026-02-06 15:54:57.025 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1272 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]6363it [00:00, 40098.20it/s]
+2026-02-06 15:54:57.030 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1272 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]6363it [00:00, 41130.26it/s]
+2026-02-06 15:54:57.040 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1272 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]6363it [00:00, 40521.02it/s]
+2026-02-06 15:54:57.074 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1272 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]4790it [00:00, 47893.77it/s]4790it [00:00, 47894.34it/s]4872it [00:00, 48713.43it/s]6363it [00:00, 44400.52it/s]
+2026-02-06 15:54:57.170 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 6363 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_de with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]6363it [00:00, 45730.01it/s]
+2026-02-06 15:54:57.171 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 6363 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_de with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]4765it [00:00, 47643.34it/s]6363it [00:00, 44804.38it/s]
+2026-02-06 15:54:57.184 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 6363 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_de with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]6363it [00:00, 43686.88it/s]
+2026-02-06 15:54:57.221 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 6363 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ar_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_de with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]12054it [00:00, 120532.64it/s]11938it [00:00, 119369.87it/s]13264it [00:00, 132635.07it/s]12219it [00:00, 122181.96it/s]25080it [00:00, 126250.30it/s]24644it [00:00, 123886.44it/s]26528it [00:00, 132138.25it/s]25041it [00:00, 125726.67it/s]38286it [00:00, 128898.33it/s]37722it [00:00, 127026.06it/s]39875it [00:00, 132739.82it/s]38166it [00:00, 128245.61it/s]50425it [00:00, 126782.21it/s]51176it [00:00, 127893.43it/s]53150it [00:00, 130858.69it/s]50991it [00:00, 127251.50it/s]63529it [00:00, 131284.08it/s]63529it [00:00, 127568.24it/s]
+
+63318it [00:00, 127551.39it/s]63529it [00:00, 126492.59it/s]
+2026-02-06 15:54:57.690 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 32304 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_de_pack_new.jsonl
+2026-02-06 15:54:57.691 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 32304 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_de_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_es with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_es with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+2026-02-06 15:54:57.696 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 32304 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_de_pack_new.jsonl
+0it [00:00, ?it/s]0it [00:00, ?it/s]{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_es with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]63529it [00:00, 127999.71it/s]
+2026-02-06 15:54:57.742 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 32304 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_de_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_es with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]11831it [00:00, 118293.19it/s]11890it [00:00, 118892.18it/s]12224it [00:00, 122225.55it/s]11982it [00:00, 119810.69it/s]24781it [00:00, 124879.51it/s]24841it [00:00, 125126.10it/s]25096it [00:00, 126041.46it/s]30551it [00:00, 122394.96it/s]30551it [00:00, 122618.78it/s]
+
+24902it [00:00, 125326.32it/s]30551it [00:00, 123978.23it/s]
+2026-02-06 15:54:57.952 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33300 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+2026-02-06 15:54:57.952 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33300 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]0it [00:00, ?it/s]2026-02-06 15:54:57.955 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33300 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]30551it [00:00, 123154.99it/s]
+2026-02-06 15:54:58.004 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33300 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+0it [00:00, ?it/s]11431it [00:00, 114301.66it/s]11436it [00:00, 114352.75it/s]11471it [00:00, 114698.90it/s]10879it [00:00, 108770.14it/s]22862it [00:00, 114247.67it/s]22872it [00:00, 114273.10it/s]23000it [00:00, 115041.01it/s]22331it [00:00, 112139.32it/s]30551it [00:00, 114470.87it/s]30551it [00:00, 114488.16it/s]
+
+30551it [00:00, 115347.77it/s]
+2026-02-06 15:54:58.221 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30551 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+2026-02-06 15:54:58.221 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30551 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+
+loading dataset mls_fr with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>loading dataset mls_fr with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+
+0it [00:00, ?it/s]0it [00:00, ?it/s]2026-02-06 15:54:58.222 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30551 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_fr with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]30551it [00:00, 113299.43it/s]
+2026-02-06 15:54:58.276 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30551 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_es_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_fr with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]8579it [00:00, 85787.22it/s]8584it [00:00, 85832.51it/s]8723it [00:00, 87214.28it/s]9023it [00:00, 90224.49it/s]17205it [00:00, 86055.47it/s]17205it [00:00, 86048.09it/s]17445it [00:00, 86638.27it/s]18046it [00:00, 88740.46it/s]26001it [00:00, 86917.97it/s]26001it [00:00, 86919.99it/s]26282it [00:00, 87419.71it/s]27001it [00:00, 89103.44it/s]35066it [00:00, 88389.73it/s]35067it [00:00, 88388.91it/s]35213it [00:00, 88160.18it/s]36216it [00:00, 87592.66it/s]
+36216it [00:00, 87400.99it/s]36216it [00:00, 87402.15it/s]
+
+2026-02-06 15:54:58.658 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33637 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_fr_pack_new.jsonl
+2026-02-06 15:54:58.658 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33637 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_fr_pack_new.jsonl
+2026-02-06 15:54:58.658 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33637 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_fr_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_it with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_it with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_it with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]36216it [00:00, 90183.48it/s]
+2026-02-06 15:54:58.700 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 33637 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_fr_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_it with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]8282it [00:00, 125481.25it/s]8282it [00:00, 125765.64it/s]
+
+8282it [00:00, 124997.21it/s]
+2026-02-06 15:54:58.726 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 16564 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_it_pack_new.jsonl
+2026-02-06 15:54:58.726 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 16564 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_it_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}loading dataset com_voice_ja with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+
+loading dataset com_voice_ja with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+2026-02-06 15:54:58.727 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 16564 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_it_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ja with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]8282it [00:00, 142709.59it/s]
+2026-02-06 15:54:58.761 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 16564 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_it_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ja with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]4155it [00:00, 41545.58it/s]4155it [00:00, 41542.61it/s]4166it [00:00, 41655.27it/s]4335it [00:00, 43346.42it/s]8604it [00:00, 43273.96it/s]8472it [00:00, 42497.95it/s]8544it [00:00, 42901.65it/s]8819it [00:00, 44219.97it/s]12993it [00:00, 43553.41it/s]12901it [00:00, 43312.81it/s]12868it [00:00, 43050.63it/s]13241it [00:00, 44092.33it/s]17319it [00:00, 43099.45it/s]
+17233it [00:00, 43009.32it/s]17174it [00:00, 42706.87it/s]2026-02-06 15:54:59.130 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1095 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ja_pack_new.jsonl
+17319it [00:00, 42835.46it/s]
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ko with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+2026-02-06 15:54:59.132 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1095 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ja_pack_new.jsonl
+0it [00:00, ?it/s]17319it [00:00, 42648.83it/s]
+2026-02-06 15:54:59.134 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1095 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ja_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ko with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ko with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]235it [00:00, 41105.19it/s]
+2026-02-06 15:54:59.139 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 235 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ko_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_pt with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]235it [00:00, 56971.36it/s]
+2026-02-06 15:54:59.140 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 235 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ko_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_pt with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]235it [00:00, 56172.65it/s]
+2026-02-06 15:54:59.142 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 235 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ko_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_pt with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]17319it [00:00, 43751.77it/s]
+2026-02-06 15:54:59.158 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1095 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ja_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_ko with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]235it [00:00, 55517.71it/s]
+2026-02-06 15:54:59.165 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 235 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_ko_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset mls_pt with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]5309it [00:00, 124458.88it/s]5309it [00:00, 126229.76it/s]
+
+5309it [00:00, 132515.81it/s]
+2026-02-06 15:54:59.183 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1061 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_pt_pack_new.jsonl
+2026-02-06 15:54:59.183 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1061 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_pt_pack_new.jsonl
+2026-02-06 15:54:59.183 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1061 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_pt_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}loading dataset msr86k_ru with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+
+loading dataset msr86k_ru with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_ru with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]5309it [00:00, 150808.03it/s]
+2026-02-06 15:54:59.202 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 1061 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/mls_pt_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_ru with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]4047it [00:00, 40458.37it/s]4043it [00:00, 40421.17it/s]4054it [00:00, 40538.40it/s]4255it [00:00, 42545.68it/s]8340it [00:00, 41909.77it/s]8343it [00:00, 41934.19it/s]8366it [00:00, 42054.21it/s]8611it [00:00, 43140.52it/s]12661it [00:00, 42501.36it/s]12664it [00:00, 42515.70it/s]12661it [00:00, 42457.66it/s]12978it [00:00, 43380.60it/s]16912it [00:00, 42200.94it/s]16916it [00:00, 42199.20it/s]16907it [00:00, 42068.69it/s]17317it [00:00, 42681.16it/s]21133it [00:00, 40392.34it/s]21137it [00:00, 40391.66it/s]21115it [00:00, 40131.87it/s]21587it [00:00, 40636.82it/s]25189it [00:00, 39379.52it/s]25185it [00:00, 39354.35it/s]25143it [00:00, 38990.23it/s]25667it [00:00, 39556.95it/s]29131it [00:00, 38713.27it/s]29138it [00:00, 38706.20it/s]29054it [00:00, 38311.96it/s]29635it [00:00, 38826.83it/s]33009it [00:00, 38237.78it/s]33016it [00:00, 38235.00it/s]32893it [00:00, 37829.53it/s]33526it [00:00, 38238.40it/s]36844it [00:00, 37945.21it/s]36837it [00:00, 37888.97it/s]36681it [00:00, 37571.01it/s]37355it [00:00, 37931.29it/s]40629it [00:01, 37466.94it/s]40641it [00:01, 37447.66it/s]40441it [00:01, 37197.19it/s]41151it [00:01, 37827.14it/s]44440it [00:01, 37655.19it/s]44451it [00:01, 37640.56it/s]44216it [00:01, 37360.65it/s]44936it [00:01, 37815.64it/s]48208it [00:01, 37633.57it/s]48217it [00:01, 37627.31it/s]47954it [00:01, 37314.05it/s]48719it [00:01, 37746.34it/s]51973it [00:01, 37543.72it/s]51981it [00:01, 37537.55it/s]51692it [00:01, 37332.86it/s]52495it [00:01, 37672.12it/s]55731it [00:01, 37552.76it/s]55738it [00:01, 37544.83it/s]55438it [00:01, 37367.23it/s]56263it [00:01, 37559.83it/s]59487it [00:01, 37411.37it/s]59493it [00:01, 37406.31it/s]59176it [00:01, 37258.77it/s]60020it [00:01, 37484.41it/s]63240it [00:01, 37445.17it/s]63255it [00:01, 37468.56it/s]62952it [00:01, 37406.02it/s]63773it [00:01, 37497.20it/s]67000it [00:01, 37488.67it/s]67007it [00:01, 37482.02it/s]
66696it [00:01, 37415.40it/s]67523it [00:01, 37482.39it/s]70765it [00:01, 37534.14it/s]70776it [00:01, 37542.49it/s]70464it [00:01, 37492.61it/s]71287it [00:01, 37528.16it/s]74519it [00:01, 37518.68it/s]74531it [00:01, 37438.51it/s]74221it [00:01, 37513.37it/s]75056it [00:01, 37573.43it/s]78297it [00:02, 37594.30it/s]78284it [00:02, 37462.57it/s]77997it [00:02, 37586.45it/s]78814it [00:02, 37556.93it/s]82058it [00:02, 37542.90it/s]82057it [00:02, 37479.10it/s]81756it [00:02, 37462.41it/s]82570it [00:02, 37532.71it/s]85473it [00:02, 38265.16it/s]
+85473it [00:02, 38216.61it/s]
+85473it [00:02, 38032.60it/s]85473it [00:02, 38356.47it/s]
+
+2026-02-06 15:55:01.446 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 35898 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_ru_pack_new.jsonl
+2026-02-06 15:55:01.450 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 35898 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_ru_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_th with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_th with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+2026-02-06 15:55:01.461 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 35898 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_ru_pack_new.jsonl
+2026-02-06 15:55:01.461 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 35898 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_ru_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_th with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset com_voice_th with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]2900it [00:00, 28989.04it/s]2990it [00:00, 29889.62it/s]3014it [00:00, 30133.35it/s]3028it [00:00, 30274.18it/s]6246it [00:00, 31613.97it/s]6328it [00:00, 31936.73it/s]6332it [00:00, 31923.29it/s]6338it [00:00, 31934.97it/s]9755it [00:00, 33199.36it/s]9882it [00:00, 33578.17it/s]9803it [00:00, 33193.54it/s]9793it [00:00, 33127.72it/s]13535it [00:00, 35012.33it/s]13651it [00:00, 35199.19it/s]13570it [00:00, 34956.47it/s]13589it [00:00, 35032.77it/s]17398it [00:00, 35180.00it/s]17398it [00:00, 34957.29it/s]
+
+2026-02-06 15:55:01.985 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 34796 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_th_pack_new.jsonl
+2026-02-06 15:55:01.985 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 34796 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_th_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_vi with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_vi with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]17398it [00:00, 35064.47it/s]
+17398it [00:00, 35036.38it/s]
+2026-02-06 15:55:01.998 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 34796 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_th_pack_new.jsonl
+2026-02-06 15:55:01.998 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 34796 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/com_voice_th_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_vi with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset msr86k_vi with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]3298it [00:00, 32972.01it/s]3312it [00:00, 33117.50it/s]3479it [00:00, 34781.16it/s]3461it [00:00, 34598.98it/s]6767it [00:00, 33980.18it/s]6768it [00:00, 33963.02it/s]6921it [00:00, 34256.85it/s]6958it [00:00, 34348.30it/s]10166it [00:00, 33895.13it/s]10165it [00:00, 33767.50it/s]10347it [00:00, 34098.25it/s]10394it [00:00, 34006.12it/s]13670it [00:00, 34344.15it/s]13684it [00:00, 34323.35it/s]13815it [00:00, 34322.25it/s]13881it [00:00, 34342.15it/s]17182it [00:00, 34623.01it/s]17184it [00:00, 34565.96it/s]17318it [00:00, 34574.27it/s]17378it [00:00, 34563.07it/s]20721it [00:00, 34881.74it/s]20751it [00:00, 34937.36it/s]20842it [00:00, 34799.03it/s]20909it [00:00, 34813.15it/s]24292it [00:00, 35150.93it/s]24325it [00:00, 35196.11it/s]24406it [00:00, 35071.42it/s]24485it [00:00, 35120.42it/s]27808it [00:00, 34414.78it/s]27845it [00:00, 34408.11it/s]27914it [00:00, 34232.72it/s]27998it [00:00, 34280.42it/s]31253it [00:00, 33472.72it/s]31290it [00:00, 33496.45it/s]31342it [00:00, 33374.58it/s]31431it [00:00, 33353.66it/s]34607it [00:01, 32959.69it/s]34646it [00:01, 32935.70it/s]34686it [00:01, 32777.06it/s]34774it [00:01, 32770.14it/s]37908it [00:01, 32538.04it/s]37945it [00:01, 32563.70it/s]37969it [00:01, 32462.11it/s]38057it [00:01, 32407.32it/s]41166it [00:01, 32256.01it/s]41205it [00:01, 32281.31it/s]41219it [00:01, 32173.51it/s]41302it [00:01, 32190.29it/s]44394it [00:01, 32137.09it/s]44436it [00:01, 32137.27it/s]44439it [00:01, 31707.97it/s]44523it [00:01, 31724.76it/s]47609it [00:01, 32009.43it/s]47651it [00:01, 32009.57it/s]47612it [00:01, 31706.95it/s]47698it [00:01, 31664.14it/s]50811it [00:01, 31932.71it/s]50853it [00:01, 31925.96it/s]50784it [00:01, 31698.05it/s]50866it [00:01, 31653.17it/s]54005it [00:01, 31824.26it/s]54046it [00:01, 31826.70it/s]53955it [00:01, 31620.46it/s]54032it [00:01, 31596.93it/s]57202it [00:01, 31862.50it/s]57236it [00:01, 
31845.94it/s]57136it [00:01, 31674.87it/s]57207it [00:01, 31640.53it/s]60389it [00:01, 31645.54it/s]60421it [00:01, 31643.73it/s]60304it [00:01, 31551.87it/s]60372it [00:01, 31544.38it/s]63586it [00:01, 31622.87it/s]63554it [00:01, 31407.57it/s]63460it [00:01, 31502.79it/s]63527it [00:01, 31511.31it/s]66780it [00:02, 31714.10it/s]66744it [00:02, 31549.67it/s]66611it [00:02, 31448.19it/s]66701it [00:02, 31575.82it/s]69952it [00:02, 31622.87it/s]69900it [00:02, 31518.00it/s]69807it [00:02, 31599.36it/s]69862it [00:02, 31584.02it/s]73115it [00:02, 31594.97it/s]73081it [00:02, 31603.91it/s]72968it [00:02, 31546.12it/s]73021it [00:02, 31550.76it/s]76290it [00:02, 31637.85it/s]76301it [00:02, 31781.42it/s]76151it [00:02, 31627.90it/s]76211it [00:02, 31654.90it/s]79454it [00:02, 31625.15it/s]79480it [00:02, 31723.95it/s]79314it [00:02, 31626.35it/s]79377it [00:02, 31587.98it/s]82667it [00:02, 31773.45it/s]82692it [00:02, 31840.36it/s]82514it [00:02, 31735.75it/s]82573it [00:02, 31697.17it/s]85871it [00:02, 31851.34it/s]85898it [00:02, 31904.40it/s]85688it [00:02, 31687.85it/s]85746it [00:02, 31704.47it/s]89057it [00:02, 31780.49it/s]89089it [00:02, 31802.06it/s]88863it [00:02, 31703.00it/s]88917it [00:02, 31656.35it/s]92253it [00:02, 31833.55it/s]92277it [00:02, 31821.72it/s]92034it [00:02, 31635.75it/s]92089it [00:02, 31673.74it/s]95457it [00:02, 31893.77it/s]95496it [00:02, 31928.44it/s]95219it [00:02, 31697.42it/s]95270it [00:02, 31711.08it/s]98657it [00:03, 31924.62it/s]98694it [00:03, 31942.49it/s]98417it [00:03, 31779.75it/s]98456it [00:03, 31752.69it/s]101850it [00:03, 31849.43it/s]101889it [00:03, 31863.02it/s]101596it [00:03, 31691.11it/s]101632it [00:03, 31713.65it/s]105036it [00:03, 31838.45it/s]105076it [00:03, 31852.35it/s]104780it [00:03, 31732.35it/s]104805it [00:03, 31718.08it/s]108227it [00:03, 31858.88it/s]108272it [00:03, 31880.94it/s]107960it [00:03, 31749.26it/s]107980it [00:03, 
31724.73it/s]111431it [00:03, 31911.74it/s]111472it [00:03, 31915.45it/s]111136it [00:03, 31750.78it/s]111173it [00:03, 31785.63it/s]114628it [00:03, 31927.63it/s]114671it [00:03, 31936.06it/s]114326it [00:03, 31794.53it/s]114352it [00:03, 31776.41it/s]117857it [00:03, 32033.39it/s]117897it [00:03, 32031.96it/s]117542it [00:03, 31902.36it/s]117565it [00:03, 31881.56it/s]121061it [00:03, 32027.69it/s]121101it [00:03, 32010.49it/s]120737it [00:03, 31914.64it/s]120754it [00:03, 31880.34it/s]124264it [00:03, 31998.65it/s]124303it [00:03, 31993.39it/s]123929it [00:03, 31907.59it/s]123943it [00:03, 31879.81it/s]127468it [00:03, 32008.45it/s]127510it [00:03, 32015.37it/s]127120it [00:03, 31871.11it/s]127131it [00:03, 31859.17it/s]130669it [00:04, 31919.20it/s]130712it [00:04, 31930.28it/s]130308it [00:04, 31763.59it/s]130317it [00:04, 31753.41it/s]133861it [00:04, 31648.72it/s]133906it [00:04, 31493.00it/s]133485it [00:04, 31604.98it/s]133493it [00:04, 31612.31it/s]137027it [00:04, 31563.31it/s]137061it [00:04, 31508.98it/s]136646it [00:04, 31564.94it/s]136655it [00:04, 31552.02it/s]140239it [00:04, 31728.46it/s]140295it [00:04, 31754.36it/s]139832it [00:04, 31649.85it/s]139841it [00:04, 31642.00it/s]141611it [00:04, 32257.06it/s]
+141611it [00:04, 32257.66it/s]
+141611it [00:04, 32155.62it/s]141611it [00:04, 32157.10it/s]
+
+2026-02-06 15:55:06.409 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 36252 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_vi_pack_new.jsonl
+2026-02-06 15:55:06.409 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 36252 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_vi_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset librispeech_en with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset librispeech_en with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+2026-02-06 15:55:06.436 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 36252 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_vi_pack_new.jsonl
+2026-02-06 15:55:06.436 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 36252 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/msr86k_vi_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset librispeech_en with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+loading dataset librispeech_en with <class 'data.qwen_audio_pretrain_dataset_pack.UniAudioPretrainPackedDataset'>
+0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]0it [00:00, ?it/s]7360it [00:00, 73592.70it/s]7610it [00:00, 76096.26it/s]7162it [00:00, 71613.41it/s]7309it [00:00, 73086.76it/s]21963it [00:00, 116196.97it/s]22258it [00:00, 117490.11it/s]21171it [00:00, 111885.08it/s]21283it [00:00, 112291.38it/s]30983it [00:00, 121644.30it/s]30983it [00:00, 120784.35it/s]
+
+2026-02-06 15:55:06.749 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30983 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/librispeech_en_pack_new.jsonl
+2026-02-06 15:55:06.749 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30983 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/librispeech_en_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+
+30983it [00:00, 119658.72it/s]30983it [00:00, 119319.45it/s]
+
+2026-02-06 15:55:06.780 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30983 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/librispeech_en_pack_new.jsonl
+2026-02-06 15:55:06.780 | INFO     | data.qwen_audio_pretrain_dataset_pack:load_meta_dicts:84 - loaded dataset of 30983 samples from /fs/nlp/common_intern/meiyuxiang/data_prepare/step3_pack_2048_scale_4_fixed_1212/librispeech_en_pack_new.jsonl
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+{'en': 0, 'fr': 1, 'ru': 2, 'th': 3, 'de': 4, 'es': 5, 'it': 6, 'pt': 7, 'ar': 8, 'vi': 9, 'ja': 10, 'ko': 11}
+AudioEncoderConfig(add_audio_bos_eos_token=True, audio_attn_implementation='flash_attention_2', audio_start_id=155163, avg_pool=True, is_causal=False, n_ctx=1500, n_head=20, n_layer=32, n_mels=128, n_state=1280, output_dim=2048, post_subsampling_scale=4, use_conformer_encoder=False, use_wav2vec2_encoder=False, wav2vec2_model_dim=None, wav2vec2_num_heads=None, wav2vec2_num_layers=None, wav2vec2_ffn_dim=None, use_conv_subsampling=True, use_chunk_encoder=False, chunk_n_window=100, conv_chunksize=200, chunk_n_window_candidates=[50, 100, 200, 400], randomize_chunk_window=False, chunk_level_router=False, num_moe_experts=8)
+AudioEncoderConfig(add_audio_bos_eos_token=True, audio_attn_implementation='flash_attention_2', audio_start_id=155163, avg_pool=True, is_causal=False, n_ctx=1500, n_head=20, n_layer=32, n_mels=128, n_state=1280, output_dim=2048, post_subsampling_scale=4, use_conformer_encoder=False, use_wav2vec2_encoder=False, wav2vec2_model_dim=None, wav2vec2_num_heads=None, wav2vec2_num_layers=None, wav2vec2_ffn_dim=None, use_conv_subsampling=True, use_chunk_encoder=False, chunk_n_window=100, conv_chunksize=200, chunk_n_window_candidates=[50, 100, 200, 400], randomize_chunk_window=False, chunk_level_router=False, num_moe_experts=8)
+2026-02-06 15:55:07.043 | INFO     | model.unigpt_audio_models.audio:__init__:639 - initializing AudioEncoder with audio_attn_implementation=flash_attention_2, is_causal=False
+AudioEncoderConfig(add_audio_bos_eos_token=True, audio_attn_implementation='flash_attention_2', audio_start_id=155163, avg_pool=True, is_causal=False, n_ctx=1500, n_head=20, n_layer=32, n_mels=128, n_state=1280, output_dim=2048, post_subsampling_scale=4, use_conformer_encoder=False, use_wav2vec2_encoder=False, wav2vec2_model_dim=None, wav2vec2_num_heads=None, wav2vec2_num_layers=None, wav2vec2_ffn_dim=None, use_conv_subsampling=True, use_chunk_encoder=False, chunk_n_window=100, conv_chunksize=200, chunk_n_window_candidates=[50, 100, 200, 400], randomize_chunk_window=False, chunk_level_router=False, num_moe_experts=8)
+2026-02-06 15:55:07.051 | INFO     | model.unigpt_audio_models.audio:__init__:639 - initializing AudioEncoder with audio_attn_implementation=flash_attention_2, is_causal=False
+2026-02-06 15:55:07.059 | INFO     | model.unigpt_audio_models.audio:__init__:639 - initializing AudioEncoder with audio_attn_implementation=flash_attention_2, is_causal=False
+AudioEncoderConfig(add_audio_bos_eos_token=True, audio_attn_implementation='flash_attention_2', audio_start_id=155163, avg_pool=True, is_causal=False, n_ctx=1500, n_head=20, n_layer=32, n_mels=128, n_state=1280, output_dim=2048, post_subsampling_scale=4, use_conformer_encoder=False, use_wav2vec2_encoder=False, wav2vec2_model_dim=None, wav2vec2_num_heads=None, wav2vec2_num_layers=None, wav2vec2_ffn_dim=None, use_conv_subsampling=True, use_chunk_encoder=False, chunk_n_window=100, conv_chunksize=200, chunk_n_window_candidates=[50, 100, 200, 400], randomize_chunk_window=False, chunk_level_router=False, num_moe_experts=8)
+2026-02-06 15:55:07.100 | INFO     | model.unigpt_audio_models.audio:__init__:639 - initializing AudioEncoder with audio_attn_implementation=flash_attention_2, is_causal=False
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  3.75it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  3.67it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  3.54it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  3.16it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.43it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.31it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.31it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.20it/s]
+2026-02-06 15:55:07.747 | INFO     | model.actor:__init__:287 - *******************************************************
+2026-02-06 15:55:07.747 | INFO     | model.actor:__init__:288 - Zipper LoRA target modules: 
+['key', 'mlp.0', 'mlp.2', 'out', 'query', 'value']
+2026-02-06 15:55:07.749 | INFO     | model.actor:__init__:304 - Successfully loaded 12 language embeddings from file.
+2026-02-06 15:55:07.755 | INFO     | model.actor:__init__:287 - *******************************************************
+2026-02-06 15:55:07.755 | INFO     | model.actor:__init__:288 - Zipper LoRA target modules: 
+['key', 'mlp.0', 'mlp.2', 'out', 'query', 'value']
+2026-02-06 15:55:07.757 | INFO     | model.actor:__init__:304 - Successfully loaded 12 language embeddings from file.
+Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.11it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.01it/s]
+2026-02-06 15:55:07.787 | INFO     | model.actor:__init__:287 - *******************************************************
+2026-02-06 15:55:07.788 | INFO     | model.actor:__init__:288 - Zipper LoRA target modules: 
+['key', 'mlp.0', 'mlp.2', 'out', 'query', 'value']
+2026-02-06 15:55:07.790 | INFO     | model.actor:__init__:304 - Successfully loaded 12 language embeddings from file.
+Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.83it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.88it/s]
+2026-02-06 15:55:08.088 | INFO     | model.actor:__init__:287 - *******************************************************
+2026-02-06 15:55:08.089 | INFO     | model.actor:__init__:288 - Zipper LoRA target modules: 
+['key', 'mlp.0', 'mlp.2', 'out', 'query', 'value']
+2026-02-06 15:55:08.092 | INFO     | model.actor:__init__:304 - Successfully loaded 12 language embeddings from file.
+2026-02-06 15:55:16.097 | INFO     | model.actor:__init__:381 - LoRA target modules: 
+['model.layers.0.mlp.down_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.10.mlp.down_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.11.mlp.down_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.12.mlp.down_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.13.mlp.down_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.14.mlp.down_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.15.mlp.down_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.16.mlp.down_proj', 'model.layers.16.mlp.gate_proj', 
'model.layers.16.mlp.up_proj', 'model.layers.16.self_attn.k_proj', 'model.layers.16.self_attn.o_proj', 'model.layers.16.self_attn.q_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.17.mlp.down_proj', 'model.layers.17.mlp.gate_proj', 'model.layers.17.mlp.up_proj', 'model.layers.17.self_attn.k_proj', 'model.layers.17.self_attn.o_proj', 'model.layers.17.self_attn.q_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.18.mlp.down_proj', 'model.layers.18.mlp.gate_proj', 'model.layers.18.mlp.up_proj', 'model.layers.18.self_attn.k_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.18.self_attn.q_proj', 'model.layers.18.self_attn.v_proj', 'model.layers.19.mlp.down_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.19.mlp.up_proj', 'model.layers.19.self_attn.k_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.19.self_attn.q_proj', 'model.layers.19.self_attn.v_proj', 'model.layers.2.mlp.down_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.20.mlp.down_proj', 'model.layers.20.mlp.gate_proj', 'model.layers.20.mlp.up_proj', 'model.layers.20.self_attn.k_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.20.self_attn.q_proj', 'model.layers.20.self_attn.v_proj', 'model.layers.21.mlp.down_proj', 'model.layers.21.mlp.gate_proj', 'model.layers.21.mlp.up_proj', 'model.layers.21.self_attn.k_proj', 'model.layers.21.self_attn.o_proj', 'model.layers.21.self_attn.q_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.22.mlp.down_proj', 'model.layers.22.mlp.gate_proj', 'model.layers.22.mlp.up_proj', 'model.layers.22.self_attn.k_proj', 'model.layers.22.self_attn.o_proj', 'model.layers.22.self_attn.q_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.23.mlp.down_proj', 'model.layers.23.mlp.gate_proj', 'model.layers.23.mlp.up_proj', 'model.layers.23.self_attn.k_proj', 
'model.layers.23.self_attn.o_proj', 'model.layers.23.self_attn.q_proj', 'model.layers.23.self_attn.v_proj', 'model.layers.24.mlp.down_proj', 'model.layers.24.mlp.gate_proj', 'model.layers.24.mlp.up_proj', 'model.layers.24.self_attn.k_proj', 'model.layers.24.self_attn.o_proj', 'model.layers.24.self_attn.q_proj', 'model.layers.24.self_attn.v_proj', 'model.layers.25.mlp.down_proj', 'model.layers.25.mlp.gate_proj', 'model.layers.25.mlp.up_proj', 'model.layers.25.self_attn.k_proj', 'model.layers.25.self_attn.o_proj', 'model.layers.25.self_attn.q_proj', 'model.layers.25.self_attn.v_proj', 'model.layers.26.mlp.down_proj', 'model.layers.26.mlp.gate_proj', 'model.layers.26.mlp.up_proj', 'model.layers.26.self_attn.k_proj', 'model.layers.26.self_attn.o_proj', 'model.layers.26.self_attn.q_proj', 'model.layers.26.self_attn.v_proj', 'model.layers.27.mlp.down_proj', 'model.layers.27.mlp.gate_proj', 'model.layers.27.mlp.up_proj', 'model.layers.27.self_attn.k_proj', 'model.layers.27.self_attn.o_proj', 'model.layers.27.self_attn.q_proj', 'model.layers.27.self_attn.v_proj', 'model.layers.3.mlp.down_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.4.mlp.down_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.5.mlp.down_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.6.mlp.down_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.self_attn.q_proj', 
'model.layers.6.self_attn.v_proj', 'model.layers.7.mlp.down_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.8.mlp.down_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.9.mlp.down_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.v_proj']
+2026-02-06 15:55:16.121 | INFO     | model.actor:__init__:381 - LoRA target modules: 
+['model.layers.0.mlp.down_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.10.mlp.down_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.11.mlp.down_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.12.mlp.down_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.13.mlp.down_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.14.mlp.down_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.15.mlp.down_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.16.mlp.down_proj', 'model.layers.16.mlp.gate_proj', 
'model.layers.16.mlp.up_proj', 'model.layers.16.self_attn.k_proj', 'model.layers.16.self_attn.o_proj', 'model.layers.16.self_attn.q_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.17.mlp.down_proj', 'model.layers.17.mlp.gate_proj', 'model.layers.17.mlp.up_proj', 'model.layers.17.self_attn.k_proj', 'model.layers.17.self_attn.o_proj', 'model.layers.17.self_attn.q_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.18.mlp.down_proj', 'model.layers.18.mlp.gate_proj', 'model.layers.18.mlp.up_proj', 'model.layers.18.self_attn.k_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.18.self_attn.q_proj', 'model.layers.18.self_attn.v_proj', 'model.layers.19.mlp.down_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.19.mlp.up_proj', 'model.layers.19.self_attn.k_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.19.self_attn.q_proj', 'model.layers.19.self_attn.v_proj', 'model.layers.2.mlp.down_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.20.mlp.down_proj', 'model.layers.20.mlp.gate_proj', 'model.layers.20.mlp.up_proj', 'model.layers.20.self_attn.k_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.20.self_attn.q_proj', 'model.layers.20.self_attn.v_proj', 'model.layers.21.mlp.down_proj', 'model.layers.21.mlp.gate_proj', 'model.layers.21.mlp.up_proj', 'model.layers.21.self_attn.k_proj', 'model.layers.21.self_attn.o_proj', 'model.layers.21.self_attn.q_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.22.mlp.down_proj', 'model.layers.22.mlp.gate_proj', 'model.layers.22.mlp.up_proj', 'model.layers.22.self_attn.k_proj', 'model.layers.22.self_attn.o_proj', 'model.layers.22.self_attn.q_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.23.mlp.down_proj', 'model.layers.23.mlp.gate_proj', 'model.layers.23.mlp.up_proj', 'model.layers.23.self_attn.k_proj', 
'model.layers.23.self_attn.o_proj', 'model.layers.23.self_attn.q_proj', 'model.layers.23.self_attn.v_proj', 'model.layers.24.mlp.down_proj', 'model.layers.24.mlp.gate_proj', 'model.layers.24.mlp.up_proj', 'model.layers.24.self_attn.k_proj', 'model.layers.24.self_attn.o_proj', 'model.layers.24.self_attn.q_proj', 'model.layers.24.self_attn.v_proj', 'model.layers.25.mlp.down_proj', 'model.layers.25.mlp.gate_proj', 'model.layers.25.mlp.up_proj', 'model.layers.25.self_attn.k_proj', 'model.layers.25.self_attn.o_proj', 'model.layers.25.self_attn.q_proj', 'model.layers.25.self_attn.v_proj', 'model.layers.26.mlp.down_proj', 'model.layers.26.mlp.gate_proj', 'model.layers.26.mlp.up_proj', 'model.layers.26.self_attn.k_proj', 'model.layers.26.self_attn.o_proj', 'model.layers.26.self_attn.q_proj', 'model.layers.26.self_attn.v_proj', 'model.layers.27.mlp.down_proj', 'model.layers.27.mlp.gate_proj', 'model.layers.27.mlp.up_proj', 'model.layers.27.self_attn.k_proj', 'model.layers.27.self_attn.o_proj', 'model.layers.27.self_attn.q_proj', 'model.layers.27.self_attn.v_proj', 'model.layers.3.mlp.down_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.4.mlp.down_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.5.mlp.down_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.6.mlp.down_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.self_attn.q_proj', 
'model.layers.6.self_attn.v_proj', 'model.layers.7.mlp.down_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.8.mlp.down_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.9.mlp.down_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.v_proj']
+2026-02-06 15:55:16.521 | INFO     | model.actor:__init__:381 - LoRA target modules: 
+['model.layers.0.mlp.down_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.10.mlp.down_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.11.mlp.down_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.12.mlp.down_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.13.mlp.down_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.14.mlp.down_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.15.mlp.down_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.16.mlp.down_proj', 'model.layers.16.mlp.gate_proj', 
'model.layers.16.mlp.up_proj', 'model.layers.16.self_attn.k_proj', 'model.layers.16.self_attn.o_proj', 'model.layers.16.self_attn.q_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.17.mlp.down_proj', 'model.layers.17.mlp.gate_proj', 'model.layers.17.mlp.up_proj', 'model.layers.17.self_attn.k_proj', 'model.layers.17.self_attn.o_proj', 'model.layers.17.self_attn.q_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.18.mlp.down_proj', 'model.layers.18.mlp.gate_proj', 'model.layers.18.mlp.up_proj', 'model.layers.18.self_attn.k_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.18.self_attn.q_proj', 'model.layers.18.self_attn.v_proj', 'model.layers.19.mlp.down_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.19.mlp.up_proj', 'model.layers.19.self_attn.k_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.19.self_attn.q_proj', 'model.layers.19.self_attn.v_proj', 'model.layers.2.mlp.down_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.20.mlp.down_proj', 'model.layers.20.mlp.gate_proj', 'model.layers.20.mlp.up_proj', 'model.layers.20.self_attn.k_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.20.self_attn.q_proj', 'model.layers.20.self_attn.v_proj', 'model.layers.21.mlp.down_proj', 'model.layers.21.mlp.gate_proj', 'model.layers.21.mlp.up_proj', 'model.layers.21.self_attn.k_proj', 'model.layers.21.self_attn.o_proj', 'model.layers.21.self_attn.q_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.22.mlp.down_proj', 'model.layers.22.mlp.gate_proj', 'model.layers.22.mlp.up_proj', 'model.layers.22.self_attn.k_proj', 'model.layers.22.self_attn.o_proj', 'model.layers.22.self_attn.q_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.23.mlp.down_proj', 'model.layers.23.mlp.gate_proj', 'model.layers.23.mlp.up_proj', 'model.layers.23.self_attn.k_proj', 
'model.layers.23.self_attn.o_proj', 'model.layers.23.self_attn.q_proj', 'model.layers.23.self_attn.v_proj', 'model.layers.24.mlp.down_proj', 'model.layers.24.mlp.gate_proj', 'model.layers.24.mlp.up_proj', 'model.layers.24.self_attn.k_proj', 'model.layers.24.self_attn.o_proj', 'model.layers.24.self_attn.q_proj', 'model.layers.24.self_attn.v_proj', 'model.layers.25.mlp.down_proj', 'model.layers.25.mlp.gate_proj', 'model.layers.25.mlp.up_proj', 'model.layers.25.self_attn.k_proj', 'model.layers.25.self_attn.o_proj', 'model.layers.25.self_attn.q_proj', 'model.layers.25.self_attn.v_proj', 'model.layers.26.mlp.down_proj', 'model.layers.26.mlp.gate_proj', 'model.layers.26.mlp.up_proj', 'model.layers.26.self_attn.k_proj', 'model.layers.26.self_attn.o_proj', 'model.layers.26.self_attn.q_proj', 'model.layers.26.self_attn.v_proj', 'model.layers.27.mlp.down_proj', 'model.layers.27.mlp.gate_proj', 'model.layers.27.mlp.up_proj', 'model.layers.27.self_attn.k_proj', 'model.layers.27.self_attn.o_proj', 'model.layers.27.self_attn.q_proj', 'model.layers.27.self_attn.v_proj', 'model.layers.3.mlp.down_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.4.mlp.down_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.5.mlp.down_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.6.mlp.down_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.self_attn.q_proj', 
'model.layers.6.self_attn.v_proj', 'model.layers.7.mlp.down_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.8.mlp.down_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.9.mlp.down_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.v_proj']
+2026-02-06 15:55:16.808 | INFO     | model.actor:__init__:381 - LoRA target modules: 
+['model.layers.0.mlp.down_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.10.mlp.down_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.11.mlp.down_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.12.mlp.down_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.13.mlp.down_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.14.mlp.down_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.15.mlp.down_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.16.mlp.down_proj', 'model.layers.16.mlp.gate_proj', 
'model.layers.16.mlp.up_proj', 'model.layers.16.self_attn.k_proj', 'model.layers.16.self_attn.o_proj', 'model.layers.16.self_attn.q_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.17.mlp.down_proj', 'model.layers.17.mlp.gate_proj', 'model.layers.17.mlp.up_proj', 'model.layers.17.self_attn.k_proj', 'model.layers.17.self_attn.o_proj', 'model.layers.17.self_attn.q_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.18.mlp.down_proj', 'model.layers.18.mlp.gate_proj', 'model.layers.18.mlp.up_proj', 'model.layers.18.self_attn.k_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.18.self_attn.q_proj', 'model.layers.18.self_attn.v_proj', 'model.layers.19.mlp.down_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.19.mlp.up_proj', 'model.layers.19.self_attn.k_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.19.self_attn.q_proj', 'model.layers.19.self_attn.v_proj', 'model.layers.2.mlp.down_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.20.mlp.down_proj', 'model.layers.20.mlp.gate_proj', 'model.layers.20.mlp.up_proj', 'model.layers.20.self_attn.k_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.20.self_attn.q_proj', 'model.layers.20.self_attn.v_proj', 'model.layers.21.mlp.down_proj', 'model.layers.21.mlp.gate_proj', 'model.layers.21.mlp.up_proj', 'model.layers.21.self_attn.k_proj', 'model.layers.21.self_attn.o_proj', 'model.layers.21.self_attn.q_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.22.mlp.down_proj', 'model.layers.22.mlp.gate_proj', 'model.layers.22.mlp.up_proj', 'model.layers.22.self_attn.k_proj', 'model.layers.22.self_attn.o_proj', 'model.layers.22.self_attn.q_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.23.mlp.down_proj', 'model.layers.23.mlp.gate_proj', 'model.layers.23.mlp.up_proj', 'model.layers.23.self_attn.k_proj', 
'model.layers.23.self_attn.o_proj', 'model.layers.23.self_attn.q_proj', 'model.layers.23.self_attn.v_proj', 'model.layers.24.mlp.down_proj', 'model.layers.24.mlp.gate_proj', 'model.layers.24.mlp.up_proj', 'model.layers.24.self_attn.k_proj', 'model.layers.24.self_attn.o_proj', 'model.layers.24.self_attn.q_proj', 'model.layers.24.self_attn.v_proj', 'model.layers.25.mlp.down_proj', 'model.layers.25.mlp.gate_proj', 'model.layers.25.mlp.up_proj', 'model.layers.25.self_attn.k_proj', 'model.layers.25.self_attn.o_proj', 'model.layers.25.self_attn.q_proj', 'model.layers.25.self_attn.v_proj', 'model.layers.26.mlp.down_proj', 'model.layers.26.mlp.gate_proj', 'model.layers.26.mlp.up_proj', 'model.layers.26.self_attn.k_proj', 'model.layers.26.self_attn.o_proj', 'model.layers.26.self_attn.q_proj', 'model.layers.26.self_attn.v_proj', 'model.layers.27.mlp.down_proj', 'model.layers.27.mlp.gate_proj', 'model.layers.27.mlp.up_proj', 'model.layers.27.self_attn.k_proj', 'model.layers.27.self_attn.o_proj', 'model.layers.27.self_attn.q_proj', 'model.layers.27.self_attn.v_proj', 'model.layers.3.mlp.down_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.4.mlp.down_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.5.mlp.down_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.6.mlp.down_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.self_attn.q_proj', 
'model.layers.6.self_attn.v_proj', 'model.layers.7.mlp.down_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.8.mlp.down_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.9.mlp.down_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.v_proj']
+Actor(
+  (model): PeftModelForCausalLM(
+    (base_model): LoraModel(
+      (model): Qwen3AudioForCausalLM(
+        (model): Qwen3AudioModel(
+          (embed_tokens): Embedding(155165, 2048)
+          (layers): ModuleList(
+            (0-27): 28 x Qwen3DecoderLayer(
+              (self_attn): Qwen3FlashAttention2(
+                (q_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=2048, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (k_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=1024, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=1024, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (v_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=1024, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=1024, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (o_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=2048, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+                (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+                (rotary_emb): Qwen3RotaryEmbedding()
+              )
+              (mlp): Qwen3MLP(
+                (gate_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=6144, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=6144, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (up_proj): lora.Linear(
+                  (base_layer): Linear(in_features=2048, out_features=6144, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=2048, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=6144, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (down_proj): lora.Linear(
+                  (base_layer): Linear(in_features=6144, out_features=2048, bias=False)
+                  (lora_dropout): ModuleDict(
+                    (default): Identity()
+                  )
+                  (lora_A): ModuleDict(
+                    (default): Linear(in_features=6144, out_features=32, bias=False)
+                  )
+                  (lora_B): ModuleDict(
+                    (default): Linear(in_features=32, out_features=2048, bias=False)
+                  )
+                  (lora_embedding_A): ParameterDict()
+                  (lora_embedding_B): ParameterDict()
+                )
+                (act_fn): SiLU()
+              )
+              (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
+              (post_attention_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
+            )
+          )
+          (norm): Qwen3RMSNorm((2048,), eps=1e-06)
+          (rotary_emb): Qwen3RotaryEmbedding()
+          (audio): AudioEncoder(
+            (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
+            (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
+            (blocks): ModuleList(
+              (0-31): 32 x AudioEncoderLayer(
+                (attn): Qwen2AudioFlashAttention2(
+                  (key): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=1280, out_features=1280, bias=False)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                  (value): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=1280, out_features=1280, bias=True)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                  (query): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=1280, out_features=1280, bias=True)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                  (out): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=1280, out_features=1280, bias=True)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                )
+                (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
+                (mlp): Sequential(
+                  (0): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=1280, out_features=5120, bias=True)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                  (1): GELU(approximate='none')
+                  (2): DynamicRankZipperLoRAAdapter(
+                    (base): Linear(in_features=5120, out_features=1280, bias=True)
+                    (lora): DynamicRankZipperLoRALinear()
+                  )
+                )
+                (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
+              )
+            )
+            (ln_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
+            (conv_proj): AudioEncoderProjector(
+              (gate_proj): Conv1d(1280, 5120, kernel_size=(4,), stride=(4,), bias=False)
+              (up_proj): Conv1d(1280, 5120, kernel_size=(4,), stride=(4,), bias=False)
+              (down_proj): Linear(in_features=5120, out_features=5120, bias=False)
+              (layer_norm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
+              (proj_decoder): Linear(in_features=5120, out_features=1280, bias=True)
+            )
+            (avg_pooler): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
+            (proj): Linear(in_features=1280, out_features=2048, bias=True)
+            (audio_bos_eos_token): Embedding(2, 2048)
+            (zipper_shared_embedding): Embedding(12, 1280)
+          )
+        )
+        (lm_head): Linear(in_features=2048, out_features=155165, bias=False)
+      )
+    )
+  )
+)
+freezing param: model.embed_tokens.weight
+freezing param: model.layers.0.self_attn.q_proj.base_layer.weight
+training param: model.layers.0.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.0.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.0.self_attn.k_proj.base_layer.weight
+training param: model.layers.0.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.0.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.0.self_attn.v_proj.base_layer.weight
+training param: model.layers.0.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.0.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.0.self_attn.o_proj.base_layer.weight
+training param: model.layers.0.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.0.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.0.self_attn.q_norm.weight
+freezing param: model.layers.0.self_attn.k_norm.weight
+freezing param: model.layers.0.mlp.gate_proj.base_layer.weight
+training param: model.layers.0.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.0.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.0.mlp.up_proj.base_layer.weight
+training param: model.layers.0.mlp.up_proj.lora_A.default.weight
+training param: model.layers.0.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.0.mlp.down_proj.base_layer.weight
+training param: model.layers.0.mlp.down_proj.lora_A.default.weight
+training param: model.layers.0.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.0.input_layernorm.weight
+freezing param: model.layers.0.post_attention_layernorm.weight
+freezing param: model.layers.1.self_attn.q_proj.base_layer.weight
+training param: model.layers.1.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.1.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.1.self_attn.k_proj.base_layer.weight
+training param: model.layers.1.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.1.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.1.self_attn.v_proj.base_layer.weight
+training param: model.layers.1.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.1.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.1.self_attn.o_proj.base_layer.weight
+training param: model.layers.1.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.1.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.1.self_attn.q_norm.weight
+freezing param: model.layers.1.self_attn.k_norm.weight
+freezing param: model.layers.1.mlp.gate_proj.base_layer.weight
+training param: model.layers.1.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.1.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.1.mlp.up_proj.base_layer.weight
+training param: model.layers.1.mlp.up_proj.lora_A.default.weight
+training param: model.layers.1.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.1.mlp.down_proj.base_layer.weight
+training param: model.layers.1.mlp.down_proj.lora_A.default.weight
+training param: model.layers.1.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.1.input_layernorm.weight
+freezing param: model.layers.1.post_attention_layernorm.weight
+freezing param: model.layers.2.self_attn.q_proj.base_layer.weight
+training param: model.layers.2.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.2.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.2.self_attn.k_proj.base_layer.weight
+training param: model.layers.2.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.2.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.2.self_attn.v_proj.base_layer.weight
+training param: model.layers.2.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.2.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.2.self_attn.o_proj.base_layer.weight
+training param: model.layers.2.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.2.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.2.self_attn.q_norm.weight
+freezing param: model.layers.2.self_attn.k_norm.weight
+freezing param: model.layers.2.mlp.gate_proj.base_layer.weight
+training param: model.layers.2.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.2.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.2.mlp.up_proj.base_layer.weight
+training param: model.layers.2.mlp.up_proj.lora_A.default.weight
+training param: model.layers.2.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.2.mlp.down_proj.base_layer.weight
+training param: model.layers.2.mlp.down_proj.lora_A.default.weight
+training param: model.layers.2.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.2.input_layernorm.weight
+freezing param: model.layers.2.post_attention_layernorm.weight
+freezing param: model.layers.3.self_attn.q_proj.base_layer.weight
+training param: model.layers.3.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.3.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.3.self_attn.k_proj.base_layer.weight
+training param: model.layers.3.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.3.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.3.self_attn.v_proj.base_layer.weight
+training param: model.layers.3.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.3.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.3.self_attn.o_proj.base_layer.weight
+training param: model.layers.3.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.3.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.3.self_attn.q_norm.weight
+freezing param: model.layers.3.self_attn.k_norm.weight
+freezing param: model.layers.3.mlp.gate_proj.base_layer.weight
+training param: model.layers.3.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.3.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.3.mlp.up_proj.base_layer.weight
+training param: model.layers.3.mlp.up_proj.lora_A.default.weight
+training param: model.layers.3.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.3.mlp.down_proj.base_layer.weight
+training param: model.layers.3.mlp.down_proj.lora_A.default.weight
+training param: model.layers.3.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.3.input_layernorm.weight
+freezing param: model.layers.3.post_attention_layernorm.weight
+freezing param: model.layers.4.self_attn.q_proj.base_layer.weight
+training param: model.layers.4.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.4.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.4.self_attn.k_proj.base_layer.weight
+training param: model.layers.4.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.4.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.4.self_attn.v_proj.base_layer.weight
+training param: model.layers.4.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.4.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.4.self_attn.o_proj.base_layer.weight
+training param: model.layers.4.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.4.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.4.self_attn.q_norm.weight
+freezing param: model.layers.4.self_attn.k_norm.weight
+freezing param: model.layers.4.mlp.gate_proj.base_layer.weight
+training param: model.layers.4.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.4.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.4.mlp.up_proj.base_layer.weight
+training param: model.layers.4.mlp.up_proj.lora_A.default.weight
+training param: model.layers.4.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.4.mlp.down_proj.base_layer.weight
+training param: model.layers.4.mlp.down_proj.lora_A.default.weight
+training param: model.layers.4.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.4.input_layernorm.weight
+freezing param: model.layers.4.post_attention_layernorm.weight
+freezing param: model.layers.5.self_attn.q_proj.base_layer.weight
+training param: model.layers.5.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.5.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.5.self_attn.k_proj.base_layer.weight
+training param: model.layers.5.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.5.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.5.self_attn.v_proj.base_layer.weight
+training param: model.layers.5.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.5.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.5.self_attn.o_proj.base_layer.weight
+training param: model.layers.5.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.5.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.5.self_attn.q_norm.weight
+freezing param: model.layers.5.self_attn.k_norm.weight
+freezing param: model.layers.5.mlp.gate_proj.base_layer.weight
+training param: model.layers.5.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.5.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.5.mlp.up_proj.base_layer.weight
+training param: model.layers.5.mlp.up_proj.lora_A.default.weight
+training param: model.layers.5.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.5.mlp.down_proj.base_layer.weight
+training param: model.layers.5.mlp.down_proj.lora_A.default.weight
+training param: model.layers.5.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.5.input_layernorm.weight
+freezing param: model.layers.5.post_attention_layernorm.weight
+freezing param: model.layers.6.self_attn.q_proj.base_layer.weight
+training param: model.layers.6.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.6.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.6.self_attn.k_proj.base_layer.weight
+training param: model.layers.6.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.6.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.6.self_attn.v_proj.base_layer.weight
+training param: model.layers.6.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.6.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.6.self_attn.o_proj.base_layer.weight
+training param: model.layers.6.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.6.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.6.self_attn.q_norm.weight
+freezing param: model.layers.6.self_attn.k_norm.weight
+freezing param: model.layers.6.mlp.gate_proj.base_layer.weight
+training param: model.layers.6.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.6.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.6.mlp.up_proj.base_layer.weight
+training param: model.layers.6.mlp.up_proj.lora_A.default.weight
+training param: model.layers.6.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.6.mlp.down_proj.base_layer.weight
+training param: model.layers.6.mlp.down_proj.lora_A.default.weight
+training param: model.layers.6.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.6.input_layernorm.weight
+freezing param: model.layers.6.post_attention_layernorm.weight
+freezing param: model.layers.7.self_attn.q_proj.base_layer.weight
+training param: model.layers.7.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.7.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.7.self_attn.k_proj.base_layer.weight
+training param: model.layers.7.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.7.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.7.self_attn.v_proj.base_layer.weight
+training param: model.layers.7.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.7.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.7.self_attn.o_proj.base_layer.weight
+training param: model.layers.7.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.7.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.7.self_attn.q_norm.weight
+freezing param: model.layers.7.self_attn.k_norm.weight
+freezing param: model.layers.7.mlp.gate_proj.base_layer.weight
+training param: model.layers.7.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.7.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.7.mlp.up_proj.base_layer.weight
+training param: model.layers.7.mlp.up_proj.lora_A.default.weight
+training param: model.layers.7.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.7.mlp.down_proj.base_layer.weight
+training param: model.layers.7.mlp.down_proj.lora_A.default.weight
+training param: model.layers.7.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.7.input_layernorm.weight
+freezing param: model.layers.7.post_attention_layernorm.weight
+freezing param: model.layers.8.self_attn.q_proj.base_layer.weight
+training param: model.layers.8.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.8.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.8.self_attn.k_proj.base_layer.weight
+training param: model.layers.8.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.8.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.8.self_attn.v_proj.base_layer.weight
+training param: model.layers.8.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.8.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.8.self_attn.o_proj.base_layer.weight
+training param: model.layers.8.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.8.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.8.self_attn.q_norm.weight
+freezing param: model.layers.8.self_attn.k_norm.weight
+freezing param: model.layers.8.mlp.gate_proj.base_layer.weight
+training param: model.layers.8.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.8.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.8.mlp.up_proj.base_layer.weight
+training param: model.layers.8.mlp.up_proj.lora_A.default.weight
+training param: model.layers.8.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.8.mlp.down_proj.base_layer.weight
+training param: model.layers.8.mlp.down_proj.lora_A.default.weight
+training param: model.layers.8.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.8.input_layernorm.weight
+freezing param: model.layers.8.post_attention_layernorm.weight
+freezing param: model.layers.9.self_attn.q_proj.base_layer.weight
+training param: model.layers.9.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.9.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.9.self_attn.k_proj.base_layer.weight
+training param: model.layers.9.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.9.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.9.self_attn.v_proj.base_layer.weight
+training param: model.layers.9.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.9.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.9.self_attn.o_proj.base_layer.weight
+training param: model.layers.9.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.9.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.9.self_attn.q_norm.weight
+freezing param: model.layers.9.self_attn.k_norm.weight
+freezing param: model.layers.9.mlp.gate_proj.base_layer.weight
+training param: model.layers.9.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.9.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.9.mlp.up_proj.base_layer.weight
+training param: model.layers.9.mlp.up_proj.lora_A.default.weight
+training param: model.layers.9.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.9.mlp.down_proj.base_layer.weight
+training param: model.layers.9.mlp.down_proj.lora_A.default.weight
+training param: model.layers.9.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.9.input_layernorm.weight
+freezing param: model.layers.9.post_attention_layernorm.weight
+freezing param: model.layers.10.self_attn.q_proj.base_layer.weight
+training param: model.layers.10.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.10.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.10.self_attn.k_proj.base_layer.weight
+training param: model.layers.10.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.10.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.10.self_attn.v_proj.base_layer.weight
+training param: model.layers.10.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.10.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.10.self_attn.o_proj.base_layer.weight
+training param: model.layers.10.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.10.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.10.self_attn.q_norm.weight
+freezing param: model.layers.10.self_attn.k_norm.weight
+freezing param: model.layers.10.mlp.gate_proj.base_layer.weight
+training param: model.layers.10.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.10.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.10.mlp.up_proj.base_layer.weight
+training param: model.layers.10.mlp.up_proj.lora_A.default.weight
+training param: model.layers.10.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.10.mlp.down_proj.base_layer.weight
+training param: model.layers.10.mlp.down_proj.lora_A.default.weight
+training param: model.layers.10.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.10.input_layernorm.weight
+freezing param: model.layers.10.post_attention_layernorm.weight
+freezing param: model.layers.11.self_attn.q_proj.base_layer.weight
+training param: model.layers.11.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.11.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.11.self_attn.k_proj.base_layer.weight
+training param: model.layers.11.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.11.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.11.self_attn.v_proj.base_layer.weight
+training param: model.layers.11.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.11.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.11.self_attn.o_proj.base_layer.weight
+training param: model.layers.11.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.11.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.11.self_attn.q_norm.weight
+freezing param: model.layers.11.self_attn.k_norm.weight
+freezing param: model.layers.11.mlp.gate_proj.base_layer.weight
+training param: model.layers.11.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.11.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.11.mlp.up_proj.base_layer.weight
+training param: model.layers.11.mlp.up_proj.lora_A.default.weight
+training param: model.layers.11.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.11.mlp.down_proj.base_layer.weight
+training param: model.layers.11.mlp.down_proj.lora_A.default.weight
+training param: model.layers.11.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.11.input_layernorm.weight
+freezing param: model.layers.11.post_attention_layernorm.weight
+freezing param: model.layers.12.self_attn.q_proj.base_layer.weight
+training param: model.layers.12.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.12.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.12.self_attn.k_proj.base_layer.weight
+training param: model.layers.12.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.12.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.12.self_attn.v_proj.base_layer.weight
+training param: model.layers.12.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.12.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.12.self_attn.o_proj.base_layer.weight
+training param: model.layers.12.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.12.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.12.self_attn.q_norm.weight
+freezing param: model.layers.12.self_attn.k_norm.weight
+freezing param: model.layers.12.mlp.gate_proj.base_layer.weight
+training param: model.layers.12.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.12.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.12.mlp.up_proj.base_layer.weight
+training param: model.layers.12.mlp.up_proj.lora_A.default.weight
+training param: model.layers.12.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.12.mlp.down_proj.base_layer.weight
+training param: model.layers.12.mlp.down_proj.lora_A.default.weight
+training param: model.layers.12.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.12.input_layernorm.weight
+freezing param: model.layers.12.post_attention_layernorm.weight
+freezing param: model.layers.13.self_attn.q_proj.base_layer.weight
+training param: model.layers.13.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.13.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.13.self_attn.k_proj.base_layer.weight
+training param: model.layers.13.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.13.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.13.self_attn.v_proj.base_layer.weight
+training param: model.layers.13.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.13.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.13.self_attn.o_proj.base_layer.weight
+training param: model.layers.13.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.13.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.13.self_attn.q_norm.weight
+freezing param: model.layers.13.self_attn.k_norm.weight
+freezing param: model.layers.13.mlp.gate_proj.base_layer.weight
+training param: model.layers.13.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.13.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.13.mlp.up_proj.base_layer.weight
+training param: model.layers.13.mlp.up_proj.lora_A.default.weight
+training param: model.layers.13.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.13.mlp.down_proj.base_layer.weight
+training param: model.layers.13.mlp.down_proj.lora_A.default.weight
+training param: model.layers.13.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.13.input_layernorm.weight
+freezing param: model.layers.13.post_attention_layernorm.weight
+freezing param: model.layers.14.self_attn.q_proj.base_layer.weight
+training param: model.layers.14.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.14.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.14.self_attn.k_proj.base_layer.weight
+training param: model.layers.14.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.14.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.14.self_attn.v_proj.base_layer.weight
+training param: model.layers.14.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.14.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.14.self_attn.o_proj.base_layer.weight
+training param: model.layers.14.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.14.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.14.self_attn.q_norm.weight
+freezing param: model.layers.14.self_attn.k_norm.weight
+freezing param: model.layers.14.mlp.gate_proj.base_layer.weight
+training param: model.layers.14.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.14.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.14.mlp.up_proj.base_layer.weight
+training param: model.layers.14.mlp.up_proj.lora_A.default.weight
+training param: model.layers.14.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.14.mlp.down_proj.base_layer.weight
+training param: model.layers.14.mlp.down_proj.lora_A.default.weight
+training param: model.layers.14.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.14.input_layernorm.weight
+freezing param: model.layers.14.post_attention_layernorm.weight
+freezing param: model.layers.15.self_attn.q_proj.base_layer.weight
+training param: model.layers.15.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.15.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.15.self_attn.k_proj.base_layer.weight
+training param: model.layers.15.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.15.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.15.self_attn.v_proj.base_layer.weight
+training param: model.layers.15.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.15.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.15.self_attn.o_proj.base_layer.weight
+training param: model.layers.15.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.15.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.15.self_attn.q_norm.weight
+freezing param: model.layers.15.self_attn.k_norm.weight
+freezing param: model.layers.15.mlp.gate_proj.base_layer.weight
+training param: model.layers.15.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.15.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.15.mlp.up_proj.base_layer.weight
+training param: model.layers.15.mlp.up_proj.lora_A.default.weight
+training param: model.layers.15.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.15.mlp.down_proj.base_layer.weight
+training param: model.layers.15.mlp.down_proj.lora_A.default.weight
+training param: model.layers.15.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.15.input_layernorm.weight
+freezing param: model.layers.15.post_attention_layernorm.weight
+freezing param: model.layers.16.self_attn.q_proj.base_layer.weight
+training param: model.layers.16.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.16.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.16.self_attn.k_proj.base_layer.weight
+training param: model.layers.16.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.16.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.16.self_attn.v_proj.base_layer.weight
+training param: model.layers.16.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.16.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.16.self_attn.o_proj.base_layer.weight
+training param: model.layers.16.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.16.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.16.self_attn.q_norm.weight
+freezing param: model.layers.16.self_attn.k_norm.weight
+freezing param: model.layers.16.mlp.gate_proj.base_layer.weight
+training param: model.layers.16.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.16.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.16.mlp.up_proj.base_layer.weight
+training param: model.layers.16.mlp.up_proj.lora_A.default.weight
+training param: model.layers.16.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.16.mlp.down_proj.base_layer.weight
+training param: model.layers.16.mlp.down_proj.lora_A.default.weight
+training param: model.layers.16.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.16.input_layernorm.weight
+freezing param: model.layers.16.post_attention_layernorm.weight
+freezing param: model.layers.17.self_attn.q_proj.base_layer.weight
+training param: model.layers.17.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.17.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.17.self_attn.k_proj.base_layer.weight
+training param: model.layers.17.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.17.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.17.self_attn.v_proj.base_layer.weight
+training param: model.layers.17.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.17.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.17.self_attn.o_proj.base_layer.weight
+training param: model.layers.17.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.17.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.17.self_attn.q_norm.weight
+freezing param: model.layers.17.self_attn.k_norm.weight
+freezing param: model.layers.17.mlp.gate_proj.base_layer.weight
+training param: model.layers.17.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.17.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.17.mlp.up_proj.base_layer.weight
+training param: model.layers.17.mlp.up_proj.lora_A.default.weight
+training param: model.layers.17.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.17.mlp.down_proj.base_layer.weight
+training param: model.layers.17.mlp.down_proj.lora_A.default.weight
+training param: model.layers.17.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.17.input_layernorm.weight
+freezing param: model.layers.17.post_attention_layernorm.weight
+freezing param: model.layers.18.self_attn.q_proj.base_layer.weight
+training param: model.layers.18.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.18.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.18.self_attn.k_proj.base_layer.weight
+training param: model.layers.18.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.18.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.18.self_attn.v_proj.base_layer.weight
+training param: model.layers.18.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.18.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.18.self_attn.o_proj.base_layer.weight
+training param: model.layers.18.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.18.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.18.self_attn.q_norm.weight
+freezing param: model.layers.18.self_attn.k_norm.weight
+freezing param: model.layers.18.mlp.gate_proj.base_layer.weight
+training param: model.layers.18.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.18.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.18.mlp.up_proj.base_layer.weight
+training param: model.layers.18.mlp.up_proj.lora_A.default.weight
+training param: model.layers.18.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.18.mlp.down_proj.base_layer.weight
+training param: model.layers.18.mlp.down_proj.lora_A.default.weight
+training param: model.layers.18.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.18.input_layernorm.weight
+freezing param: model.layers.18.post_attention_layernorm.weight
+freezing param: model.layers.19.self_attn.q_proj.base_layer.weight
+training param: model.layers.19.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.19.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.19.self_attn.k_proj.base_layer.weight
+training param: model.layers.19.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.19.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.19.self_attn.v_proj.base_layer.weight
+training param: model.layers.19.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.19.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.19.self_attn.o_proj.base_layer.weight
+training param: model.layers.19.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.19.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.19.self_attn.q_norm.weight
+freezing param: model.layers.19.self_attn.k_norm.weight
+freezing param: model.layers.19.mlp.gate_proj.base_layer.weight
+training param: model.layers.19.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.19.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.19.mlp.up_proj.base_layer.weight
+training param: model.layers.19.mlp.up_proj.lora_A.default.weight
+training param: model.layers.19.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.19.mlp.down_proj.base_layer.weight
+training param: model.layers.19.mlp.down_proj.lora_A.default.weight
+training param: model.layers.19.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.19.input_layernorm.weight
+freezing param: model.layers.19.post_attention_layernorm.weight
+freezing param: model.layers.20.self_attn.q_proj.base_layer.weight
+training param: model.layers.20.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.20.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.20.self_attn.k_proj.base_layer.weight
+training param: model.layers.20.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.20.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.20.self_attn.v_proj.base_layer.weight
+training param: model.layers.20.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.20.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.20.self_attn.o_proj.base_layer.weight
+training param: model.layers.20.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.20.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.20.self_attn.q_norm.weight
+freezing param: model.layers.20.self_attn.k_norm.weight
+freezing param: model.layers.20.mlp.gate_proj.base_layer.weight
+training param: model.layers.20.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.20.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.20.mlp.up_proj.base_layer.weight
+training param: model.layers.20.mlp.up_proj.lora_A.default.weight
+training param: model.layers.20.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.20.mlp.down_proj.base_layer.weight
+training param: model.layers.20.mlp.down_proj.lora_A.default.weight
+training param: model.layers.20.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.20.input_layernorm.weight
+freezing param: model.layers.20.post_attention_layernorm.weight
+freezing param: model.layers.21.self_attn.q_proj.base_layer.weight
+training param: model.layers.21.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.21.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.21.self_attn.k_proj.base_layer.weight
+training param: model.layers.21.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.21.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.21.self_attn.v_proj.base_layer.weight
+training param: model.layers.21.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.21.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.21.self_attn.o_proj.base_layer.weight
+training param: model.layers.21.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.21.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.21.self_attn.q_norm.weight
+freezing param: model.layers.21.self_attn.k_norm.weight
+freezing param: model.layers.21.mlp.gate_proj.base_layer.weight
+training param: model.layers.21.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.21.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.21.mlp.up_proj.base_layer.weight
+training param: model.layers.21.mlp.up_proj.lora_A.default.weight
+training param: model.layers.21.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.21.mlp.down_proj.base_layer.weight
+training param: model.layers.21.mlp.down_proj.lora_A.default.weight
+training param: model.layers.21.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.21.input_layernorm.weight
+freezing param: model.layers.21.post_attention_layernorm.weight
+freezing param: model.layers.22.self_attn.q_proj.base_layer.weight
+training param: model.layers.22.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.22.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.22.self_attn.k_proj.base_layer.weight
+training param: model.layers.22.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.22.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.22.self_attn.v_proj.base_layer.weight
+training param: model.layers.22.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.22.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.22.self_attn.o_proj.base_layer.weight
+training param: model.layers.22.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.22.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.22.self_attn.q_norm.weight
+freezing param: model.layers.22.self_attn.k_norm.weight
+freezing param: model.layers.22.mlp.gate_proj.base_layer.weight
+training param: model.layers.22.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.22.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.22.mlp.up_proj.base_layer.weight
+training param: model.layers.22.mlp.up_proj.lora_A.default.weight
+training param: model.layers.22.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.22.mlp.down_proj.base_layer.weight
+training param: model.layers.22.mlp.down_proj.lora_A.default.weight
+training param: model.layers.22.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.22.input_layernorm.weight
+freezing param: model.layers.22.post_attention_layernorm.weight
+freezing param: model.layers.23.self_attn.q_proj.base_layer.weight
+training param: model.layers.23.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.23.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.23.self_attn.k_proj.base_layer.weight
+training param: model.layers.23.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.23.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.23.self_attn.v_proj.base_layer.weight
+training param: model.layers.23.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.23.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.23.self_attn.o_proj.base_layer.weight
+training param: model.layers.23.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.23.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.23.self_attn.q_norm.weight
+freezing param: model.layers.23.self_attn.k_norm.weight
+freezing param: model.layers.23.mlp.gate_proj.base_layer.weight
+training param: model.layers.23.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.23.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.23.mlp.up_proj.base_layer.weight
+training param: model.layers.23.mlp.up_proj.lora_A.default.weight
+training param: model.layers.23.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.23.mlp.down_proj.base_layer.weight
+training param: model.layers.23.mlp.down_proj.lora_A.default.weight
+training param: model.layers.23.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.23.input_layernorm.weight
+freezing param: model.layers.23.post_attention_layernorm.weight
+freezing param: model.layers.24.self_attn.q_proj.base_layer.weight
+training param: model.layers.24.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.24.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.24.self_attn.k_proj.base_layer.weight
+training param: model.layers.24.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.24.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.24.self_attn.v_proj.base_layer.weight
+training param: model.layers.24.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.24.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.24.self_attn.o_proj.base_layer.weight
+training param: model.layers.24.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.24.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.24.self_attn.q_norm.weight
+freezing param: model.layers.24.self_attn.k_norm.weight
+freezing param: model.layers.24.mlp.gate_proj.base_layer.weight
+training param: model.layers.24.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.24.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.24.mlp.up_proj.base_layer.weight
+training param: model.layers.24.mlp.up_proj.lora_A.default.weight
+training param: model.layers.24.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.24.mlp.down_proj.base_layer.weight
+training param: model.layers.24.mlp.down_proj.lora_A.default.weight
+training param: model.layers.24.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.24.input_layernorm.weight
+freezing param: model.layers.24.post_attention_layernorm.weight
+freezing param: model.layers.25.self_attn.q_proj.base_layer.weight
+training param: model.layers.25.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.25.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.25.self_attn.k_proj.base_layer.weight
+training param: model.layers.25.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.25.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.25.self_attn.v_proj.base_layer.weight
+training param: model.layers.25.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.25.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.25.self_attn.o_proj.base_layer.weight
+training param: model.layers.25.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.25.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.25.self_attn.q_norm.weight
+freezing param: model.layers.25.self_attn.k_norm.weight
+freezing param: model.layers.25.mlp.gate_proj.base_layer.weight
+training param: model.layers.25.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.25.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.25.mlp.up_proj.base_layer.weight
+training param: model.layers.25.mlp.up_proj.lora_A.default.weight
+training param: model.layers.25.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.25.mlp.down_proj.base_layer.weight
+training param: model.layers.25.mlp.down_proj.lora_A.default.weight
+training param: model.layers.25.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.25.input_layernorm.weight
+freezing param: model.layers.25.post_attention_layernorm.weight
+freezing param: model.layers.26.self_attn.q_proj.base_layer.weight
+training param: model.layers.26.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.26.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.26.self_attn.k_proj.base_layer.weight
+training param: model.layers.26.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.26.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.26.self_attn.v_proj.base_layer.weight
+training param: model.layers.26.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.26.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.26.self_attn.o_proj.base_layer.weight
+training param: model.layers.26.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.26.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.26.self_attn.q_norm.weight
+freezing param: model.layers.26.self_attn.k_norm.weight
+freezing param: model.layers.26.mlp.gate_proj.base_layer.weight
+training param: model.layers.26.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.26.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.26.mlp.up_proj.base_layer.weight
+training param: model.layers.26.mlp.up_proj.lora_A.default.weight
+training param: model.layers.26.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.26.mlp.down_proj.base_layer.weight
+training param: model.layers.26.mlp.down_proj.lora_A.default.weight
+training param: model.layers.26.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.26.input_layernorm.weight
+freezing param: model.layers.26.post_attention_layernorm.weight
+freezing param: model.layers.27.self_attn.q_proj.base_layer.weight
+training param: model.layers.27.self_attn.q_proj.lora_A.default.weight
+training param: model.layers.27.self_attn.q_proj.lora_B.default.weight
+freezing param: model.layers.27.self_attn.k_proj.base_layer.weight
+training param: model.layers.27.self_attn.k_proj.lora_A.default.weight
+training param: model.layers.27.self_attn.k_proj.lora_B.default.weight
+freezing param: model.layers.27.self_attn.v_proj.base_layer.weight
+training param: model.layers.27.self_attn.v_proj.lora_A.default.weight
+training param: model.layers.27.self_attn.v_proj.lora_B.default.weight
+freezing param: model.layers.27.self_attn.o_proj.base_layer.weight
+training param: model.layers.27.self_attn.o_proj.lora_A.default.weight
+training param: model.layers.27.self_attn.o_proj.lora_B.default.weight
+freezing param: model.layers.27.self_attn.q_norm.weight
+freezing param: model.layers.27.self_attn.k_norm.weight
+freezing param: model.layers.27.mlp.gate_proj.base_layer.weight
+training param: model.layers.27.mlp.gate_proj.lora_A.default.weight
+training param: model.layers.27.mlp.gate_proj.lora_B.default.weight
+freezing param: model.layers.27.mlp.up_proj.base_layer.weight
+training param: model.layers.27.mlp.up_proj.lora_A.default.weight
+training param: model.layers.27.mlp.up_proj.lora_B.default.weight
+freezing param: model.layers.27.mlp.down_proj.base_layer.weight
+training param: model.layers.27.mlp.down_proj.lora_A.default.weight
+training param: model.layers.27.mlp.down_proj.lora_B.default.weight
+freezing param: model.layers.27.input_layernorm.weight
+freezing param: model.layers.27.post_attention_layernorm.weight
+freezing param: model.norm.weight
+freezing param: model.audio.conv1.weight
+freezing param: model.audio.conv1.bias
+freezing param: model.audio.conv2.weight
+freezing param: model.audio.conv2.bias
+freezing param: model.audio.blocks.0.attn.key.base.weight
+training param: model.audio.blocks.0.attn.key.lora.A_specific
+training param: model.audio.blocks.0.attn.key.lora.B_specific
+freezing param: model.audio.blocks.0.attn.value.base.weight
+freezing param: model.audio.blocks.0.attn.value.base.bias
+training param: model.audio.blocks.0.attn.value.lora.A_specific
+training param: model.audio.blocks.0.attn.value.lora.B_specific
+freezing param: model.audio.blocks.0.attn.query.base.weight
+freezing param: model.audio.blocks.0.attn.query.base.bias
+training param: model.audio.blocks.0.attn.query.lora.A_specific
+training param: model.audio.blocks.0.attn.query.lora.B_specific
+freezing param: model.audio.blocks.0.attn.out.base.weight
+freezing param: model.audio.blocks.0.attn.out.base.bias
+training param: model.audio.blocks.0.attn.out.lora.A_specific
+training param: model.audio.blocks.0.attn.out.lora.B_specific
+freezing param: model.audio.blocks.0.attn_ln.weight
+freezing param: model.audio.blocks.0.attn_ln.bias
+freezing param: model.audio.blocks.0.mlp.0.base.weight
+freezing param: model.audio.blocks.0.mlp.0.base.bias
+training param: model.audio.blocks.0.mlp.0.lora.A_specific
+training param: model.audio.blocks.0.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.0.mlp.2.base.weight
+freezing param: model.audio.blocks.0.mlp.2.base.bias
+training param: model.audio.blocks.0.mlp.2.lora.A_specific
+training param: model.audio.blocks.0.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.0.mlp_ln.weight
+freezing param: model.audio.blocks.0.mlp_ln.bias
+freezing param: model.audio.blocks.1.attn.key.base.weight
+training param: model.audio.blocks.1.attn.key.lora.A_specific
+training param: model.audio.blocks.1.attn.key.lora.B_specific
+freezing param: model.audio.blocks.1.attn.value.base.weight
+freezing param: model.audio.blocks.1.attn.value.base.bias
+training param: model.audio.blocks.1.attn.value.lora.A_specific
+training param: model.audio.blocks.1.attn.value.lora.B_specific
+freezing param: model.audio.blocks.1.attn.query.base.weight
+freezing param: model.audio.blocks.1.attn.query.base.bias
+training param: model.audio.blocks.1.attn.query.lora.A_specific
+training param: model.audio.blocks.1.attn.query.lora.B_specific
+freezing param: model.audio.blocks.1.attn.out.base.weight
+freezing param: model.audio.blocks.1.attn.out.base.bias
+training param: model.audio.blocks.1.attn.out.lora.A_specific
+training param: model.audio.blocks.1.attn.out.lora.B_specific
+freezing param: model.audio.blocks.1.attn_ln.weight
+freezing param: model.audio.blocks.1.attn_ln.bias
+freezing param: model.audio.blocks.1.mlp.0.base.weight
+freezing param: model.audio.blocks.1.mlp.0.base.bias
+training param: model.audio.blocks.1.mlp.0.lora.A_specific
+training param: model.audio.blocks.1.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.1.mlp.2.base.weight
+freezing param: model.audio.blocks.1.mlp.2.base.bias
+training param: model.audio.blocks.1.mlp.2.lora.A_specific
+training param: model.audio.blocks.1.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.1.mlp_ln.weight
+freezing param: model.audio.blocks.1.mlp_ln.bias
+freezing param: model.audio.blocks.2.attn.key.base.weight
+training param: model.audio.blocks.2.attn.key.lora.A_specific
+training param: model.audio.blocks.2.attn.key.lora.B_specific
+freezing param: model.audio.blocks.2.attn.value.base.weight
+freezing param: model.audio.blocks.2.attn.value.base.bias
+training param: model.audio.blocks.2.attn.value.lora.A_specific
+training param: model.audio.blocks.2.attn.value.lora.B_specific
+freezing param: model.audio.blocks.2.attn.query.base.weight
+freezing param: model.audio.blocks.2.attn.query.base.bias
+training param: model.audio.blocks.2.attn.query.lora.A_specific
+training param: model.audio.blocks.2.attn.query.lora.B_specific
+freezing param: model.audio.blocks.2.attn.out.base.weight
+freezing param: model.audio.blocks.2.attn.out.base.bias
+training param: model.audio.blocks.2.attn.out.lora.A_specific
+training param: model.audio.blocks.2.attn.out.lora.B_specific
+freezing param: model.audio.blocks.2.attn_ln.weight
+freezing param: model.audio.blocks.2.attn_ln.bias
+freezing param: model.audio.blocks.2.mlp.0.base.weight
+freezing param: model.audio.blocks.2.mlp.0.base.bias
+training param: model.audio.blocks.2.mlp.0.lora.A_specific
+training param: model.audio.blocks.2.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.2.mlp.2.base.weight
+freezing param: model.audio.blocks.2.mlp.2.base.bias
+training param: model.audio.blocks.2.mlp.2.lora.A_specific
+training param: model.audio.blocks.2.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.2.mlp_ln.weight
+freezing param: model.audio.blocks.2.mlp_ln.bias
+freezing param: model.audio.blocks.3.attn.key.base.weight
+training param: model.audio.blocks.3.attn.key.lora.A_specific
+training param: model.audio.blocks.3.attn.key.lora.B_specific
+freezing param: model.audio.blocks.3.attn.value.base.weight
+freezing param: model.audio.blocks.3.attn.value.base.bias
+training param: model.audio.blocks.3.attn.value.lora.A_specific
+training param: model.audio.blocks.3.attn.value.lora.B_specific
+freezing param: model.audio.blocks.3.attn.query.base.weight
+freezing param: model.audio.blocks.3.attn.query.base.bias
+training param: model.audio.blocks.3.attn.query.lora.A_specific
+training param: model.audio.blocks.3.attn.query.lora.B_specific
+freezing param: model.audio.blocks.3.attn.out.base.weight
+freezing param: model.audio.blocks.3.attn.out.base.bias
+training param: model.audio.blocks.3.attn.out.lora.A_specific
+training param: model.audio.blocks.3.attn.out.lora.B_specific
+freezing param: model.audio.blocks.3.attn_ln.weight
+freezing param: model.audio.blocks.3.attn_ln.bias
+freezing param: model.audio.blocks.3.mlp.0.base.weight
+freezing param: model.audio.blocks.3.mlp.0.base.bias
+training param: model.audio.blocks.3.mlp.0.lora.A_specific
+training param: model.audio.blocks.3.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.3.mlp.2.base.weight
+freezing param: model.audio.blocks.3.mlp.2.base.bias
+training param: model.audio.blocks.3.mlp.2.lora.A_specific
+training param: model.audio.blocks.3.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.3.mlp_ln.weight
+freezing param: model.audio.blocks.3.mlp_ln.bias
+freezing param: model.audio.blocks.4.attn.key.base.weight
+training param: model.audio.blocks.4.attn.key.lora.A_specific
+training param: model.audio.blocks.4.attn.key.lora.B_specific
+freezing param: model.audio.blocks.4.attn.value.base.weight
+freezing param: model.audio.blocks.4.attn.value.base.bias
+training param: model.audio.blocks.4.attn.value.lora.A_specific
+training param: model.audio.blocks.4.attn.value.lora.B_specific
+freezing param: model.audio.blocks.4.attn.query.base.weight
+freezing param: model.audio.blocks.4.attn.query.base.bias
+training param: model.audio.blocks.4.attn.query.lora.A_specific
+training param: model.audio.blocks.4.attn.query.lora.B_specific
+freezing param: model.audio.blocks.4.attn.out.base.weight
+freezing param: model.audio.blocks.4.attn.out.base.bias
+training param: model.audio.blocks.4.attn.out.lora.A_specific
+training param: model.audio.blocks.4.attn.out.lora.B_specific
+freezing param: model.audio.blocks.4.attn_ln.weight
+freezing param: model.audio.blocks.4.attn_ln.bias
+freezing param: model.audio.blocks.4.mlp.0.base.weight
+freezing param: model.audio.blocks.4.mlp.0.base.bias
+training param: model.audio.blocks.4.mlp.0.lora.A_specific
+training param: model.audio.blocks.4.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.4.mlp.2.base.weight
+freezing param: model.audio.blocks.4.mlp.2.base.bias
+training param: model.audio.blocks.4.mlp.2.lora.A_specific
+training param: model.audio.blocks.4.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.4.mlp_ln.weight
+freezing param: model.audio.blocks.4.mlp_ln.bias
+freezing param: model.audio.blocks.5.attn.key.base.weight
+training param: model.audio.blocks.5.attn.key.lora.A_specific
+training param: model.audio.blocks.5.attn.key.lora.B_specific
+freezing param: model.audio.blocks.5.attn.value.base.weight
+freezing param: model.audio.blocks.5.attn.value.base.bias
+training param: model.audio.blocks.5.attn.value.lora.A_specific
+training param: model.audio.blocks.5.attn.value.lora.B_specific
+freezing param: model.audio.blocks.5.attn.query.base.weight
+freezing param: model.audio.blocks.5.attn.query.base.bias
+training param: model.audio.blocks.5.attn.query.lora.A_specific
+training param: model.audio.blocks.5.attn.query.lora.B_specific
+freezing param: model.audio.blocks.5.attn.out.base.weight
+freezing param: model.audio.blocks.5.attn.out.base.bias
+training param: model.audio.blocks.5.attn.out.lora.A_specific
+training param: model.audio.blocks.5.attn.out.lora.B_specific
+freezing param: model.audio.blocks.5.attn_ln.weight
+freezing param: model.audio.blocks.5.attn_ln.bias
+freezing param: model.audio.blocks.5.mlp.0.base.weight
+freezing param: model.audio.blocks.5.mlp.0.base.bias
+training param: model.audio.blocks.5.mlp.0.lora.A_specific
+training param: model.audio.blocks.5.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.5.mlp.2.base.weight
+freezing param: model.audio.blocks.5.mlp.2.base.bias
+training param: model.audio.blocks.5.mlp.2.lora.A_specific
+training param: model.audio.blocks.5.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.5.mlp_ln.weight
+freezing param: model.audio.blocks.5.mlp_ln.bias
+freezing param: model.audio.blocks.6.attn.key.base.weight
+training param: model.audio.blocks.6.attn.key.lora.A_specific
+training param: model.audio.blocks.6.attn.key.lora.B_specific
+freezing param: model.audio.blocks.6.attn.value.base.weight
+freezing param: model.audio.blocks.6.attn.value.base.bias
+training param: model.audio.blocks.6.attn.value.lora.A_specific
+training param: model.audio.blocks.6.attn.value.lora.B_specific
+freezing param: model.audio.blocks.6.attn.query.base.weight
+freezing param: model.audio.blocks.6.attn.query.base.bias
+training param: model.audio.blocks.6.attn.query.lora.A_specific
+training param: model.audio.blocks.6.attn.query.lora.B_specific
+freezing param: model.audio.blocks.6.attn.out.base.weight
+freezing param: model.audio.blocks.6.attn.out.base.bias
+training param: model.audio.blocks.6.attn.out.lora.A_specific
+training param: model.audio.blocks.6.attn.out.lora.B_specific
+freezing param: model.audio.blocks.6.attn_ln.weight
+freezing param: model.audio.blocks.6.attn_ln.bias
+freezing param: model.audio.blocks.6.mlp.0.base.weight
+freezing param: model.audio.blocks.6.mlp.0.base.bias
+training param: model.audio.blocks.6.mlp.0.lora.A_specific
+training param: model.audio.blocks.6.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.6.mlp.2.base.weight
+freezing param: model.audio.blocks.6.mlp.2.base.bias
+training param: model.audio.blocks.6.mlp.2.lora.A_specific
+training param: model.audio.blocks.6.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.6.mlp_ln.weight
+freezing param: model.audio.blocks.6.mlp_ln.bias
+freezing param: model.audio.blocks.7.attn.key.base.weight
+training param: model.audio.blocks.7.attn.key.lora.A_specific
+training param: model.audio.blocks.7.attn.key.lora.B_specific
+freezing param: model.audio.blocks.7.attn.value.base.weight
+freezing param: model.audio.blocks.7.attn.value.base.bias
+training param: model.audio.blocks.7.attn.value.lora.A_specific
+training param: model.audio.blocks.7.attn.value.lora.B_specific
+freezing param: model.audio.blocks.7.attn.query.base.weight
+freezing param: model.audio.blocks.7.attn.query.base.bias
+training param: model.audio.blocks.7.attn.query.lora.A_specific
+training param: model.audio.blocks.7.attn.query.lora.B_specific
+freezing param: model.audio.blocks.7.attn.out.base.weight
+freezing param: model.audio.blocks.7.attn.out.base.bias
+training param: model.audio.blocks.7.attn.out.lora.A_specific
+training param: model.audio.blocks.7.attn.out.lora.B_specific
+freezing param: model.audio.blocks.7.attn_ln.weight
+freezing param: model.audio.blocks.7.attn_ln.bias
+freezing param: model.audio.blocks.7.mlp.0.base.weight
+freezing param: model.audio.blocks.7.mlp.0.base.bias
+training param: model.audio.blocks.7.mlp.0.lora.A_specific
+training param: model.audio.blocks.7.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.7.mlp.2.base.weight
+freezing param: model.audio.blocks.7.mlp.2.base.bias
+training param: model.audio.blocks.7.mlp.2.lora.A_specific
+training param: model.audio.blocks.7.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.7.mlp_ln.weight
+freezing param: model.audio.blocks.7.mlp_ln.bias
+freezing param: model.audio.blocks.8.attn.key.base.weight
+training param: model.audio.blocks.8.attn.key.lora.A_specific
+training param: model.audio.blocks.8.attn.key.lora.B_specific
+freezing param: model.audio.blocks.8.attn.value.base.weight
+freezing param: model.audio.blocks.8.attn.value.base.bias
+training param: model.audio.blocks.8.attn.value.lora.A_specific
+training param: model.audio.blocks.8.attn.value.lora.B_specific
+freezing param: model.audio.blocks.8.attn.query.base.weight
+freezing param: model.audio.blocks.8.attn.query.base.bias
+training param: model.audio.blocks.8.attn.query.lora.A_specific
+training param: model.audio.blocks.8.attn.query.lora.B_specific
+freezing param: model.audio.blocks.8.attn.out.base.weight
+freezing param: model.audio.blocks.8.attn.out.base.bias
+training param: model.audio.blocks.8.attn.out.lora.A_specific
+training param: model.audio.blocks.8.attn.out.lora.B_specific
+freezing param: model.audio.blocks.8.attn_ln.weight
+freezing param: model.audio.blocks.8.attn_ln.bias
+freezing param: model.audio.blocks.8.mlp.0.base.weight
+freezing param: model.audio.blocks.8.mlp.0.base.bias
+training param: model.audio.blocks.8.mlp.0.lora.A_specific
+training param: model.audio.blocks.8.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.8.mlp.2.base.weight
+freezing param: model.audio.blocks.8.mlp.2.base.bias
+training param: model.audio.blocks.8.mlp.2.lora.A_specific
+training param: model.audio.blocks.8.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.8.mlp_ln.weight
+freezing param: model.audio.blocks.8.mlp_ln.bias
+freezing param: model.audio.blocks.9.attn.key.base.weight
+training param: model.audio.blocks.9.attn.key.lora.A_specific
+training param: model.audio.blocks.9.attn.key.lora.B_specific
+freezing param: model.audio.blocks.9.attn.value.base.weight
+freezing param: model.audio.blocks.9.attn.value.base.bias
+training param: model.audio.blocks.9.attn.value.lora.A_specific
+training param: model.audio.blocks.9.attn.value.lora.B_specific
+freezing param: model.audio.blocks.9.attn.query.base.weight
+freezing param: model.audio.blocks.9.attn.query.base.bias
+training param: model.audio.blocks.9.attn.query.lora.A_specific
+training param: model.audio.blocks.9.attn.query.lora.B_specific
+freezing param: model.audio.blocks.9.attn.out.base.weight
+freezing param: model.audio.blocks.9.attn.out.base.bias
+training param: model.audio.blocks.9.attn.out.lora.A_specific
+training param: model.audio.blocks.9.attn.out.lora.B_specific
+freezing param: model.audio.blocks.9.attn_ln.weight
+freezing param: model.audio.blocks.9.attn_ln.bias
+freezing param: model.audio.blocks.9.mlp.0.base.weight
+freezing param: model.audio.blocks.9.mlp.0.base.bias
+training param: model.audio.blocks.9.mlp.0.lora.A_specific
+training param: model.audio.blocks.9.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.9.mlp.2.base.weight
+freezing param: model.audio.blocks.9.mlp.2.base.bias
+training param: model.audio.blocks.9.mlp.2.lora.A_specific
+training param: model.audio.blocks.9.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.9.mlp_ln.weight
+freezing param: model.audio.blocks.9.mlp_ln.bias
+freezing param: model.audio.blocks.10.attn.key.base.weight
+training param: model.audio.blocks.10.attn.key.lora.A_specific
+training param: model.audio.blocks.10.attn.key.lora.B_specific
+freezing param: model.audio.blocks.10.attn.value.base.weight
+freezing param: model.audio.blocks.10.attn.value.base.bias
+training param: model.audio.blocks.10.attn.value.lora.A_specific
+training param: model.audio.blocks.10.attn.value.lora.B_specific
+freezing param: model.audio.blocks.10.attn.query.base.weight
+freezing param: model.audio.blocks.10.attn.query.base.bias
+training param: model.audio.blocks.10.attn.query.lora.A_specific
+training param: model.audio.blocks.10.attn.query.lora.B_specific
+freezing param: model.audio.blocks.10.attn.out.base.weight
+freezing param: model.audio.blocks.10.attn.out.base.bias
+training param: model.audio.blocks.10.attn.out.lora.A_specific
+training param: model.audio.blocks.10.attn.out.lora.B_specific
+freezing param: model.audio.blocks.10.attn_ln.weight
+freezing param: model.audio.blocks.10.attn_ln.bias
+freezing param: model.audio.blocks.10.mlp.0.base.weight
+freezing param: model.audio.blocks.10.mlp.0.base.bias
+training param: model.audio.blocks.10.mlp.0.lora.A_specific
+training param: model.audio.blocks.10.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.10.mlp.2.base.weight
+freezing param: model.audio.blocks.10.mlp.2.base.bias
+training param: model.audio.blocks.10.mlp.2.lora.A_specific
+training param: model.audio.blocks.10.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.10.mlp_ln.weight
+freezing param: model.audio.blocks.10.mlp_ln.bias
+freezing param: model.audio.blocks.11.attn.key.base.weight
+training param: model.audio.blocks.11.attn.key.lora.A_specific
+training param: model.audio.blocks.11.attn.key.lora.B_specific
+freezing param: model.audio.blocks.11.attn.value.base.weight
+freezing param: model.audio.blocks.11.attn.value.base.bias
+training param: model.audio.blocks.11.attn.value.lora.A_specific
+training param: model.audio.blocks.11.attn.value.lora.B_specific
+freezing param: model.audio.blocks.11.attn.query.base.weight
+freezing param: model.audio.blocks.11.attn.query.base.bias
+training param: model.audio.blocks.11.attn.query.lora.A_specific
+training param: model.audio.blocks.11.attn.query.lora.B_specific
+freezing param: model.audio.blocks.11.attn.out.base.weight
+freezing param: model.audio.blocks.11.attn.out.base.bias
+training param: model.audio.blocks.11.attn.out.lora.A_specific
+training param: model.audio.blocks.11.attn.out.lora.B_specific
+freezing param: model.audio.blocks.11.attn_ln.weight
+freezing param: model.audio.blocks.11.attn_ln.bias
+freezing param: model.audio.blocks.11.mlp.0.base.weight
+freezing param: model.audio.blocks.11.mlp.0.base.bias
+training param: model.audio.blocks.11.mlp.0.lora.A_specific
+training param: model.audio.blocks.11.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.11.mlp.2.base.weight
+freezing param: model.audio.blocks.11.mlp.2.base.bias
+training param: model.audio.blocks.11.mlp.2.lora.A_specific
+training param: model.audio.blocks.11.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.11.mlp_ln.weight
+freezing param: model.audio.blocks.11.mlp_ln.bias
+freezing param: model.audio.blocks.12.attn.key.base.weight
+training param: model.audio.blocks.12.attn.key.lora.A_specific
+training param: model.audio.blocks.12.attn.key.lora.B_specific
+freezing param: model.audio.blocks.12.attn.value.base.weight
+freezing param: model.audio.blocks.12.attn.value.base.bias
+training param: model.audio.blocks.12.attn.value.lora.A_specific
+training param: model.audio.blocks.12.attn.value.lora.B_specific
+freezing param: model.audio.blocks.12.attn.query.base.weight
+freezing param: model.audio.blocks.12.attn.query.base.bias
+training param: model.audio.blocks.12.attn.query.lora.A_specific
+training param: model.audio.blocks.12.attn.query.lora.B_specific
+freezing param: model.audio.blocks.12.attn.out.base.weight
+freezing param: model.audio.blocks.12.attn.out.base.bias
+training param: model.audio.blocks.12.attn.out.lora.A_specific
+training param: model.audio.blocks.12.attn.out.lora.B_specific
+freezing param: model.audio.blocks.12.attn_ln.weight
+freezing param: model.audio.blocks.12.attn_ln.bias
+freezing param: model.audio.blocks.12.mlp.0.base.weight
+freezing param: model.audio.blocks.12.mlp.0.base.bias
+training param: model.audio.blocks.12.mlp.0.lora.A_specific
+training param: model.audio.blocks.12.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.12.mlp.2.base.weight
+freezing param: model.audio.blocks.12.mlp.2.base.bias
+training param: model.audio.blocks.12.mlp.2.lora.A_specific
+training param: model.audio.blocks.12.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.12.mlp_ln.weight
+freezing param: model.audio.blocks.12.mlp_ln.bias
+freezing param: model.audio.blocks.13.attn.key.base.weight
+training param: model.audio.blocks.13.attn.key.lora.A_specific
+training param: model.audio.blocks.13.attn.key.lora.B_specific
+freezing param: model.audio.blocks.13.attn.value.base.weight
+freezing param: model.audio.blocks.13.attn.value.base.bias
+training param: model.audio.blocks.13.attn.value.lora.A_specific
+training param: model.audio.blocks.13.attn.value.lora.B_specific
+freezing param: model.audio.blocks.13.attn.query.base.weight
+freezing param: model.audio.blocks.13.attn.query.base.bias
+training param: model.audio.blocks.13.attn.query.lora.A_specific
+training param: model.audio.blocks.13.attn.query.lora.B_specific
+freezing param: model.audio.blocks.13.attn.out.base.weight
+freezing param: model.audio.blocks.13.attn.out.base.bias
+training param: model.audio.blocks.13.attn.out.lora.A_specific
+training param: model.audio.blocks.13.attn.out.lora.B_specific
+freezing param: model.audio.blocks.13.attn_ln.weight
+freezing param: model.audio.blocks.13.attn_ln.bias
+freezing param: model.audio.blocks.13.mlp.0.base.weight
+freezing param: model.audio.blocks.13.mlp.0.base.bias
+training param: model.audio.blocks.13.mlp.0.lora.A_specific
+training param: model.audio.blocks.13.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.13.mlp.2.base.weight
+freezing param: model.audio.blocks.13.mlp.2.base.bias
+training param: model.audio.blocks.13.mlp.2.lora.A_specific
+training param: model.audio.blocks.13.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.13.mlp_ln.weight
+freezing param: model.audio.blocks.13.mlp_ln.bias
+freezing param: model.audio.blocks.14.attn.key.base.weight
+training param: model.audio.blocks.14.attn.key.lora.A_specific
+training param: model.audio.blocks.14.attn.key.lora.B_specific
+freezing param: model.audio.blocks.14.attn.value.base.weight
+freezing param: model.audio.blocks.14.attn.value.base.bias
+training param: model.audio.blocks.14.attn.value.lora.A_specific
+training param: model.audio.blocks.14.attn.value.lora.B_specific
+freezing param: model.audio.blocks.14.attn.query.base.weight
+freezing param: model.audio.blocks.14.attn.query.base.bias
+training param: model.audio.blocks.14.attn.query.lora.A_specific
+training param: model.audio.blocks.14.attn.query.lora.B_specific
+freezing param: model.audio.blocks.14.attn.out.base.weight
+freezing param: model.audio.blocks.14.attn.out.base.bias
+training param: model.audio.blocks.14.attn.out.lora.A_specific
+training param: model.audio.blocks.14.attn.out.lora.B_specific
+freezing param: model.audio.blocks.14.attn_ln.weight
+freezing param: model.audio.blocks.14.attn_ln.bias
+freezing param: model.audio.blocks.14.mlp.0.base.weight
+freezing param: model.audio.blocks.14.mlp.0.base.bias
+training param: model.audio.blocks.14.mlp.0.lora.A_specific
+training param: model.audio.blocks.14.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.14.mlp.2.base.weight
+freezing param: model.audio.blocks.14.mlp.2.base.bias
+training param: model.audio.blocks.14.mlp.2.lora.A_specific
+training param: model.audio.blocks.14.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.14.mlp_ln.weight
+freezing param: model.audio.blocks.14.mlp_ln.bias
+freezing param: model.audio.blocks.15.attn.key.base.weight
+training param: model.audio.blocks.15.attn.key.lora.A_specific
+training param: model.audio.blocks.15.attn.key.lora.B_specific
+freezing param: model.audio.blocks.15.attn.value.base.weight
+freezing param: model.audio.blocks.15.attn.value.base.bias
+training param: model.audio.blocks.15.attn.value.lora.A_specific
+training param: model.audio.blocks.15.attn.value.lora.B_specific
+freezing param: model.audio.blocks.15.attn.query.base.weight
+freezing param: model.audio.blocks.15.attn.query.base.bias
+training param: model.audio.blocks.15.attn.query.lora.A_specific
+training param: model.audio.blocks.15.attn.query.lora.B_specific
+freezing param: model.audio.blocks.15.attn.out.base.weight
+freezing param: model.audio.blocks.15.attn.out.base.bias
+training param: model.audio.blocks.15.attn.out.lora.A_specific
+training param: model.audio.blocks.15.attn.out.lora.B_specific
+freezing param: model.audio.blocks.15.attn_ln.weight
+freezing param: model.audio.blocks.15.attn_ln.bias
+freezing param: model.audio.blocks.15.mlp.0.base.weight
+freezing param: model.audio.blocks.15.mlp.0.base.bias
+training param: model.audio.blocks.15.mlp.0.lora.A_specific
+training param: model.audio.blocks.15.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.15.mlp.2.base.weight
+freezing param: model.audio.blocks.15.mlp.2.base.bias
+training param: model.audio.blocks.15.mlp.2.lora.A_specific
+training param: model.audio.blocks.15.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.15.mlp_ln.weight
+freezing param: model.audio.blocks.15.mlp_ln.bias
+freezing param: model.audio.blocks.16.attn.key.base.weight
+training param: model.audio.blocks.16.attn.key.lora.A_specific
+training param: model.audio.blocks.16.attn.key.lora.B_specific
+freezing param: model.audio.blocks.16.attn.value.base.weight
+freezing param: model.audio.blocks.16.attn.value.base.bias
+training param: model.audio.blocks.16.attn.value.lora.A_specific
+training param: model.audio.blocks.16.attn.value.lora.B_specific
+freezing param: model.audio.blocks.16.attn.query.base.weight
+freezing param: model.audio.blocks.16.attn.query.base.bias
+training param: model.audio.blocks.16.attn.query.lora.A_specific
+training param: model.audio.blocks.16.attn.query.lora.B_specific
+freezing param: model.audio.blocks.16.attn.out.base.weight
+freezing param: model.audio.blocks.16.attn.out.base.bias
+training param: model.audio.blocks.16.attn.out.lora.A_specific
+training param: model.audio.blocks.16.attn.out.lora.B_specific
+freezing param: model.audio.blocks.16.attn_ln.weight
+freezing param: model.audio.blocks.16.attn_ln.bias
+freezing param: model.audio.blocks.16.mlp.0.base.weight
+freezing param: model.audio.blocks.16.mlp.0.base.bias
+training param: model.audio.blocks.16.mlp.0.lora.A_specific
+training param: model.audio.blocks.16.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.16.mlp.2.base.weight
+freezing param: model.audio.blocks.16.mlp.2.base.bias
+training param: model.audio.blocks.16.mlp.2.lora.A_specific
+training param: model.audio.blocks.16.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.16.mlp_ln.weight
+freezing param: model.audio.blocks.16.mlp_ln.bias
+freezing param: model.audio.blocks.17.attn.key.base.weight
+training param: model.audio.blocks.17.attn.key.lora.A_specific
+training param: model.audio.blocks.17.attn.key.lora.B_specific
+freezing param: model.audio.blocks.17.attn.value.base.weight
+freezing param: model.audio.blocks.17.attn.value.base.bias
+training param: model.audio.blocks.17.attn.value.lora.A_specific
+training param: model.audio.blocks.17.attn.value.lora.B_specific
+freezing param: model.audio.blocks.17.attn.query.base.weight
+freezing param: model.audio.blocks.17.attn.query.base.bias
+training param: model.audio.blocks.17.attn.query.lora.A_specific
+training param: model.audio.blocks.17.attn.query.lora.B_specific
+freezing param: model.audio.blocks.17.attn.out.base.weight
+freezing param: model.audio.blocks.17.attn.out.base.bias
+training param: model.audio.blocks.17.attn.out.lora.A_specific
+training param: model.audio.blocks.17.attn.out.lora.B_specific
+freezing param: model.audio.blocks.17.attn_ln.weight
+freezing param: model.audio.blocks.17.attn_ln.bias
+freezing param: model.audio.blocks.17.mlp.0.base.weight
+freezing param: model.audio.blocks.17.mlp.0.base.bias
+training param: model.audio.blocks.17.mlp.0.lora.A_specific
+training param: model.audio.blocks.17.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.17.mlp.2.base.weight
+freezing param: model.audio.blocks.17.mlp.2.base.bias
+training param: model.audio.blocks.17.mlp.2.lora.A_specific
+training param: model.audio.blocks.17.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.17.mlp_ln.weight
+freezing param: model.audio.blocks.17.mlp_ln.bias
+freezing param: model.audio.blocks.18.attn.key.base.weight
+training param: model.audio.blocks.18.attn.key.lora.A_specific
+training param: model.audio.blocks.18.attn.key.lora.B_specific
+freezing param: model.audio.blocks.18.attn.value.base.weight
+freezing param: model.audio.blocks.18.attn.value.base.bias
+training param: model.audio.blocks.18.attn.value.lora.A_specific
+training param: model.audio.blocks.18.attn.value.lora.B_specific
+freezing param: model.audio.blocks.18.attn.query.base.weight
+freezing param: model.audio.blocks.18.attn.query.base.bias
+training param: model.audio.blocks.18.attn.query.lora.A_specific
+training param: model.audio.blocks.18.attn.query.lora.B_specific
+freezing param: model.audio.blocks.18.attn.out.base.weight
+freezing param: model.audio.blocks.18.attn.out.base.bias
+training param: model.audio.blocks.18.attn.out.lora.A_specific
+training param: model.audio.blocks.18.attn.out.lora.B_specific
+freezing param: model.audio.blocks.18.attn_ln.weight
+freezing param: model.audio.blocks.18.attn_ln.bias
+freezing param: model.audio.blocks.18.mlp.0.base.weight
+freezing param: model.audio.blocks.18.mlp.0.base.bias
+training param: model.audio.blocks.18.mlp.0.lora.A_specific
+training param: model.audio.blocks.18.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.18.mlp.2.base.weight
+freezing param: model.audio.blocks.18.mlp.2.base.bias
+training param: model.audio.blocks.18.mlp.2.lora.A_specific
+training param: model.audio.blocks.18.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.18.mlp_ln.weight
+freezing param: model.audio.blocks.18.mlp_ln.bias
+freezing param: model.audio.blocks.19.attn.key.base.weight
+training param: model.audio.blocks.19.attn.key.lora.A_specific
+training param: model.audio.blocks.19.attn.key.lora.B_specific
+freezing param: model.audio.blocks.19.attn.value.base.weight
+freezing param: model.audio.blocks.19.attn.value.base.bias
+training param: model.audio.blocks.19.attn.value.lora.A_specific
+training param: model.audio.blocks.19.attn.value.lora.B_specific
+freezing param: model.audio.blocks.19.attn.query.base.weight
+freezing param: model.audio.blocks.19.attn.query.base.bias
+training param: model.audio.blocks.19.attn.query.lora.A_specific
+training param: model.audio.blocks.19.attn.query.lora.B_specific
+freezing param: model.audio.blocks.19.attn.out.base.weight
+freezing param: model.audio.blocks.19.attn.out.base.bias
+training param: model.audio.blocks.19.attn.out.lora.A_specific
+training param: model.audio.blocks.19.attn.out.lora.B_specific
+freezing param: model.audio.blocks.19.attn_ln.weight
+freezing param: model.audio.blocks.19.attn_ln.bias
+freezing param: model.audio.blocks.19.mlp.0.base.weight
+freezing param: model.audio.blocks.19.mlp.0.base.bias
+training param: model.audio.blocks.19.mlp.0.lora.A_specific
+training param: model.audio.blocks.19.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.19.mlp.2.base.weight
+freezing param: model.audio.blocks.19.mlp.2.base.bias
+training param: model.audio.blocks.19.mlp.2.lora.A_specific
+training param: model.audio.blocks.19.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.19.mlp_ln.weight
+freezing param: model.audio.blocks.19.mlp_ln.bias
+freezing param: model.audio.blocks.20.attn.key.base.weight
+training param: model.audio.blocks.20.attn.key.lora.A_specific
+training param: model.audio.blocks.20.attn.key.lora.B_specific
+freezing param: model.audio.blocks.20.attn.value.base.weight
+freezing param: model.audio.blocks.20.attn.value.base.bias
+training param: model.audio.blocks.20.attn.value.lora.A_specific
+training param: model.audio.blocks.20.attn.value.lora.B_specific
+freezing param: model.audio.blocks.20.attn.query.base.weight
+freezing param: model.audio.blocks.20.attn.query.base.bias
+training param: model.audio.blocks.20.attn.query.lora.A_specific
+training param: model.audio.blocks.20.attn.query.lora.B_specific
+freezing param: model.audio.blocks.20.attn.out.base.weight
+freezing param: model.audio.blocks.20.attn.out.base.bias
+training param: model.audio.blocks.20.attn.out.lora.A_specific
+training param: model.audio.blocks.20.attn.out.lora.B_specific
+freezing param: model.audio.blocks.20.attn_ln.weight
+freezing param: model.audio.blocks.20.attn_ln.bias
+freezing param: model.audio.blocks.20.mlp.0.base.weight
+freezing param: model.audio.blocks.20.mlp.0.base.bias
+training param: model.audio.blocks.20.mlp.0.lora.A_specific
+training param: model.audio.blocks.20.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.20.mlp.2.base.weight
+freezing param: model.audio.blocks.20.mlp.2.base.bias
+training param: model.audio.blocks.20.mlp.2.lora.A_specific
+training param: model.audio.blocks.20.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.20.mlp_ln.weight
+freezing param: model.audio.blocks.20.mlp_ln.bias
+freezing param: model.audio.blocks.21.attn.key.base.weight
+training param: model.audio.blocks.21.attn.key.lora.A_specific
+training param: model.audio.blocks.21.attn.key.lora.B_specific
+freezing param: model.audio.blocks.21.attn.value.base.weight
+freezing param: model.audio.blocks.21.attn.value.base.bias
+training param: model.audio.blocks.21.attn.value.lora.A_specific
+training param: model.audio.blocks.21.attn.value.lora.B_specific
+freezing param: model.audio.blocks.21.attn.query.base.weight
+freezing param: model.audio.blocks.21.attn.query.base.bias
+training param: model.audio.blocks.21.attn.query.lora.A_specific
+training param: model.audio.blocks.21.attn.query.lora.B_specific
+freezing param: model.audio.blocks.21.attn.out.base.weight
+freezing param: model.audio.blocks.21.attn.out.base.bias
+training param: model.audio.blocks.21.attn.out.lora.A_specific
+training param: model.audio.blocks.21.attn.out.lora.B_specific
+freezing param: model.audio.blocks.21.attn_ln.weight
+freezing param: model.audio.blocks.21.attn_ln.bias
+freezing param: model.audio.blocks.21.mlp.0.base.weight
+freezing param: model.audio.blocks.21.mlp.0.base.bias
+training param: model.audio.blocks.21.mlp.0.lora.A_specific
+training param: model.audio.blocks.21.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.21.mlp.2.base.weight
+freezing param: model.audio.blocks.21.mlp.2.base.bias
+training param: model.audio.blocks.21.mlp.2.lora.A_specific
+training param: model.audio.blocks.21.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.21.mlp_ln.weight
+freezing param: model.audio.blocks.21.mlp_ln.bias
+freezing param: model.audio.blocks.22.attn.key.base.weight
+training param: model.audio.blocks.22.attn.key.lora.A_specific
+training param: model.audio.blocks.22.attn.key.lora.B_specific
+freezing param: model.audio.blocks.22.attn.value.base.weight
+freezing param: model.audio.blocks.22.attn.value.base.bias
+training param: model.audio.blocks.22.attn.value.lora.A_specific
+training param: model.audio.blocks.22.attn.value.lora.B_specific
+freezing param: model.audio.blocks.22.attn.query.base.weight
+freezing param: model.audio.blocks.22.attn.query.base.bias
+training param: model.audio.blocks.22.attn.query.lora.A_specific
+training param: model.audio.blocks.22.attn.query.lora.B_specific
+freezing param: model.audio.blocks.22.attn.out.base.weight
+freezing param: model.audio.blocks.22.attn.out.base.bias
+training param: model.audio.blocks.22.attn.out.lora.A_specific
+training param: model.audio.blocks.22.attn.out.lora.B_specific
+freezing param: model.audio.blocks.22.attn_ln.weight
+freezing param: model.audio.blocks.22.attn_ln.bias
+freezing param: model.audio.blocks.22.mlp.0.base.weight
+freezing param: model.audio.blocks.22.mlp.0.base.bias
+training param: model.audio.blocks.22.mlp.0.lora.A_specific
+training param: model.audio.blocks.22.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.22.mlp.2.base.weight
+freezing param: model.audio.blocks.22.mlp.2.base.bias
+training param: model.audio.blocks.22.mlp.2.lora.A_specific
+training param: model.audio.blocks.22.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.22.mlp_ln.weight
+freezing param: model.audio.blocks.22.mlp_ln.bias
+freezing param: model.audio.blocks.23.attn.key.base.weight
+training param: model.audio.blocks.23.attn.key.lora.A_specific
+training param: model.audio.blocks.23.attn.key.lora.B_specific
+freezing param: model.audio.blocks.23.attn.value.base.weight
+freezing param: model.audio.blocks.23.attn.value.base.bias
+training param: model.audio.blocks.23.attn.value.lora.A_specific
+training param: model.audio.blocks.23.attn.value.lora.B_specific
+freezing param: model.audio.blocks.23.attn.query.base.weight
+freezing param: model.audio.blocks.23.attn.query.base.bias
+training param: model.audio.blocks.23.attn.query.lora.A_specific
+training param: model.audio.blocks.23.attn.query.lora.B_specific
+freezing param: model.audio.blocks.23.attn.out.base.weight
+freezing param: model.audio.blocks.23.attn.out.base.bias
+training param: model.audio.blocks.23.attn.out.lora.A_specific
+training param: model.audio.blocks.23.attn.out.lora.B_specific
+freezing param: model.audio.blocks.23.attn_ln.weight
+freezing param: model.audio.blocks.23.attn_ln.bias
+freezing param: model.audio.blocks.23.mlp.0.base.weight
+freezing param: model.audio.blocks.23.mlp.0.base.bias
+training param: model.audio.blocks.23.mlp.0.lora.A_specific
+training param: model.audio.blocks.23.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.23.mlp.2.base.weight
+freezing param: model.audio.blocks.23.mlp.2.base.bias
+training param: model.audio.blocks.23.mlp.2.lora.A_specific
+training param: model.audio.blocks.23.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.23.mlp_ln.weight
+freezing param: model.audio.blocks.23.mlp_ln.bias
+freezing param: model.audio.blocks.24.attn.key.base.weight
+training param: model.audio.blocks.24.attn.key.lora.A_specific
+training param: model.audio.blocks.24.attn.key.lora.B_specific
+freezing param: model.audio.blocks.24.attn.value.base.weight
+freezing param: model.audio.blocks.24.attn.value.base.bias
+training param: model.audio.blocks.24.attn.value.lora.A_specific
+training param: model.audio.blocks.24.attn.value.lora.B_specific
+freezing param: model.audio.blocks.24.attn.query.base.weight
+freezing param: model.audio.blocks.24.attn.query.base.bias
+training param: model.audio.blocks.24.attn.query.lora.A_specific
+training param: model.audio.blocks.24.attn.query.lora.B_specific
+freezing param: model.audio.blocks.24.attn.out.base.weight
+freezing param: model.audio.blocks.24.attn.out.base.bias
+training param: model.audio.blocks.24.attn.out.lora.A_specific
+training param: model.audio.blocks.24.attn.out.lora.B_specific
+freezing param: model.audio.blocks.24.attn_ln.weight
+freezing param: model.audio.blocks.24.attn_ln.bias
+freezing param: model.audio.blocks.24.mlp.0.base.weight
+freezing param: model.audio.blocks.24.mlp.0.base.bias
+training param: model.audio.blocks.24.mlp.0.lora.A_specific
+training param: model.audio.blocks.24.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.24.mlp.2.base.weight
+freezing param: model.audio.blocks.24.mlp.2.base.bias
+training param: model.audio.blocks.24.mlp.2.lora.A_specific
+training param: model.audio.blocks.24.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.24.mlp_ln.weight
+freezing param: model.audio.blocks.24.mlp_ln.bias
+freezing param: model.audio.blocks.25.attn.key.base.weight
+training param: model.audio.blocks.25.attn.key.lora.A_specific
+training param: model.audio.blocks.25.attn.key.lora.B_specific
+freezing param: model.audio.blocks.25.attn.value.base.weight
+freezing param: model.audio.blocks.25.attn.value.base.bias
+training param: model.audio.blocks.25.attn.value.lora.A_specific
+training param: model.audio.blocks.25.attn.value.lora.B_specific
+freezing param: model.audio.blocks.25.attn.query.base.weight
+freezing param: model.audio.blocks.25.attn.query.base.bias
+training param: model.audio.blocks.25.attn.query.lora.A_specific
+training param: model.audio.blocks.25.attn.query.lora.B_specific
+freezing param: model.audio.blocks.25.attn.out.base.weight
+freezing param: model.audio.blocks.25.attn.out.base.bias
+training param: model.audio.blocks.25.attn.out.lora.A_specific
+training param: model.audio.blocks.25.attn.out.lora.B_specific
+freezing param: model.audio.blocks.25.attn_ln.weight
+freezing param: model.audio.blocks.25.attn_ln.bias
+freezing param: model.audio.blocks.25.mlp.0.base.weight
+freezing param: model.audio.blocks.25.mlp.0.base.bias
+training param: model.audio.blocks.25.mlp.0.lora.A_specific
+training param: model.audio.blocks.25.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.25.mlp.2.base.weight
+freezing param: model.audio.blocks.25.mlp.2.base.bias
+training param: model.audio.blocks.25.mlp.2.lora.A_specific
+training param: model.audio.blocks.25.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.25.mlp_ln.weight
+freezing param: model.audio.blocks.25.mlp_ln.bias
+freezing param: model.audio.blocks.26.attn.key.base.weight
+training param: model.audio.blocks.26.attn.key.lora.A_specific
+training param: model.audio.blocks.26.attn.key.lora.B_specific
+freezing param: model.audio.blocks.26.attn.value.base.weight
+freezing param: model.audio.blocks.26.attn.value.base.bias
+training param: model.audio.blocks.26.attn.value.lora.A_specific
+training param: model.audio.blocks.26.attn.value.lora.B_specific
+freezing param: model.audio.blocks.26.attn.query.base.weight
+freezing param: model.audio.blocks.26.attn.query.base.bias
+training param: model.audio.blocks.26.attn.query.lora.A_specific
+training param: model.audio.blocks.26.attn.query.lora.B_specific
+freezing param: model.audio.blocks.26.attn.out.base.weight
+freezing param: model.audio.blocks.26.attn.out.base.bias
+training param: model.audio.blocks.26.attn.out.lora.A_specific
+training param: model.audio.blocks.26.attn.out.lora.B_specific
+freezing param: model.audio.blocks.26.attn_ln.weight
+freezing param: model.audio.blocks.26.attn_ln.bias
+freezing param: model.audio.blocks.26.mlp.0.base.weight
+freezing param: model.audio.blocks.26.mlp.0.base.bias
+training param: model.audio.blocks.26.mlp.0.lora.A_specific
+training param: model.audio.blocks.26.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.26.mlp.2.base.weight
+freezing param: model.audio.blocks.26.mlp.2.base.bias
+training param: model.audio.blocks.26.mlp.2.lora.A_specific
+training param: model.audio.blocks.26.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.26.mlp_ln.weight
+freezing param: model.audio.blocks.26.mlp_ln.bias
+freezing param: model.audio.blocks.27.attn.key.base.weight
+training param: model.audio.blocks.27.attn.key.lora.A_specific
+training param: model.audio.blocks.27.attn.key.lora.B_specific
+freezing param: model.audio.blocks.27.attn.value.base.weight
+freezing param: model.audio.blocks.27.attn.value.base.bias
+training param: model.audio.blocks.27.attn.value.lora.A_specific
+training param: model.audio.blocks.27.attn.value.lora.B_specific
+freezing param: model.audio.blocks.27.attn.query.base.weight
+freezing param: model.audio.blocks.27.attn.query.base.bias
+training param: model.audio.blocks.27.attn.query.lora.A_specific
+training param: model.audio.blocks.27.attn.query.lora.B_specific
+freezing param: model.audio.blocks.27.attn.out.base.weight
+freezing param: model.audio.blocks.27.attn.out.base.bias
+training param: model.audio.blocks.27.attn.out.lora.A_specific
+training param: model.audio.blocks.27.attn.out.lora.B_specific
+freezing param: model.audio.blocks.27.attn_ln.weight
+freezing param: model.audio.blocks.27.attn_ln.bias
+freezing param: model.audio.blocks.27.mlp.0.base.weight
+freezing param: model.audio.blocks.27.mlp.0.base.bias
+training param: model.audio.blocks.27.mlp.0.lora.A_specific
+training param: model.audio.blocks.27.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.27.mlp.2.base.weight
+freezing param: model.audio.blocks.27.mlp.2.base.bias
+training param: model.audio.blocks.27.mlp.2.lora.A_specific
+training param: model.audio.blocks.27.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.27.mlp_ln.weight
+freezing param: model.audio.blocks.27.mlp_ln.bias
+freezing param: model.audio.blocks.28.attn.key.base.weight
+training param: model.audio.blocks.28.attn.key.lora.A_specific
+training param: model.audio.blocks.28.attn.key.lora.B_specific
+freezing param: model.audio.blocks.28.attn.value.base.weight
+freezing param: model.audio.blocks.28.attn.value.base.bias
+training param: model.audio.blocks.28.attn.value.lora.A_specific
+training param: model.audio.blocks.28.attn.value.lora.B_specific
+freezing param: model.audio.blocks.28.attn.query.base.weight
+freezing param: model.audio.blocks.28.attn.query.base.bias
+training param: model.audio.blocks.28.attn.query.lora.A_specific
+training param: model.audio.blocks.28.attn.query.lora.B_specific
+freezing param: model.audio.blocks.28.attn.out.base.weight
+freezing param: model.audio.blocks.28.attn.out.base.bias
+training param: model.audio.blocks.28.attn.out.lora.A_specific
+training param: model.audio.blocks.28.attn.out.lora.B_specific
+freezing param: model.audio.blocks.28.attn_ln.weight
+freezing param: model.audio.blocks.28.attn_ln.bias
+freezing param: model.audio.blocks.28.mlp.0.base.weight
+freezing param: model.audio.blocks.28.mlp.0.base.bias
+training param: model.audio.blocks.28.mlp.0.lora.A_specific
+training param: model.audio.blocks.28.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.28.mlp.2.base.weight
+freezing param: model.audio.blocks.28.mlp.2.base.bias
+training param: model.audio.blocks.28.mlp.2.lora.A_specific
+training param: model.audio.blocks.28.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.28.mlp_ln.weight
+freezing param: model.audio.blocks.28.mlp_ln.bias
+freezing param: model.audio.blocks.29.attn.key.base.weight
+training param: model.audio.blocks.29.attn.key.lora.A_specific
+training param: model.audio.blocks.29.attn.key.lora.B_specific
+freezing param: model.audio.blocks.29.attn.value.base.weight
+freezing param: model.audio.blocks.29.attn.value.base.bias
+training param: model.audio.blocks.29.attn.value.lora.A_specific
+training param: model.audio.blocks.29.attn.value.lora.B_specific
+freezing param: model.audio.blocks.29.attn.query.base.weight
+freezing param: model.audio.blocks.29.attn.query.base.bias
+training param: model.audio.blocks.29.attn.query.lora.A_specific
+training param: model.audio.blocks.29.attn.query.lora.B_specific
+freezing param: model.audio.blocks.29.attn.out.base.weight
+freezing param: model.audio.blocks.29.attn.out.base.bias
+training param: model.audio.blocks.29.attn.out.lora.A_specific
+training param: model.audio.blocks.29.attn.out.lora.B_specific
+freezing param: model.audio.blocks.29.attn_ln.weight
+freezing param: model.audio.blocks.29.attn_ln.bias
+freezing param: model.audio.blocks.29.mlp.0.base.weight
+freezing param: model.audio.blocks.29.mlp.0.base.bias
+training param: model.audio.blocks.29.mlp.0.lora.A_specific
+training param: model.audio.blocks.29.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.29.mlp.2.base.weight
+freezing param: model.audio.blocks.29.mlp.2.base.bias
+training param: model.audio.blocks.29.mlp.2.lora.A_specific
+training param: model.audio.blocks.29.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.29.mlp_ln.weight
+freezing param: model.audio.blocks.29.mlp_ln.bias
+freezing param: model.audio.blocks.30.attn.key.base.weight
+training param: model.audio.blocks.30.attn.key.lora.A_specific
+training param: model.audio.blocks.30.attn.key.lora.B_specific
+freezing param: model.audio.blocks.30.attn.value.base.weight
+freezing param: model.audio.blocks.30.attn.value.base.bias
+training param: model.audio.blocks.30.attn.value.lora.A_specific
+training param: model.audio.blocks.30.attn.value.lora.B_specific
+freezing param: model.audio.blocks.30.attn.query.base.weight
+freezing param: model.audio.blocks.30.attn.query.base.bias
+training param: model.audio.blocks.30.attn.query.lora.A_specific
+training param: model.audio.blocks.30.attn.query.lora.B_specific
+freezing param: model.audio.blocks.30.attn.out.base.weight
+freezing param: model.audio.blocks.30.attn.out.base.bias
+training param: model.audio.blocks.30.attn.out.lora.A_specific
+training param: model.audio.blocks.30.attn.out.lora.B_specific
+freezing param: model.audio.blocks.30.attn_ln.weight
+freezing param: model.audio.blocks.30.attn_ln.bias
+freezing param: model.audio.blocks.30.mlp.0.base.weight
+freezing param: model.audio.blocks.30.mlp.0.base.bias
+training param: model.audio.blocks.30.mlp.0.lora.A_specific
+training param: model.audio.blocks.30.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.30.mlp.2.base.weight
+freezing param: model.audio.blocks.30.mlp.2.base.bias
+training param: model.audio.blocks.30.mlp.2.lora.A_specific
+training param: model.audio.blocks.30.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.30.mlp_ln.weight
+freezing param: model.audio.blocks.30.mlp_ln.bias
+freezing param: model.audio.blocks.31.attn.key.base.weight
+training param: model.audio.blocks.31.attn.key.lora.A_specific
+training param: model.audio.blocks.31.attn.key.lora.B_specific
+freezing param: model.audio.blocks.31.attn.value.base.weight
+freezing param: model.audio.blocks.31.attn.value.base.bias
+training param: model.audio.blocks.31.attn.value.lora.A_specific
+training param: model.audio.blocks.31.attn.value.lora.B_specific
+freezing param: model.audio.blocks.31.attn.query.base.weight
+freezing param: model.audio.blocks.31.attn.query.base.bias
+training param: model.audio.blocks.31.attn.query.lora.A_specific
+training param: model.audio.blocks.31.attn.query.lora.B_specific
+freezing param: model.audio.blocks.31.attn.out.base.weight
+freezing param: model.audio.blocks.31.attn.out.base.bias
+training param: model.audio.blocks.31.attn.out.lora.A_specific
+training param: model.audio.blocks.31.attn.out.lora.B_specific
+freezing param: model.audio.blocks.31.attn_ln.weight
+freezing param: model.audio.blocks.31.attn_ln.bias
+freezing param: model.audio.blocks.31.mlp.0.base.weight
+freezing param: model.audio.blocks.31.mlp.0.base.bias
+training param: model.audio.blocks.31.mlp.0.lora.A_specific
+training param: model.audio.blocks.31.mlp.0.lora.B_specific
+freezing param: model.audio.blocks.31.mlp.2.base.weight
+freezing param: model.audio.blocks.31.mlp.2.base.bias
+training param: model.audio.blocks.31.mlp.2.lora.A_specific
+training param: model.audio.blocks.31.mlp.2.lora.B_specific
+freezing param: model.audio.blocks.31.mlp_ln.weight
+freezing param: model.audio.blocks.31.mlp_ln.bias
+training param: model.audio.ln_post.weight
+training param: model.audio.ln_post.bias
+training param: model.audio.conv_proj.gate_proj.weight
+training param: model.audio.conv_proj.up_proj.weight
+training param: model.audio.conv_proj.down_proj.weight
+training param: model.audio.conv_proj.layer_norm.weight
+training param: model.audio.conv_proj.layer_norm.bias
+training param: model.audio.conv_proj.proj_decoder.weight
+training param: model.audio.conv_proj.proj_decoder.bias
+training param: model.audio.proj.weight
+training param: model.audio.proj.bias
+training param: model.audio.audio_bos_eos_token.weight
+freezing param: model.audio.zipper_shared_embedding.weight
+param trainable: 405819136, frozen: 2362249728
+Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
+Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
+Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
+Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
+Detected CUDA files, patching ldflags
+Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...
+/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1967: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
+If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
+  warnings.warn(
+Building extension module fused_adam...
+Using envvar MAX_JOBS (16) as the number of workers...
+ninja: no work to do.
+Loading extension module fused_adam...
+Time to load fused_adam op: 0.16039204597473145 seconds
+/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
+  self._dummy_overflow_buf = get_accelerator().IntTensor([0])
+2026-02-06 15:55:38.135 | INFO     | __main__:train:264 - sample size: 3217
+Loading extension module fused_adam...
+Time to load fused_adam op: 0.20172905921936035 seconds
+/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
+  self._dummy_overflow_buf = get_accelerator().IntTensor([0])
+2026-02-06 15:55:38.199 | INFO     | __main__:train:264 - sample size: 3217
+Loading extension module fused_adam...
+Time to load fused_adam op: 0.20147418975830078 seconds
+Loading extension module fused_adam...
+Time to load fused_adam op: 0.20134687423706055 seconds
+/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
+  self._dummy_overflow_buf = get_accelerator().IntTensor([0])
+2026-02-06 15:55:38.236 | INFO     | __main__:train:264 - sample size: 3217
+/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
+  self._dummy_overflow_buf = get_accelerator().IntTensor([0])
+2026-02-06 15:55:38.319 | INFO     | __main__:train:264 - sample size: 3217
+[2026-02-06 15:55:38,330] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.2, git-hash=unknown, git-branch=unknown
+[2026-02-06 15:55:38,330] [INFO] [comm.py:662:init_distributed] Distributed backend already initialized
+[2026-02-06 15:55:41,839] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2026-02-06 15:55:41,847] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2026-02-06 15:55:41,847] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2026-02-06 15:55:42,030] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
+[2026-02-06 15:55:42,030] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
+[2026-02-06 15:55:42,030] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer
+[2026-02-06 15:55:42,030] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 500,000,000
+[2026-02-06 15:55:42,030] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 500,000,000
+[2026-02-06 15:55:42,030] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False
+[2026-02-06 15:55:42,030] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False
+[2026-02-06 15:55:59,616] [WARNING] [engine.py:2740:load_checkpoint] Unable to find latest file at /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
+2026-02-06 15:56:00.218 | INFO     | trainer.unigpt_pretrain_trainer:fit:130 - [Zipper LoRA] Freeze B weights for 0 steps.
+[2026-02-06 15:56:03,224] [WARNING] [engine.py:2740:load_checkpoint] Unable to find latest file at /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
+[2026-02-06 15:56:03,278] [WARNING] [engine.py:2740:load_checkpoint] Unable to find latest file at /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
+[2026-02-06 15:56:03,417] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
+[2026-02-06 15:56:03,418] [INFO] [utils.py:801:see_memory_usage] MA 5.54 GB         Max_MA 5.54 GB         CA 6.16 GB         Max_CA 6 GB 
+[2026-02-06 15:56:03,419] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 0.0 GB, percent = 0.0%
+2026-02-06 15:56:03.423 | INFO     | trainer.unigpt_pretrain_trainer:fit:130 - [Zipper LoRA] Freeze B weights for 0 steps.
+2026-02-06 15:56:03.477 | INFO     | trainer.unigpt_pretrain_trainer:fit:130 - [Zipper LoRA] Freeze B weights for 0 steps.
+[2026-02-06 15:56:03,595] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
+[2026-02-06 15:56:03,596] [INFO] [utils.py:801:see_memory_usage] MA 5.54 GB         Max_MA 5.92 GB         CA 6.53 GB         Max_CA 7 GB 
+[2026-02-06 15:56:03,597] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 0.0 GB, percent = 0.0%
+[2026-02-06 15:56:03,597] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized
+[2026-02-06 15:56:03,763] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
+[2026-02-06 15:56:03,764] [INFO] [utils.py:801:see_memory_usage] MA 5.54 GB         Max_MA 5.54 GB         CA 6.53 GB         Max_CA 7 GB 
+[2026-02-06 15:56:03,764] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 0.0 GB, percent = 0.0%
+[2026-02-06 15:56:03,777] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
+[2026-02-06 15:56:03,777] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
+[2026-02-06 15:56:03,777] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7f3e9cabd5d0>
+[2026-02-06 15:56:03,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 15:56:03,783] [INFO] [config.py:987:print] DeepSpeedEngine configuration:
+[2026-02-06 15:56:03,783] [INFO] [config.py:991:print]   activation_checkpointing_config  {
+    "partition_activations": false, 
+    "contiguous_memory_optimization": false, 
+    "cpu_checkpointing": false, 
+    "number_checkpoints": null, 
+    "synchronize_checkpoint_boundary": false, 
+    "profile": false
+}
+[2026-02-06 15:56:03,783] [INFO] [config.py:991:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2026-02-06 15:56:03,783] [INFO] [config.py:991:print]   amp_enabled .................. False
+[2026-02-06 15:56:03,783] [INFO] [config.py:991:print]   amp_params ................... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   autotuning_config ............ {
+    "enabled": false, 
+    "start_step": null, 
+    "end_step": null, 
+    "metric_path": null, 
+    "arg_mappings": null, 
+    "metric": "throughput", 
+    "model_info": null, 
+    "results_dir": "autotuning_results", 
+    "exps_dir": "autotuning_exps", 
+    "overwrite": true, 
+    "fast": true, 
+    "start_profile_step": 3, 
+    "end_profile_step": 5, 
+    "tuner_type": "gridsearch", 
+    "tuner_early_stopping": 5, 
+    "tuner_num_trials": 50, 
+    "model_info_path": null, 
+    "mp_size": 1, 
+    "max_train_batch_size": null, 
+    "min_train_batch_size": 1, 
+    "max_train_micro_batch_size_per_gpu": 1.024000e+03, 
+    "min_train_micro_batch_size_per_gpu": 1, 
+    "num_tuning_micro_batch_sizes": 3
+}
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   bfloat16_enabled ............. True
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   checkpoint_parallel_write_pipeline  False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   checkpoint_tag_validation_enabled  True
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   checkpoint_tag_validation_fail  False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f3e83acae60>
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   communication_data_type ...... None
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   compile_config ............... enabled=False backend='inductor' kwargs={}
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   curriculum_enabled_legacy .... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   curriculum_params_legacy ..... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   data_efficiency_enabled ...... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   dataloader_drop_last ......... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   disable_allgather ............ False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   dump_state ................... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   dynamic_loss_scale_args ...... None
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_enabled ........... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_gas_boundary_resolution  1
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_layer_name ........ bert.encoder.layer
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_layer_num ......... 0
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_max_iter .......... 100
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_stability ......... 1e-06
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_tol ............... 0.01
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   eigenvalue_verbose ........... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   elasticity_enabled ........... False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   flops_profiler_config ........ {
+    "enabled": false, 
+    "recompute_fwd_factor": 0.0, 
+    "profile_step": 1, 
+    "module_depth": -1, 
+    "top_modules": 1, 
+    "detailed": true, 
+    "output_file": null
+}
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   fp16_auto_cast ............... None
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   fp16_enabled ................. False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   fp16_master_weights_and_gradients  False
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   global_rank .................. 0
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   grad_accum_dtype ............. fp32
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   gradient_accumulation_steps .. 2
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   gradient_clipping ............ 1.0
+[2026-02-06 15:56:03,784] [INFO] [config.py:991:print]   gradient_predivide_factor .... 1.0
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   graph_harvesting ............. False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   initial_dynamic_scale ........ 1
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   load_universal_checkpoint .... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   loss_scale ................... 1.0
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   memory_breakdown ............. False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   mics_hierarchial_params_gather  False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   mics_shard_size .............. -1
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   nebula_config ................ {
+    "enabled": false, 
+    "persistent_storage_path": null, 
+    "persistent_time_interval": 100, 
+    "num_of_version_in_retention": 2, 
+    "enable_nebula_load": true, 
+    "load_path": null
+}
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   optimizer_legacy_fusion ...... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   optimizer_name ............... None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   optimizer_params ............. None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   pld_enabled .................. False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   pld_params ................... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   prescale_gradients ........... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   scheduler_name ............... None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   scheduler_params ............. None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   seq_parallel_communication_data_type  torch.float32
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   sparse_attention ............. None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   sparse_gradients_enabled ..... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   steps_per_print .............. 100
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   train_batch_size ............. 80
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   train_micro_batch_size_per_gpu  10
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   use_data_before_expert_parallel_  False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   use_node_local_storage ....... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   wall_clock_breakdown ......... False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   weight_quantization_config ... None
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   world_size ................... 4
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   zero_allow_untested_optimizer  False
+[2026-02-06 15:56:03,785] [INFO] [config.py:991:print]   zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2026-02-06 15:56:03,786] [INFO] [config.py:991:print]   zero_enabled ................. True
+[2026-02-06 15:56:03,786] [INFO] [config.py:991:print]   zero_force_ds_cpu_optimizer .. True
+[2026-02-06 15:56:03,786] [INFO] [config.py:991:print]   zero_optimization_stage ...... 2
+[2026-02-06 15:56:03,787] [INFO] [config.py:977:print_user_config]   json = {
+    "steps_per_print": 100, 
+    "zero_optimization": {
+        "stage": 2, 
+        "offload_param": {
+            "device": "none"
+        }, 
+        "offload_optimizer": {
+            "device": "none", 
+            "pin_memory": true
+        }, 
+        "sub_group_size": "auto", 
+        "stage3_max_live_parameters": "auto", 
+        "stage3_max_reuse_distance": "auto", 
+        "stage3_param_persistence_threshold": "auto", 
+        "stage3_prefetch_bucket_size": "auto", 
+        "reduce_bucket_size": "auto", 
+        "zero_hpz_partition_size": 1, 
+        "zero_quantized_weights": false, 
+        "zero_quantized_gradients": false
+    }, 
+    "bf16": {
+        "enabled": true
+    }, 
+    "gradient_clipping": 1.0, 
+    "prescale_gradients": false, 
+    "wall_clock_breakdown": false, 
+    "data_types": {
+        "grad_accum_dtype": "fp32"
+    }, 
+    "train_micro_batch_size_per_gpu": 10, 
+    "train_batch_size": 80
+}
+Load checkpoint:  /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked
+[2026-02-06 15:56:03,787] [WARNING] [engine.py:2740:load_checkpoint] Unable to find latest file at /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
+> setting tensorboard ...
+2026-02-06 15:56:03.969 | INFO     | trainer.unigpt_pretrain_trainer:fit:130 - [Zipper LoRA] Freeze B weights for 0 steps.
+Train epoch:   0%|          | 0/3 [00:00<?, ?it/s]
+Train step of epoch 0:   0%|          | 0/6434 [00:00<?, ?it/s][A
+Train step of epoch 0:   0%|          | 0/6434 [00:20<?, ?it/s, gpt_loss=0.67, loss_mean=0.067][A
+Train step of epoch 0:   0%|          | 1/6434 [00:20<36:18:47, 20.32s/it, gpt_loss=0.67, loss_mean=0.067][A
+Train step of epoch 0:   0%|          | 1/6434 [00:28<36:18:47, 20.32s/it, gpt_loss=1.08, loss_mean=0.168][A
+Train step of epoch 0:   0%|          | 2/6434 [00:28<23:29:02, 13.14s/it, gpt_loss=1.08, loss_mean=0.168][A
+Train step of epoch 0:   0%|          | 2/6434 [00:36<23:29:02, 13.14s/it, gpt_loss=0.802, loss_mean=0.232][A
+Train step of epoch 0:   0%|          | 3/6434 [00:36<19:34:21, 10.96s/it, gpt_loss=0.802, loss_mean=0.232][A
+Train step of epoch 0:   0%|          | 3/6434 [00:45<19:34:21, 10.96s/it, gpt_loss=0.738, loss_mean=0.282][A
+Train step of epoch 0:   0%|          | 4/6434 [00:45<18:00:25, 10.08s/it, gpt_loss=0.738, loss_mean=0.282][A
+Train step of epoch 0:   0%|          | 4/6434 [00:54<18:00:25, 10.08s/it, gpt_loss=0.7, loss_mean=0.324]  [A
+Train step of epoch 0:   0%|          | 5/6434 [00:54<17:22:51,  9.73s/it, gpt_loss=0.7, loss_mean=0.324][A
+Train step of epoch 0:   0%|          | 5/6434 [01:02<17:22:51,  9.73s/it, gpt_loss=0.629, loss_mean=0.355][A
+Train step of epoch 0:   0%|          | 6/6434 [01:02<16:02:13,  8.98s/it, gpt_loss=0.629, loss_mean=0.355][A
+Train step of epoch 0:   0%|          | 6/6434 [01:09<16:02:13,  8.98s/it, gpt_loss=0.684, loss_mean=0.387][A
+Train step of epoch 0:   0%|          | 7/6434 [01:09<15:19:58,  8.59s/it, gpt_loss=0.684, loss_mean=0.387][A
+Train step of epoch 0:   0%|          | 7/6434 [01:17<15:19:58,  8.59s/it, gpt_loss=0.749, loss_mean=0.424][A
+Train step of epoch 0:   0%|          | 8/6434 [01:17<15:00:56,  8.41s/it, gpt_loss=0.749, loss_mean=0.424][A
+Train step of epoch 0:   0%|          | 8/6434 [01:26<15:00:56,  8.41s/it, gpt_loss=0.482, loss_mean=0.429][A
+Train step of epoch 0:   0%|          | 9/6434 [01:26<14:55:46,  8.37s/it, gpt_loss=0.482, loss_mean=0.429][A
+[LID Router Debug] Step: 10
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [9, 2, 4, 0, 9, 3, 1, 7, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 7, 9}
+
+Train step of epoch 0:   0%|          | 9/6434 [01:33<14:55:46,  8.37s/it, gpt_loss=0.601, loss_mean=0.447][A
+Train step of epoch 0:   0%|          | 10/6434 [01:33<14:20:31,  8.04s/it, gpt_loss=0.601, loss_mean=0.447][A
+Train step of epoch 0:   0%|          | 10/6434 [01:41<14:20:31,  8.04s/it, gpt_loss=0.751, loss_mean=0.477][A
+Train step of epoch 0:   0%|          | 11/6434 [01:41<14:32:25,  8.15s/it, gpt_loss=0.751, loss_mean=0.477][A
+Train step of epoch 0:   0%|          | 11/6434 [01:50<14:32:25,  8.15s/it, gpt_loss=0.728, loss_mean=0.502][A
+Train step of epoch 0:   0%|          | 12/6434 [01:50<14:46:39,  8.28s/it, gpt_loss=0.728, loss_mean=0.502][A
+Train step of epoch 0:   0%|          | 12/6434 [01:57<14:46:39,  8.28s/it, gpt_loss=0.68, loss_mean=0.52]  [A
+Train step of epoch 0:   0%|          | 13/6434 [01:57<14:04:52,  7.89s/it, gpt_loss=0.68, loss_mean=0.52][A
+Train step of epoch 0:   0%|          | 13/6434 [02:06<14:04:52,  7.89s/it, gpt_loss=0.838, loss_mean=0.552][A
+Train step of epoch 0:   0%|          | 14/6434 [02:06<14:46:01,  8.28s/it, gpt_loss=0.838, loss_mean=0.552][A
+Train step of epoch 0:   0%|          | 14/6434 [02:15<14:46:01,  8.28s/it, gpt_loss=0.623, loss_mean=0.559][A
+Train step of epoch 0:   0%|          | 15/6434 [02:15<15:12:50,  8.53s/it, gpt_loss=0.623, loss_mean=0.559][A
+Train step of epoch 0:   0%|          | 15/6434 [02:25<15:12:50,  8.53s/it, gpt_loss=0.662, loss_mean=0.569][A
+Train step of epoch 0:   0%|          | 16/6434 [02:25<15:50:31,  8.89s/it, gpt_loss=0.662, loss_mean=0.569][A
+Train step of epoch 0:   0%|          | 16/6434 [02:34<15:50:31,  8.89s/it, gpt_loss=0.616, loss_mean=0.574][A
+Train step of epoch 0:   0%|          | 17/6434 [02:34<15:55:35,  8.93s/it, gpt_loss=0.616, loss_mean=0.574][A
+Train step of epoch 0:   0%|          | 17/6434 [02:43<15:55:35,  8.93s/it, gpt_loss=0.816, loss_mean=0.598][A
+Train step of epoch 0:   0%|          | 18/6434 [02:43<15:49:49,  8.88s/it, gpt_loss=0.816, loss_mean=0.598][A
+Train step of epoch 0:   0%|          | 18/6434 [02:51<15:49:49,  8.88s/it, gpt_loss=0.62, loss_mean=0.6]   [A
+Train step of epoch 0:   0%|          | 19/6434 [02:51<15:27:47,  8.68s/it, gpt_loss=0.62, loss_mean=0.6][A
+[LID Router Debug] Step: 20
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [0, 4, 3, 1, 3, 9, 1, 5, 3, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:   0%|          | 19/6434 [02:59<15:27:47,  8.68s/it, gpt_loss=0.831, loss_mean=0.623][A
+Train step of epoch 0:   0%|          | 20/6434 [02:59<15:11:36,  8.53s/it, gpt_loss=0.831, loss_mean=0.623][A
+Train step of epoch 0:   0%|          | 20/6434 [03:08<15:11:36,  8.53s/it, gpt_loss=0.756, loss_mean=0.637][A
+Train step of epoch 0:   0%|          | 21/6434 [03:08<15:22:12,  8.63s/it, gpt_loss=0.756, loss_mean=0.637][A
+Train step of epoch 0:   0%|          | 21/6434 [03:16<15:22:12,  8.63s/it, gpt_loss=0.75, loss_mean=0.648] [A
+Train step of epoch 0:   0%|          | 22/6434 [03:16<15:02:28,  8.44s/it, gpt_loss=0.75, loss_mean=0.648][A
+Train step of epoch 0:   0%|          | 22/6434 [03:25<15:02:28,  8.44s/it, gpt_loss=0.631, loss_mean=0.646][A
+Train step of epoch 0:   0%|          | 23/6434 [03:25<15:12:41,  8.54s/it, gpt_loss=0.631, loss_mean=0.646][A
+Train step of epoch 0:   0%|          | 23/6434 [03:34<15:12:41,  8.54s/it, gpt_loss=0.721, loss_mean=0.654][A
+Train step of epoch 0:   0%|          | 24/6434 [03:34<15:17:01,  8.58s/it, gpt_loss=0.721, loss_mean=0.654][A
+Train step of epoch 0:   0%|          | 24/6434 [03:41<15:17:01,  8.58s/it, gpt_loss=0.53, loss_mean=0.641] [A
+Train step of epoch 0:   0%|          | 25/6434 [03:41<14:52:02,  8.35s/it, gpt_loss=0.53, loss_mean=0.641][A
+Train step of epoch 0:   0%|          | 25/6434 [03:51<14:52:02,  8.35s/it, gpt_loss=0.645, loss_mean=0.642][A
+Train step of epoch 0:   0%|          | 26/6434 [03:51<15:18:06,  8.60s/it, gpt_loss=0.645, loss_mean=0.642][A
+Train step of epoch 0:   0%|          | 26/6434 [03:59<15:18:06,  8.60s/it, gpt_loss=0.992, loss_mean=0.677][A
+Train step of epoch 0:   0%|          | 27/6434 [03:59<15:19:29,  8.61s/it, gpt_loss=0.992, loss_mean=0.677][A
+Train step of epoch 0:   0%|          | 27/6434 [04:08<15:19:29,  8.61s/it, gpt_loss=0.824, loss_mean=0.691][A
+Train step of epoch 0:   0%|          | 28/6434 [04:08<15:16:30,  8.58s/it, gpt_loss=0.824, loss_mean=0.691][A
+Train step of epoch 0:   0%|          | 28/6434 [04:15<15:16:30,  8.58s/it, gpt_loss=0.713, loss_mean=0.694][A
+Train step of epoch 0:   0%|          | 29/6434 [04:15<14:42:17,  8.27s/it, gpt_loss=0.713, loss_mean=0.694][A
+[LID Router Debug] Step: 30
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [3, 4, 3, 5, 9, 5, 5, 1, 1, 1]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:   0%|          | 29/6434 [04:25<14:42:17,  8.27s/it, gpt_loss=0.662, loss_mean=0.69] [A
+Train step of epoch 0:   0%|          | 30/6434 [04:25<15:15:24,  8.58s/it, gpt_loss=0.662, loss_mean=0.69][A
+Train step of epoch 0:   0%|          | 30/6434 [04:33<15:15:24,  8.58s/it, gpt_loss=0.806, loss_mean=0.702][A
+Train step of epoch 0:   0%|          | 31/6434 [04:33<14:59:25,  8.43s/it, gpt_loss=0.806, loss_mean=0.702][A
+Train step of epoch 0:   0%|          | 31/6434 [04:41<14:59:25,  8.43s/it, gpt_loss=0.834, loss_mean=0.715][A
+Train step of epoch 0:   0%|          | 32/6434 [04:41<14:48:28,  8.33s/it, gpt_loss=0.834, loss_mean=0.715][A
+Train step of epoch 0:   0%|          | 32/6434 [04:50<14:48:28,  8.33s/it, gpt_loss=0.649, loss_mean=0.709][A
+Train step of epoch 0:   1%|          | 33/6434 [04:50<15:23:05,  8.65s/it, gpt_loss=0.649, loss_mean=0.709][A
+Train step of epoch 0:   1%|          | 33/6434 [04:58<15:23:05,  8.65s/it, gpt_loss=0.673, loss_mean=0.705][A
+Train step of epoch 0:   1%|          | 34/6434 [04:58<14:45:36,  8.30s/it, gpt_loss=0.673, loss_mean=0.705][A
+Train step of epoch 0:   1%|          | 34/6434 [05:07<14:45:36,  8.30s/it, gpt_loss=0.79, loss_mean=0.713] [A
+Train step of epoch 0:   1%|          | 35/6434 [05:07<15:18:21,  8.61s/it, gpt_loss=0.79, loss_mean=0.713][A
+Train step of epoch 0:   1%|          | 35/6434 [05:15<15:18:21,  8.61s/it, gpt_loss=0.745, loss_mean=0.717][A
+Train step of epoch 0:   1%|          | 36/6434 [05:15<15:10:07,  8.54s/it, gpt_loss=0.745, loss_mean=0.717][A
+Train step of epoch 0:   1%|          | 36/6434 [05:24<15:10:07,  8.54s/it, gpt_loss=0.78, loss_mean=0.723] [A
+Train step of epoch 0:   1%|          | 37/6434 [05:24<15:23:44,  8.66s/it, gpt_loss=0.78, loss_mean=0.723][A
+Train step of epoch 0:   1%|          | 37/6434 [05:32<15:23:44,  8.66s/it, gpt_loss=0.607, loss_mean=0.711][A
+Train step of epoch 0:   1%|          | 38/6434 [05:32<14:56:02,  8.41s/it, gpt_loss=0.607, loss_mean=0.711][A
+Train step of epoch 0:   1%|          | 38/6434 [05:40<14:56:02,  8.41s/it, gpt_loss=0.604, loss_mean=0.701][A
+Train step of epoch 0:   1%|          | 39/6434 [05:40<14:44:07,  8.30s/it, gpt_loss=0.604, loss_mean=0.701][A
+[LID Router Debug] Step: 40
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [3, 1, 3, 5, 0, 2, 4, 1, 4, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:   1%|          | 39/6434 [05:49<14:44:07,  8.30s/it, gpt_loss=0.548, loss_mean=0.685][A
+Train step of epoch 0:   1%|          | 40/6434 [05:49<15:03:00,  8.47s/it, gpt_loss=0.548, loss_mean=0.685][A
+Train step of epoch 0:   1%|          | 40/6434 [05:57<15:03:00,  8.47s/it, gpt_loss=0.683, loss_mean=0.685][A
+Train step of epoch 0:   1%|          | 41/6434 [05:57<15:02:20,  8.47s/it, gpt_loss=0.683, loss_mean=0.685][A
+Train step of epoch 0:   1%|          | 41/6434 [06:06<15:02:20,  8.47s/it, gpt_loss=0.671, loss_mean=0.684][A
+Train step of epoch 0:   1%|          | 42/6434 [06:06<15:09:38,  8.54s/it, gpt_loss=0.671, loss_mean=0.684][A
+Train step of epoch 0:   1%|          | 42/6434 [06:15<15:09:38,  8.54s/it, gpt_loss=0.711, loss_mean=0.686][A
+Train step of epoch 0:   1%|          | 43/6434 [06:15<15:31:23,  8.74s/it, gpt_loss=0.711, loss_mean=0.686][A
+Train step of epoch 0:   1%|          | 43/6434 [06:24<15:31:23,  8.74s/it, gpt_loss=0.49, loss_mean=0.667] [A
+Train step of epoch 0:   1%|          | 44/6434 [06:24<15:23:34,  8.67s/it, gpt_loss=0.49, loss_mean=0.667][A
+Train step of epoch 0:   1%|          | 44/6434 [06:32<15:23:34,  8.67s/it, gpt_loss=0.472, loss_mean=0.647][A
+Train step of epoch 0:   1%|          | 45/6434 [06:32<15:13:21,  8.58s/it, gpt_loss=0.472, loss_mean=0.647][A
+Train step of epoch 0:   1%|          | 45/6434 [06:41<15:13:21,  8.58s/it, gpt_loss=0.664, loss_mean=0.649][A
+Train step of epoch 0:   1%|          | 46/6434 [06:41<15:24:15,  8.68s/it, gpt_loss=0.664, loss_mean=0.649][A
+Train step of epoch 0:   1%|          | 46/6434 [06:49<15:24:15,  8.68s/it, gpt_loss=0.509, loss_mean=0.635][A
+Train step of epoch 0:   1%|          | 47/6434 [06:49<15:00:51,  8.46s/it, gpt_loss=0.509, loss_mean=0.635][A
+Train step of epoch 0:   1%|          | 47/6434 [06:57<15:00:51,  8.46s/it, gpt_loss=0.804, loss_mean=0.652][A
+Train step of epoch 0:   1%|          | 48/6434 [06:57<14:45:41,  8.32s/it, gpt_loss=0.804, loss_mean=0.652][A
+Train step of epoch 0:   1%|          | 48/6434 [07:06<14:45:41,  8.32s/it, gpt_loss=0.705, loss_mean=0.657][A
+Train step of epoch 0:   1%|          | 49/6434 [07:06<14:58:49,  8.45s/it, gpt_loss=0.705, loss_mean=0.657][A
+[LID Router Debug] Step: 50
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [9, 1, 2, 9, 4, 0, 5, 2, 0, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   1%|          | 49/6434 [07:13<14:58:49,  8.45s/it, gpt_loss=0.683, loss_mean=0.66] [A
+Train step of epoch 0:   1%|          | 50/6434 [07:13<14:23:50,  8.12s/it, gpt_loss=0.683, loss_mean=0.66][A
+Train step of epoch 0:   1%|          | 50/6434 [07:22<14:23:50,  8.12s/it, gpt_loss=0.827, loss_mean=0.676][A
+Train step of epoch 0:   1%|          | 51/6434 [07:22<14:56:16,  8.42s/it, gpt_loss=0.827, loss_mean=0.676][A
+Train step of epoch 0:   1%|          | 51/6434 [07:29<14:56:16,  8.42s/it, gpt_loss=0.559, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 52/6434 [07:29<14:16:08,  8.05s/it, gpt_loss=0.559, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 52/6434 [07:38<14:16:08,  8.05s/it, gpt_loss=0.613, loss_mean=0.66] [A
+Train step of epoch 0:   1%|          | 53/6434 [07:38<14:29:14,  8.17s/it, gpt_loss=0.613, loss_mean=0.66][A
+Train step of epoch 0:   1%|          | 53/6434 [07:46<14:29:14,  8.17s/it, gpt_loss=0.709, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 54/6434 [07:46<14:10:37,  8.00s/it, gpt_loss=0.709, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 54/6434 [07:53<14:10:37,  8.00s/it, gpt_loss=0.701, loss_mean=0.668][A
+Train step of epoch 0:   1%|          | 55/6434 [07:53<14:00:15,  7.90s/it, gpt_loss=0.701, loss_mean=0.668][A
+Train step of epoch 0:   1%|          | 55/6434 [08:01<14:00:15,  7.90s/it, gpt_loss=0.786, loss_mean=0.68] [A
+Train step of epoch 0:   1%|          | 56/6434 [08:01<14:06:38,  7.96s/it, gpt_loss=0.786, loss_mean=0.68][A
+Train step of epoch 0:   1%|          | 56/6434 [08:10<14:06:38,  7.96s/it, gpt_loss=0.531, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 57/6434 [08:10<14:36:45,  8.25s/it, gpt_loss=0.531, loss_mean=0.665][A
+Train step of epoch 0:   1%|          | 57/6434 [08:18<14:36:45,  8.25s/it, gpt_loss=0.723, loss_mean=0.671][A
+Train step of epoch 0:   1%|          | 58/6434 [08:18<14:32:32,  8.21s/it, gpt_loss=0.723, loss_mean=0.671][A
+Train step of epoch 0:   1%|          | 58/6434 [08:27<14:32:32,  8.21s/it, gpt_loss=0.656, loss_mean=0.669][A
+Train step of epoch 0:   1%|          | 59/6434 [08:27<14:50:14,  8.38s/it, gpt_loss=0.656, loss_mean=0.669][A
+[LID Router Debug] Step: 60
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [6, 2, 1, 5, 4, 4, 5, 3, 5, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:   1%|          | 59/6434 [08:36<14:50:14,  8.38s/it, gpt_loss=0.506, loss_mean=0.653][A
+Train step of epoch 0:   1%|          | 60/6434 [08:36<15:10:54,  8.57s/it, gpt_loss=0.506, loss_mean=0.653][A
+Train step of epoch 0:   1%|          | 60/6434 [08:44<15:10:54,  8.57s/it, gpt_loss=0.629, loss_mean=0.651][A
+Train step of epoch 0:   1%|          | 61/6434 [08:44<14:57:31,  8.45s/it, gpt_loss=0.629, loss_mean=0.651][A
+Train step of epoch 0:   1%|          | 61/6434 [08:52<14:57:31,  8.45s/it, gpt_loss=0.704, loss_mean=0.656][A
+Train step of epoch 0:   1%|          | 62/6434 [08:52<14:44:24,  8.33s/it, gpt_loss=0.704, loss_mean=0.656][A
+Train step of epoch 0:   1%|          | 62/6434 [09:00<14:44:24,  8.33s/it, gpt_loss=0.49, loss_mean=0.639] [A
+Train step of epoch 0:   1%|          | 63/6434 [09:00<14:14:28,  8.05s/it, gpt_loss=0.49, loss_mean=0.639][A
+Train step of epoch 0:   1%|          | 63/6434 [09:09<14:14:28,  8.05s/it, gpt_loss=0.569, loss_mean=0.632][A
+Train step of epoch 0:   1%|          | 64/6434 [09:09<14:59:01,  8.47s/it, gpt_loss=0.569, loss_mean=0.632][A
+Train step of epoch 0:   1%|          | 64/6434 [09:17<14:59:01,  8.47s/it, gpt_loss=0.544, loss_mean=0.624][A
+Train step of epoch 0:   1%|          | 65/6434 [09:17<14:42:31,  8.31s/it, gpt_loss=0.544, loss_mean=0.624][A
+Train step of epoch 0:   1%|          | 65/6434 [09:26<14:42:31,  8.31s/it, gpt_loss=0.61, loss_mean=0.622] [A
+Train step of epoch 0:   1%|          | 66/6434 [09:26<14:53:25,  8.42s/it, gpt_loss=0.61, loss_mean=0.622][A
+Train step of epoch 0:   1%|          | 66/6434 [09:34<14:53:25,  8.42s/it, gpt_loss=0.917, loss_mean=0.652][A
+Train step of epoch 0:   1%|          | 67/6434 [09:34<14:38:44,  8.28s/it, gpt_loss=0.917, loss_mean=0.652][A
+Train step of epoch 0:   1%|          | 67/6434 [09:41<14:38:44,  8.28s/it, gpt_loss=0.726, loss_mean=0.659][A
+Train step of epoch 0:   1%|          | 68/6434 [09:41<14:18:02,  8.09s/it, gpt_loss=0.726, loss_mean=0.659][A
+Train step of epoch 0:   1%|          | 68/6434 [09:50<14:18:02,  8.09s/it, gpt_loss=0.583, loss_mean=0.651][A
+Train step of epoch 0:   1%|          | 69/6434 [09:50<14:48:55,  8.38s/it, gpt_loss=0.583, loss_mean=0.651][A
+[LID Router Debug] Step: 70
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [1, 4, 5, 3, 3, 9, 0, 2, 4, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   1%|          | 69/6434 [10:00<14:48:55,  8.38s/it, gpt_loss=0.587, loss_mean=0.645][A
+Train step of epoch 0:   1%|          | 70/6434 [10:00<15:14:59,  8.63s/it, gpt_loss=0.587, loss_mean=0.645][A
+Train step of epoch 0:   1%|          | 70/6434 [10:07<15:14:59,  8.63s/it, gpt_loss=0.55, loss_mean=0.636] [A
+Train step of epoch 0:   1%|          | 71/6434 [10:07<14:33:49,  8.24s/it, gpt_loss=0.55, loss_mean=0.636][A
+Train step of epoch 0:   1%|          | 71/6434 [10:15<14:33:49,  8.24s/it, gpt_loss=0.558, loss_mean=0.628][A
+Train step of epoch 0:   1%|          | 72/6434 [10:15<14:38:19,  8.28s/it, gpt_loss=0.558, loss_mean=0.628][A
+Train step of epoch 0:   1%|          | 72/6434 [10:27<14:38:19,  8.28s/it, gpt_loss=0.383, loss_mean=0.603][A
+Train step of epoch 0:   1%|          | 73/6434 [10:27<16:26:49,  9.31s/it, gpt_loss=0.383, loss_mean=0.603][A
+Train step of epoch 0:   1%|          | 73/6434 [10:35<16:26:49,  9.31s/it, gpt_loss=0.541, loss_mean=0.597][A
+Train step of epoch 0:   1%|          | 74/6434 [10:35<15:52:37,  8.99s/it, gpt_loss=0.541, loss_mean=0.597][A
+Train step of epoch 0:   1%|          | 74/6434 [10:44<15:52:37,  8.99s/it, gpt_loss=0.509, loss_mean=0.588][A
+Train step of epoch 0:   1%|          | 75/6434 [10:44<15:51:21,  8.98s/it, gpt_loss=0.509, loss_mean=0.588][A
+Train step of epoch 0:   1%|          | 75/6434 [10:54<15:51:21,  8.98s/it, gpt_loss=0.554, loss_mean=0.585][A
+Train step of epoch 0:   1%|          | 76/6434 [10:54<16:01:13,  9.07s/it, gpt_loss=0.554, loss_mean=0.585][A
+Train step of epoch 0:   1%|          | 76/6434 [11:02<16:01:13,  9.07s/it, gpt_loss=0.519, loss_mean=0.578][A
+Train step of epoch 0:   1%|          | 77/6434 [11:02<15:53:30,  9.00s/it, gpt_loss=0.519, loss_mean=0.578][A
+Train step of epoch 0:   1%|          | 77/6434 [11:11<15:53:30,  9.00s/it, gpt_loss=0.589, loss_mean=0.579][A
+Train step of epoch 0:   1%|          | 78/6434 [11:11<15:28:45,  8.77s/it, gpt_loss=0.589, loss_mean=0.579][A
+Train step of epoch 0:   1%|          | 78/6434 [11:19<15:28:45,  8.77s/it, gpt_loss=0.43, loss_mean=0.564] [A
+Train step of epoch 0:   1%|          | 79/6434 [11:19<15:17:01,  8.66s/it, gpt_loss=0.43, loss_mean=0.564][A
+[LID Router Debug] Step: 80
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [1, 0, 6, 5, 4, 10, 5, 2, 0, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 10}
+
+Train step of epoch 0:   1%|          | 79/6434 [11:28<15:17:01,  8.66s/it, gpt_loss=0.452, loss_mean=0.553][A
+Train step of epoch 0:   1%|          | 80/6434 [11:28<15:35:04,  8.83s/it, gpt_loss=0.452, loss_mean=0.553][A
+Train step of epoch 0:   1%|          | 80/6434 [11:36<15:35:04,  8.83s/it, gpt_loss=0.595, loss_mean=0.557][A
+Train step of epoch 0:   1%|▏         | 81/6434 [11:36<15:06:40,  8.56s/it, gpt_loss=0.595, loss_mean=0.557][A
+Train step of epoch 0:   1%|▏         | 81/6434 [11:45<15:06:40,  8.56s/it, gpt_loss=0.639, loss_mean=0.565][A
+Train step of epoch 0:   1%|▏         | 82/6434 [11:45<15:10:24,  8.60s/it, gpt_loss=0.639, loss_mean=0.565][A
+Train step of epoch 0:   1%|▏         | 82/6434 [11:53<15:10:24,  8.60s/it, gpt_loss=0.503, loss_mean=0.559][A
+Train step of epoch 0:   1%|▏         | 83/6434 [11:53<14:57:49,  8.48s/it, gpt_loss=0.503, loss_mean=0.559][A
+Train step of epoch 0:   1%|▏         | 83/6434 [12:02<14:57:49,  8.48s/it, gpt_loss=0.548, loss_mean=0.558][A
+Train step of epoch 0:   1%|▏         | 84/6434 [12:02<15:19:22,  8.69s/it, gpt_loss=0.548, loss_mean=0.558][A
+Train step of epoch 0:   1%|▏         | 84/6434 [12:12<15:19:22,  8.69s/it, gpt_loss=0.576, loss_mean=0.56] [A
+Train step of epoch 0:   1%|▏         | 85/6434 [12:12<15:42:18,  8.91s/it, gpt_loss=0.576, loss_mean=0.56][A
+Train step of epoch 0:   1%|▏         | 85/6434 [12:21<15:42:18,  8.91s/it, gpt_loss=0.495, loss_mean=0.553][A
+Train step of epoch 0:   1%|▏         | 86/6434 [12:21<15:39:14,  8.88s/it, gpt_loss=0.495, loss_mean=0.553][A
+Train step of epoch 0:   1%|▏         | 86/6434 [12:29<15:39:14,  8.88s/it, gpt_loss=0.614, loss_mean=0.559][A
+Train step of epoch 0:   1%|▏         | 87/6434 [12:29<15:24:35,  8.74s/it, gpt_loss=0.614, loss_mean=0.559][A
+Train step of epoch 0:   1%|▏         | 87/6434 [12:37<15:24:35,  8.74s/it, gpt_loss=0.599, loss_mean=0.563][A
+Train step of epoch 0:   1%|▏         | 88/6434 [12:37<15:14:42,  8.65s/it, gpt_loss=0.599, loss_mean=0.563][A
+Train step of epoch 0:   1%|▏         | 88/6434 [12:45<15:14:42,  8.65s/it, gpt_loss=0.492, loss_mean=0.556][A
+Train step of epoch 0:   1%|▏         | 89/6434 [12:45<14:34:53,  8.27s/it, gpt_loss=0.492, loss_mean=0.556][A
+[LID Router Debug] Step: 90
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [5, 9, 3, 4, 2, 5, 1, 6, 9, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   1%|▏         | 89/6434 [12:54<14:34:53,  8.27s/it, gpt_loss=0.51, loss_mean=0.552] [A
+Train step of epoch 0:   1%|▏         | 90/6434 [12:54<14:58:04,  8.49s/it, gpt_loss=0.51, loss_mean=0.552][A
+Train step of epoch 0:   1%|▏         | 90/6434 [13:02<14:58:04,  8.49s/it, gpt_loss=0.642, loss_mean=0.561][A
+Train step of epoch 0:   1%|▏         | 91/6434 [13:02<14:58:19,  8.50s/it, gpt_loss=0.642, loss_mean=0.561][A
+Train step of epoch 0:   1%|▏         | 91/6434 [13:12<14:58:19,  8.50s/it, gpt_loss=0.509, loss_mean=0.556][A
+Train step of epoch 0:   1%|▏         | 92/6434 [13:12<15:23:47,  8.74s/it, gpt_loss=0.509, loss_mean=0.556][A
+Train step of epoch 0:   1%|▏         | 92/6434 [13:19<15:23:47,  8.74s/it, gpt_loss=0.576, loss_mean=0.558][A
+Train step of epoch 0:   1%|▏         | 93/6434 [13:19<14:55:20,  8.47s/it, gpt_loss=0.576, loss_mean=0.558][A
+Train step of epoch 0:   1%|▏         | 93/6434 [13:28<14:55:20,  8.47s/it, gpt_loss=0.483, loss_mean=0.55] [A
+Train step of epoch 0:   1%|▏         | 94/6434 [13:28<14:59:52,  8.52s/it, gpt_loss=0.483, loss_mean=0.55][A
+Train step of epoch 0:   1%|▏         | 94/6434 [13:36<14:59:52,  8.52s/it, gpt_loss=0.514, loss_mean=0.547][A
+Train step of epoch 0:   1%|▏         | 95/6434 [13:36<14:30:09,  8.24s/it, gpt_loss=0.514, loss_mean=0.547][A
+Train step of epoch 0:   1%|▏         | 95/6434 [13:44<14:30:09,  8.24s/it, gpt_loss=0.559, loss_mean=0.548][A
+Train step of epoch 0:   1%|▏         | 96/6434 [13:44<14:19:19,  8.13s/it, gpt_loss=0.559, loss_mean=0.548][A
+Train step of epoch 0:   1%|▏         | 96/6434 [13:53<14:19:19,  8.13s/it, gpt_loss=0.595, loss_mean=0.552][A
+Train step of epoch 0:   2%|▏         | 97/6434 [13:53<15:01:07,  8.53s/it, gpt_loss=0.595, loss_mean=0.552][A
+Train step of epoch 0:   2%|▏         | 97/6434 [14:01<15:01:07,  8.53s/it, gpt_loss=0.587, loss_mean=0.556][A
+Train step of epoch 0:   2%|▏         | 98/6434 [14:01<14:46:18,  8.39s/it, gpt_loss=0.587, loss_mean=0.556][A
+Train step of epoch 0:   2%|▏         | 98/6434 [14:10<14:46:18,  8.39s/it, gpt_loss=0.574, loss_mean=0.558][A
+Train step of epoch 0:   2%|▏         | 99/6434 [14:10<14:58:56,  8.51s/it, gpt_loss=0.574, loss_mean=0.558][A
+[LID Router Debug] Step: 100
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [9, 6, 6, 5, 5, 3, 4, 3, 0, 4]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   2%|▏         | 99/6434 [14:19<14:58:56,  8.51s/it, gpt_loss=0.529, loss_mean=0.555][A
+Train step of epoch 0:   2%|▏         | 100/6434 [14:19<15:12:15,  8.64s/it, gpt_loss=0.529, loss_mean=0.555][A
+Train step of epoch 0:   2%|▏         | 100/6434 [14:27<15:12:15,  8.64s/it, gpt_loss=0.548, loss_mean=0.554][A
+Train step of epoch 0:   2%|▏         | 101/6434 [14:27<14:46:50,  8.40s/it, gpt_loss=0.548, loss_mean=0.554][A
+Train step of epoch 0:   2%|▏         | 101/6434 [14:34<14:46:50,  8.40s/it, gpt_loss=0.651, loss_mean=0.564][A
+Train step of epoch 0:   2%|▏         | 102/6434 [14:34<14:20:03,  8.15s/it, gpt_loss=0.651, loss_mean=0.564][A
+Train step of epoch 0:   2%|▏         | 102/6434 [14:43<14:20:03,  8.15s/it, gpt_loss=0.5, loss_mean=0.557]  [A
+Train step of epoch 0:   2%|▏         | 103/6434 [14:43<14:37:19,  8.31s/it, gpt_loss=0.5, loss_mean=0.557][A
+Train step of epoch 0:   2%|▏         | 103/6434 [14:52<14:37:19,  8.31s/it, gpt_loss=0.441, loss_mean=0.546][A
+Train step of epoch 0:   2%|▏         | 104/6434 [14:52<14:47:32,  8.41s/it, gpt_loss=0.441, loss_mean=0.546][A
+Train step of epoch 0:   2%|▏         | 104/6434 [15:00<14:47:32,  8.41s/it, gpt_loss=0.559, loss_mean=0.547][A
+Train step of epoch 0:   2%|▏         | 105/6434 [15:00<14:54:29,  8.48s/it, gpt_loss=0.559, loss_mean=0.547][A
+Train step of epoch 0:   2%|▏         | 105/6434 [15:09<14:54:29,  8.48s/it, gpt_loss=0.612, loss_mean=0.554][A
+Train step of epoch 0:   2%|▏         | 106/6434 [15:09<14:53:35,  8.47s/it, gpt_loss=0.612, loss_mean=0.554][A
+Train step of epoch 0:   2%|▏         | 106/6434 [15:17<14:53:35,  8.47s/it, gpt_loss=0.427, loss_mean=0.541][A
+Train step of epoch 0:   2%|▏         | 107/6434 [15:17<14:55:26,  8.49s/it, gpt_loss=0.427, loss_mean=0.541][A
+Train step of epoch 0:   2%|▏         | 107/6434 [15:25<14:55:26,  8.49s/it, gpt_loss=0.492, loss_mean=0.536][A
+Train step of epoch 0:   2%|▏         | 108/6434 [15:25<14:41:20,  8.36s/it, gpt_loss=0.492, loss_mean=0.536][A
+Train step of epoch 0:   2%|▏         | 108/6434 [15:34<14:41:20,  8.36s/it, gpt_loss=0.514, loss_mean=0.534][A
+Train step of epoch 0:   2%|▏         | 109/6434 [15:34<14:41:00,  8.36s/it, gpt_loss=0.514, loss_mean=0.534][A
+[LID Router Debug] Step: 110
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [3, 3, 4, 1, 0, 5, 9, 1, 10, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9, 10}
+
+Train step of epoch 0:   2%|▏         | 109/6434 [15:44<14:41:00,  8.36s/it, gpt_loss=0.457, loss_mean=0.526][A
+Train step of epoch 0:   2%|▏         | 110/6434 [15:44<15:46:34,  8.98s/it, gpt_loss=0.457, loss_mean=0.526][A
+Train step of epoch 0:   2%|▏         | 110/6434 [15:51<15:46:34,  8.98s/it, gpt_loss=0.48, loss_mean=0.522] [A
+Train step of epoch 0:   2%|▏         | 111/6434 [15:51<14:53:24,  8.48s/it, gpt_loss=0.48, loss_mean=0.522][A
+Train step of epoch 0:   2%|▏         | 111/6434 [16:00<14:53:24,  8.48s/it, gpt_loss=0.423, loss_mean=0.512][A
+Train step of epoch 0:   2%|▏         | 112/6434 [16:00<14:59:25,  8.54s/it, gpt_loss=0.423, loss_mean=0.512][A
+Train step of epoch 0:   2%|▏         | 112/6434 [16:08<14:59:25,  8.54s/it, gpt_loss=0.57, loss_mean=0.518] [A
+Train step of epoch 0:   2%|▏         | 113/6434 [16:08<14:53:06,  8.48s/it, gpt_loss=0.57, loss_mean=0.518][A
+Train step of epoch 0:   2%|▏         | 113/6434 [16:16<14:53:06,  8.48s/it, gpt_loss=0.455, loss_mean=0.511][A
+Train step of epoch 0:   2%|▏         | 114/6434 [16:16<14:36:44,  8.32s/it, gpt_loss=0.455, loss_mean=0.511][A
+Train step of epoch 0:   2%|▏         | 114/6434 [16:25<14:36:44,  8.32s/it, gpt_loss=0.416, loss_mean=0.502][A
+Train step of epoch 0:   2%|▏         | 115/6434 [16:25<14:55:53,  8.51s/it, gpt_loss=0.416, loss_mean=0.502][A
+Train step of epoch 0:   2%|▏         | 115/6434 [16:33<14:55:53,  8.51s/it, gpt_loss=0.545, loss_mean=0.506][A
+Train step of epoch 0:   2%|▏         | 116/6434 [16:33<14:40:22,  8.36s/it, gpt_loss=0.545, loss_mean=0.506][A
+Train step of epoch 0:   2%|▏         | 116/6434 [16:42<14:40:22,  8.36s/it, gpt_loss=0.585, loss_mean=0.514][A
+Train step of epoch 0:   2%|▏         | 117/6434 [16:42<14:52:39,  8.48s/it, gpt_loss=0.585, loss_mean=0.514][A
+Train step of epoch 0:   2%|▏         | 117/6434 [16:50<14:52:39,  8.48s/it, gpt_loss=0.553, loss_mean=0.518][A
+Train step of epoch 0:   2%|▏         | 118/6434 [16:50<14:34:03,  8.30s/it, gpt_loss=0.553, loss_mean=0.518][A
+Train step of epoch 0:   2%|▏         | 118/6434 [16:58<14:34:03,  8.30s/it, gpt_loss=0.464, loss_mean=0.513][A
+Train step of epoch 0:   2%|▏         | 119/6434 [16:58<14:23:11,  8.20s/it, gpt_loss=0.464, loss_mean=0.513][A
+[LID Router Debug] Step: 120
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [4, 2, 4, 6, 5, 5, 3, 9, 4, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   2%|▏         | 119/6434 [17:06<14:23:11,  8.20s/it, gpt_loss=0.485, loss_mean=0.51] [A
+Train step of epoch 0:   2%|▏         | 120/6434 [17:06<14:11:14,  8.09s/it, gpt_loss=0.485, loss_mean=0.51][A
+Train step of epoch 0:   2%|▏         | 120/6434 [17:15<14:11:14,  8.09s/it, gpt_loss=0.389, loss_mean=0.498][A
+Train step of epoch 0:   2%|▏         | 121/6434 [17:15<14:36:57,  8.33s/it, gpt_loss=0.389, loss_mean=0.498][A
+Train step of epoch 0:   2%|▏         | 121/6434 [17:23<14:36:57,  8.33s/it, gpt_loss=0.44, loss_mean=0.492] [A
+Train step of epoch 0:   2%|▏         | 122/6434 [17:23<14:50:06,  8.46s/it, gpt_loss=0.44, loss_mean=0.492][A
+Train step of epoch 0:   2%|▏         | 122/6434 [17:31<14:50:06,  8.46s/it, gpt_loss=0.448, loss_mean=0.487][A
+Train step of epoch 0:   2%|▏         | 123/6434 [17:31<14:26:29,  8.24s/it, gpt_loss=0.448, loss_mean=0.487][A
+Train step of epoch 0:   2%|▏         | 123/6434 [17:40<14:26:29,  8.24s/it, gpt_loss=0.467, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 124/6434 [17:40<14:55:44,  8.52s/it, gpt_loss=0.467, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 124/6434 [17:49<14:55:44,  8.52s/it, gpt_loss=0.583, loss_mean=0.495][A
+Train step of epoch 0:   2%|▏         | 125/6434 [17:49<14:58:59,  8.55s/it, gpt_loss=0.583, loss_mean=0.495][A
+Train step of epoch 0:   2%|▏         | 125/6434 [17:57<14:58:59,  8.55s/it, gpt_loss=0.522, loss_mean=0.498][A
+Train step of epoch 0:   2%|▏         | 126/6434 [17:57<14:53:37,  8.50s/it, gpt_loss=0.522, loss_mean=0.498][A
+Train step of epoch 0:   2%|▏         | 126/6434 [18:05<14:53:37,  8.50s/it, gpt_loss=0.371, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 127/6434 [18:05<14:23:46,  8.22s/it, gpt_loss=0.371, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 127/6434 [18:13<14:23:46,  8.22s/it, gpt_loss=0.417, loss_mean=0.478][A
+Train step of epoch 0:   2%|▏         | 128/6434 [18:13<14:18:38,  8.17s/it, gpt_loss=0.417, loss_mean=0.478][A
+Train step of epoch 0:   2%|▏         | 128/6434 [18:20<14:18:38,  8.17s/it, gpt_loss=0.543, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 129/6434 [18:20<13:41:52,  7.82s/it, gpt_loss=0.543, loss_mean=0.485][A
+[LID Router Debug] Step: 130
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [6, 4, 2, 1, 0, 3, 1, 2, 0, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:   2%|▏         | 129/6434 [18:27<13:41:52,  7.82s/it, gpt_loss=0.404, loss_mean=0.477][A
+Train step of epoch 0:   2%|▏         | 130/6434 [18:27<13:24:45,  7.66s/it, gpt_loss=0.404, loss_mean=0.477][A
+Train step of epoch 0:   2%|▏         | 130/6434 [18:35<13:24:45,  7.66s/it, gpt_loss=0.563, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 131/6434 [18:35<13:18:50,  7.60s/it, gpt_loss=0.563, loss_mean=0.485][A
+Train step of epoch 0:   2%|▏         | 131/6434 [18:45<13:18:50,  7.60s/it, gpt_loss=0.372, loss_mean=0.474][A
+Train step of epoch 0:   2%|▏         | 132/6434 [18:45<14:43:25,  8.41s/it, gpt_loss=0.372, loss_mean=0.474][A
+Train step of epoch 0:   2%|▏         | 132/6434 [18:54<14:43:25,  8.41s/it, gpt_loss=0.351, loss_mean=0.462][A
+Train step of epoch 0:   2%|▏         | 133/6434 [18:54<15:03:13,  8.60s/it, gpt_loss=0.351, loss_mean=0.462][A
+Train step of epoch 0:   2%|▏         | 133/6434 [19:02<15:03:13,  8.60s/it, gpt_loss=0.556, loss_mean=0.471][A
+Train step of epoch 0:   2%|▏         | 134/6434 [19:02<14:32:28,  8.31s/it, gpt_loss=0.556, loss_mean=0.471][A
+Train step of epoch 0:   2%|▏         | 134/6434 [19:11<14:32:28,  8.31s/it, gpt_loss=0.418, loss_mean=0.466][A
+Train step of epoch 0:   2%|▏         | 135/6434 [19:11<15:17:26,  8.74s/it, gpt_loss=0.418, loss_mean=0.466][A
+Train step of epoch 0:   2%|▏         | 135/6434 [19:19<15:17:26,  8.74s/it, gpt_loss=0.505, loss_mean=0.47] [A
+Train step of epoch 0:   2%|▏         | 136/6434 [19:19<14:43:13,  8.41s/it, gpt_loss=0.505, loss_mean=0.47][A
+Train step of epoch 0:   2%|▏         | 136/6434 [19:27<14:43:13,  8.41s/it, gpt_loss=0.44, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 137/6434 [19:27<14:38:50,  8.37s/it, gpt_loss=0.44, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 137/6434 [19:36<14:38:50,  8.37s/it, gpt_loss=0.456, loss_mean=0.466][A
+Train step of epoch 0:   2%|▏         | 138/6434 [19:36<14:58:32,  8.56s/it, gpt_loss=0.456, loss_mean=0.466][A
+Train step of epoch 0:   2%|▏         | 138/6434 [19:45<14:58:32,  8.56s/it, gpt_loss=0.47, loss_mean=0.466] [A
+Train step of epoch 0:   2%|▏         | 139/6434 [19:45<14:55:42,  8.54s/it, gpt_loss=0.47, loss_mean=0.466][A
+[LID Router Debug] Step: 140
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [0, 9, 1, 1, 2, 2, 2, 5, 6, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:   2%|▏         | 139/6434 [19:52<14:55:42,  8.54s/it, gpt_loss=0.48, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 140/6434 [19:52<14:21:48,  8.22s/it, gpt_loss=0.48, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 140/6434 [20:01<14:21:48,  8.22s/it, gpt_loss=0.553, loss_mean=0.476][A
+Train step of epoch 0:   2%|▏         | 141/6434 [20:01<14:24:03,  8.24s/it, gpt_loss=0.553, loss_mean=0.476][A
+Train step of epoch 0:   2%|▏         | 141/6434 [20:08<14:24:03,  8.24s/it, gpt_loss=0.473, loss_mean=0.476][A
+Train step of epoch 0:   2%|▏         | 142/6434 [20:08<14:05:25,  8.06s/it, gpt_loss=0.473, loss_mean=0.476][A
+Train step of epoch 0:   2%|▏         | 142/6434 [20:18<14:05:25,  8.06s/it, gpt_loss=0.402, loss_mean=0.468][A
+Train step of epoch 0:   2%|▏         | 143/6434 [20:18<15:09:22,  8.67s/it, gpt_loss=0.402, loss_mean=0.468][A
+Train step of epoch 0:   2%|▏         | 143/6434 [20:26<15:09:22,  8.67s/it, gpt_loss=0.476, loss_mean=0.469][A
+Train step of epoch 0:   2%|▏         | 144/6434 [20:26<14:48:45,  8.48s/it, gpt_loss=0.476, loss_mean=0.469][A
+Train step of epoch 0:   2%|▏         | 144/6434 [20:35<14:48:45,  8.48s/it, gpt_loss=0.397, loss_mean=0.462][A
+Train step of epoch 0:   2%|▏         | 145/6434 [20:35<14:55:34,  8.54s/it, gpt_loss=0.397, loss_mean=0.462][A
+Train step of epoch 0:   2%|▏         | 145/6434 [20:42<14:55:34,  8.54s/it, gpt_loss=0.509, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 146/6434 [20:42<14:19:08,  8.20s/it, gpt_loss=0.509, loss_mean=0.467][A
+Train step of epoch 0:   2%|▏         | 146/6434 [20:50<14:19:08,  8.20s/it, gpt_loss=0.495, loss_mean=0.469][A
+Train step of epoch 0:   2%|▏         | 147/6434 [20:50<14:07:08,  8.08s/it, gpt_loss=0.495, loss_mean=0.469][A
+Train step of epoch 0:   2%|▏         | 147/6434 [20:59<14:07:08,  8.08s/it, gpt_loss=0.42, loss_mean=0.464] [A
+Train step of epoch 0:   2%|▏         | 148/6434 [20:59<14:28:03,  8.29s/it, gpt_loss=0.42, loss_mean=0.464][A
+Train step of epoch 0:   2%|▏         | 148/6434 [21:08<14:28:03,  8.29s/it, gpt_loss=0.464, loss_mean=0.464][A
+Train step of epoch 0:   2%|▏         | 149/6434 [21:08<14:49:43,  8.49s/it, gpt_loss=0.464, loss_mean=0.464][A
+[LID Router Debug] Step: 150
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [3, 1, 0, 0, 4, 1, 0, 6, 9, 4]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+
+Train step of epoch 0:   2%|▏         | 149/6434 [21:16<14:49:43,  8.49s/it, gpt_loss=0.35, loss_mean=0.453] [A
+Train step of epoch 0:   2%|▏         | 150/6434 [21:16<14:23:57,  8.25s/it, gpt_loss=0.35, loss_mean=0.453][A
+Train step of epoch 0:   2%|▏         | 150/6434 [21:24<14:23:57,  8.25s/it, gpt_loss=0.365, loss_mean=0.444][A
+Train step of epoch 0:   2%|▏         | 151/6434 [21:24<14:33:46,  8.34s/it, gpt_loss=0.365, loss_mean=0.444][A
+Train step of epoch 0:   2%|▏         | 151/6434 [21:33<14:33:46,  8.34s/it, gpt_loss=0.534, loss_mean=0.453][A
+Train step of epoch 0:   2%|▏         | 152/6434 [21:33<14:38:18,  8.39s/it, gpt_loss=0.534, loss_mean=0.453][A
+Train step of epoch 0:   2%|▏         | 152/6434 [21:41<14:38:18,  8.39s/it, gpt_loss=0.456, loss_mean=0.454][A
+Train step of epoch 0:   2%|▏         | 153/6434 [21:41<14:40:40,  8.41s/it, gpt_loss=0.456, loss_mean=0.454][A
+Train step of epoch 0:   2%|▏         | 153/6434 [21:50<14:40:40,  8.41s/it, gpt_loss=0.376, loss_mean=0.446][A
+Train step of epoch 0:   2%|▏         | 154/6434 [21:50<14:42:53,  8.44s/it, gpt_loss=0.376, loss_mean=0.446][A
+Train step of epoch 0:   2%|▏         | 154/6434 [21:59<14:42:53,  8.44s/it, gpt_loss=0.505, loss_mean=0.452][A
+Train step of epoch 0:   2%|▏         | 155/6434 [21:59<15:08:31,  8.68s/it, gpt_loss=0.505, loss_mean=0.452][A
+Train step of epoch 0:   2%|▏         | 155/6434 [22:06<15:08:31,  8.68s/it, gpt_loss=0.438, loss_mean=0.45] [A
+Train step of epoch 0:   2%|▏         | 156/6434 [22:06<14:25:03,  8.27s/it, gpt_loss=0.438, loss_mean=0.45][A
+Train step of epoch 0:   2%|▏         | 156/6434 [22:16<14:25:03,  8.27s/it, gpt_loss=0.43, loss_mean=0.448][A
+Train step of epoch 0:   2%|▏         | 157/6434 [22:16<15:00:23,  8.61s/it, gpt_loss=0.43, loss_mean=0.448][A
+Train step of epoch 0:   2%|▏         | 157/6434 [22:23<15:00:23,  8.61s/it, gpt_loss=0.422, loss_mean=0.446][A
+Train step of epoch 0:   2%|▏         | 158/6434 [22:23<14:33:04,  8.35s/it, gpt_loss=0.422, loss_mean=0.446][A
+Train step of epoch 0:   2%|▏         | 158/6434 [22:31<14:33:04,  8.35s/it, gpt_loss=0.606, loss_mean=0.462][A
+Train step of epoch 0:   2%|▏         | 159/6434 [22:31<14:26:57,  8.29s/it, gpt_loss=0.606, loss_mean=0.462][A
+[LID Router Debug] Step: 160
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [1, 9, 0, 1, 9, 1, 9, 0, 4, 3]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:   2%|▏         | 159/6434 [22:39<14:26:57,  8.29s/it, gpt_loss=0.45, loss_mean=0.461] [A
+Train step of epoch 0:   2%|▏         | 160/6434 [22:39<14:06:21,  8.09s/it, gpt_loss=0.45, loss_mean=0.461][A
+Train step of epoch 0:   2%|▏         | 160/6434 [22:48<14:06:21,  8.09s/it, gpt_loss=0.473, loss_mean=0.462][A
+Train step of epoch 0:   3%|▎         | 161/6434 [22:48<14:42:45,  8.44s/it, gpt_loss=0.473, loss_mean=0.462][A
+Train step of epoch 0:   3%|▎         | 161/6434 [22:58<14:42:45,  8.44s/it, gpt_loss=0.456, loss_mean=0.461][A
+Train step of epoch 0:   3%|▎         | 162/6434 [22:58<15:10:40,  8.71s/it, gpt_loss=0.456, loss_mean=0.461][A
+Train step of epoch 0:   3%|▎         | 162/6434 [23:06<15:10:40,  8.71s/it, gpt_loss=0.508, loss_mean=0.466][A
+Train step of epoch 0:   3%|▎         | 163/6434 [23:06<15:07:02,  8.68s/it, gpt_loss=0.508, loss_mean=0.466][A
+Train step of epoch 0:   3%|▎         | 163/6434 [23:15<15:07:02,  8.68s/it, gpt_loss=0.465, loss_mean=0.466][A
+Train step of epoch 0:   3%|▎         | 164/6434 [23:15<15:01:26,  8.63s/it, gpt_loss=0.465, loss_mean=0.466][A
+Train step of epoch 0:   3%|▎         | 164/6434 [23:23<15:01:26,  8.63s/it, gpt_loss=0.445, loss_mean=0.464][A
+Train step of epoch 0:   3%|▎         | 165/6434 [23:23<14:47:53,  8.50s/it, gpt_loss=0.445, loss_mean=0.464][A
+Train step of epoch 0:   3%|▎         | 165/6434 [23:32<14:47:53,  8.50s/it, gpt_loss=0.418, loss_mean=0.459][A
+Train step of epoch 0:   3%|▎         | 166/6434 [23:32<15:08:41,  8.70s/it, gpt_loss=0.418, loss_mean=0.459][A
+Train step of epoch 0:   3%|▎         | 166/6434 [23:41<15:08:41,  8.70s/it, gpt_loss=0.462, loss_mean=0.46] [A
+Train step of epoch 0:   3%|▎         | 167/6434 [23:41<15:09:03,  8.70s/it, gpt_loss=0.462, loss_mean=0.46][A
+Train step of epoch 0:   3%|▎         | 167/6434 [23:51<15:09:03,  8.70s/it, gpt_loss=0.363, loss_mean=0.45][A
+Train step of epoch 0:   3%|▎         | 168/6434 [23:51<15:52:21,  9.12s/it, gpt_loss=0.363, loss_mean=0.45][A
+Train step of epoch 0:   3%|▎         | 168/6434 [23:59<15:52:21,  9.12s/it, gpt_loss=0.421, loss_mean=0.447][A
+Train step of epoch 0:   3%|▎         | 169/6434 [23:59<15:15:50,  8.77s/it, gpt_loss=0.421, loss_mean=0.447][A
+[LID Router Debug] Step: 170
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [4, 0, 2, 10, 4, 6, 6, 3, 4, 9]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9, 10}
+
+Train step of epoch 0:   3%|▎         | 169/6434 [24:07<15:15:50,  8.77s/it, gpt_loss=0.572, loss_mean=0.46] [A
+Train step of epoch 0:   3%|▎         | 170/6434 [24:07<14:56:59,  8.59s/it, gpt_loss=0.572, loss_mean=0.46][A
+Train step of epoch 0:   3%|▎         | 170/6434 [24:15<14:56:59,  8.59s/it, gpt_loss=0.477, loss_mean=0.461][A
+Train step of epoch 0:   3%|▎         | 171/6434 [24:15<14:42:48,  8.46s/it, gpt_loss=0.477, loss_mean=0.461][A
+Train step of epoch 0:   3%|▎         | 171/6434 [24:24<14:42:48,  8.46s/it, gpt_loss=0.45, loss_mean=0.46]  [A
+Train step of epoch 0:   3%|▎         | 172/6434 [24:24<15:00:05,  8.62s/it, gpt_loss=0.45, loss_mean=0.46][A
+Train step of epoch 0:   3%|▎         | 172/6434 [24:33<15:00:05,  8.62s/it, gpt_loss=0.418, loss_mean=0.456][A
+Train step of epoch 0:   3%|▎         | 173/6434 [24:33<14:55:02,  8.58s/it, gpt_loss=0.418, loss_mean=0.456][A
+Train step of epoch 0:   3%|▎         | 173/6434 [24:40<14:55:02,  8.58s/it, gpt_loss=0.43, loss_mean=0.453] [A
+Train step of epoch 0:   3%|▎         | 174/6434 [24:40<14:22:18,  8.26s/it, gpt_loss=0.43, loss_mean=0.453][A
+Train step of epoch 0:   3%|▎         | 174/6434 [24:48<14:22:18,  8.26s/it, gpt_loss=0.462, loss_mean=0.454][A
+Train step of epoch 0:   3%|▎         | 175/6434 [24:48<14:20:03,  8.24s/it, gpt_loss=0.462, loss_mean=0.454][A
+Train step of epoch 0:   3%|▎         | 175/6434 [24:57<14:20:03,  8.24s/it, gpt_loss=0.455, loss_mean=0.454][A
+Train step of epoch 0:   3%|▎         | 176/6434 [24:57<14:22:30,  8.27s/it, gpt_loss=0.455, loss_mean=0.454][A
+Train step of epoch 0:   3%|▎         | 176/6434 [25:05<14:22:30,  8.27s/it, gpt_loss=0.513, loss_mean=0.46] [A
+Train step of epoch 0:   3%|▎         | 177/6434 [25:05<14:23:04,  8.28s/it, gpt_loss=0.513, loss_mean=0.46][A
+Train step of epoch 0:   3%|▎         | 177/6434 [25:14<14:23:04,  8.28s/it, gpt_loss=0.525, loss_mean=0.467][A
+Train step of epoch 0:   3%|▎         | 178/6434 [25:14<14:43:32,  8.47s/it, gpt_loss=0.525, loss_mean=0.467][A
+Train step of epoch 0:   3%|▎         | 178/6434 [25:23<14:43:32,  8.47s/it, gpt_loss=0.39, loss_mean=0.459] [A
+Train step of epoch 0:   3%|▎         | 179/6434 [25:23<14:50:17,  8.54s/it, gpt_loss=0.39, loss_mean=0.459][A
+[LID Router Debug] Step: 180
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [2, 9, 9, 3, 3, 3, 1, 0, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:   3%|▎         | 179/6434 [25:32<14:50:17,  8.54s/it, gpt_loss=0.519, loss_mean=0.465][A
+Train step of epoch 0:   3%|▎         | 180/6434 [25:32<15:09:10,  8.72s/it, gpt_loss=0.519, loss_mean=0.465][A
+Train step of epoch 0:   3%|▎         | 180/6434 [25:40<15:09:10,  8.72s/it, gpt_loss=0.464, loss_mean=0.465][A
+Train step of epoch 0:   3%|▎         | 181/6434 [25:40<14:45:21,  8.50s/it, gpt_loss=0.464, loss_mean=0.465][A
+Train step of epoch 0:   3%|▎         | 181/6434 [25:48<14:45:21,  8.50s/it, gpt_loss=0.348, loss_mean=0.453][A
+Train step of epoch 0:   3%|▎         | 182/6434 [25:48<14:46:09,  8.50s/it, gpt_loss=0.348, loss_mean=0.453][A
+Train step of epoch 0:   3%|▎         | 182/6434 [25:56<14:46:09,  8.50s/it, gpt_loss=0.409, loss_mean=0.449][A
+Train step of epoch 0:   3%|▎         | 183/6434 [25:56<14:31:41,  8.37s/it, gpt_loss=0.409, loss_mean=0.449][A
+Train step of epoch 0:   3%|▎         | 183/6434 [26:06<14:31:41,  8.37s/it, gpt_loss=0.38, loss_mean=0.442] [A
+Train step of epoch 0:   3%|▎         | 184/6434 [26:06<15:23:13,  8.86s/it, gpt_loss=0.38, loss_mean=0.442][A
+Train step of epoch 0:   3%|▎         | 184/6434 [26:14<15:23:13,  8.86s/it, gpt_loss=0.304, loss_mean=0.428][A
+Train step of epoch 0:   3%|▎         | 185/6434 [26:14<14:39:48,  8.45s/it, gpt_loss=0.304, loss_mean=0.428][A
+Train step of epoch 0:   3%|▎         | 185/6434 [26:23<14:39:48,  8.45s/it, gpt_loss=0.386, loss_mean=0.424][A
+Train step of epoch 0:   3%|▎         | 186/6434 [26:23<14:55:43,  8.60s/it, gpt_loss=0.386, loss_mean=0.424][A
+Train step of epoch 0:   3%|▎         | 186/6434 [26:32<14:55:43,  8.60s/it, gpt_loss=0.396, loss_mean=0.421][A
+Train step of epoch 0:   3%|▎         | 187/6434 [26:32<15:23:21,  8.87s/it, gpt_loss=0.396, loss_mean=0.421][A
+Train step of epoch 0:   3%|▎         | 187/6434 [26:42<15:23:21,  8.87s/it, gpt_loss=0.393, loss_mean=0.418][A
+Train step of epoch 0:   3%|▎         | 188/6434 [26:42<16:01:50,  9.24s/it, gpt_loss=0.393, loss_mean=0.418][A
+Train step of epoch 0:   3%|▎         | 188/6434 [26:51<16:01:50,  9.24s/it, gpt_loss=0.471, loss_mean=0.424][A
+Train step of epoch 0:   3%|▎         | 189/6434 [26:51<15:49:53,  9.13s/it, gpt_loss=0.471, loss_mean=0.424][A
+[LID Router Debug] Step: 190
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [2, 9, 1, 4, 0, 5, 2, 1, 5, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:   3%|▎         | 189/6434 [26:59<15:49:53,  9.13s/it, gpt_loss=0.385, loss_mean=0.42] [A
+Train step of epoch 0:   3%|▎         | 190/6434 [26:59<15:02:32,  8.67s/it, gpt_loss=0.385, loss_mean=0.42][A
+Train step of epoch 0:   3%|▎         | 190/6434 [27:08<15:02:32,  8.67s/it, gpt_loss=0.345, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 191/6434 [27:08<15:20:08,  8.84s/it, gpt_loss=0.345, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 191/6434 [27:18<15:20:08,  8.84s/it, gpt_loss=0.392, loss_mean=0.41] [A
+Train step of epoch 0:   3%|▎         | 192/6434 [27:18<15:42:00,  9.05s/it, gpt_loss=0.392, loss_mean=0.41][A
+Train step of epoch 0:   3%|▎         | 192/6434 [27:28<15:42:00,  9.05s/it, gpt_loss=0.437, loss_mean=0.413][A
+Train step of epoch 0:   3%|▎         | 193/6434 [27:28<16:12:53,  9.35s/it, gpt_loss=0.437, loss_mean=0.413][A
+Train step of epoch 0:   3%|▎         | 193/6434 [27:36<16:12:53,  9.35s/it, gpt_loss=0.531, loss_mean=0.425][A
+Train step of epoch 0:   3%|▎         | 194/6434 [27:36<15:46:22,  9.10s/it, gpt_loss=0.531, loss_mean=0.425][A
+Train step of epoch 0:   3%|▎         | 194/6434 [27:44<15:46:22,  9.10s/it, gpt_loss=0.444, loss_mean=0.427][A
+Train step of epoch 0:   3%|▎         | 195/6434 [27:44<15:10:45,  8.76s/it, gpt_loss=0.444, loss_mean=0.427][A
+Train step of epoch 0:   3%|▎         | 195/6434 [27:52<15:10:45,  8.76s/it, gpt_loss=0.462, loss_mean=0.43] [A
+Train step of epoch 0:   3%|▎         | 196/6434 [27:52<14:32:11,  8.39s/it, gpt_loss=0.462, loss_mean=0.43][A
+Train step of epoch 0:   3%|▎         | 196/6434 [28:00<14:32:11,  8.39s/it, gpt_loss=0.463, loss_mean=0.433][A
+Train step of epoch 0:   3%|▎         | 197/6434 [28:00<14:35:19,  8.42s/it, gpt_loss=0.463, loss_mean=0.433][A
+Train step of epoch 0:   3%|▎         | 197/6434 [28:08<14:35:19,  8.42s/it, gpt_loss=0.396, loss_mean=0.43] [A
+Train step of epoch 0:   3%|▎         | 198/6434 [28:08<14:20:38,  8.28s/it, gpt_loss=0.396, loss_mean=0.43][A
+Train step of epoch 0:   3%|▎         | 198/6434 [28:17<14:20:38,  8.28s/it, gpt_loss=0.298, loss_mean=0.417][A
+Train step of epoch 0:   3%|▎         | 199/6434 [28:17<14:24:04,  8.32s/it, gpt_loss=0.298, loss_mean=0.417][A
+[LID Router Debug] Step: 200
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [6, 6, 6, 0, 1, 4, 9, 0, 5, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+[2026-02-06 16:24:30,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[1.9999995134340094e-05, 1.9999995134340094e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 16:24:30,013] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=4.73043607391196, CurrSamplesPerSec=4.626720285225346, MemAllocated=12.61GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:   3%|▎         | 199/6434 [28:26<14:24:04,  8.32s/it, gpt_loss=0.465, loss_mean=0.421][A
+Train step of epoch 0:   3%|▎         | 200/6434 [28:26<14:43:00,  8.50s/it, gpt_loss=0.465, loss_mean=0.421][A
+Train step of epoch 0:   3%|▎         | 200/6434 [28:34<14:43:00,  8.50s/it, gpt_loss=0.403, loss_mean=0.42] [A
+Train step of epoch 0:   3%|▎         | 201/6434 [28:34<14:38:15,  8.45s/it, gpt_loss=0.403, loss_mean=0.42][A
+Train step of epoch 0:   3%|▎         | 201/6434 [28:42<14:38:15,  8.45s/it, gpt_loss=0.481, loss_mean=0.426][A
+Train step of epoch 0:   3%|▎         | 202/6434 [28:42<14:13:26,  8.22s/it, gpt_loss=0.481, loss_mean=0.426][A
+Train step of epoch 0:   3%|▎         | 202/6434 [28:50<14:13:26,  8.22s/it, gpt_loss=0.439, loss_mean=0.427][A
+Train step of epoch 0:   3%|▎         | 203/6434 [28:50<14:20:33,  8.29s/it, gpt_loss=0.439, loss_mean=0.427][A
+Train step of epoch 0:   3%|▎         | 203/6434 [28:57<14:20:33,  8.29s/it, gpt_loss=0.375, loss_mean=0.422][A
+Train step of epoch 0:   3%|▎         | 204/6434 [28:57<13:44:30,  7.94s/it, gpt_loss=0.375, loss_mean=0.422][A
+Train step of epoch 0:   3%|▎         | 204/6434 [29:05<13:44:30,  7.94s/it, gpt_loss=0.378, loss_mean=0.418][A
+Train step of epoch 0:   3%|▎         | 205/6434 [29:05<13:42:59,  7.93s/it, gpt_loss=0.378, loss_mean=0.418][A
+Train step of epoch 0:   3%|▎         | 205/6434 [29:14<13:42:59,  7.93s/it, gpt_loss=0.366, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 206/6434 [29:14<14:02:22,  8.12s/it, gpt_loss=0.366, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 206/6434 [29:22<14:02:22,  8.12s/it, gpt_loss=0.436, loss_mean=0.415][A
+Train step of epoch 0:   3%|▎         | 207/6434 [29:22<13:58:25,  8.08s/it, gpt_loss=0.436, loss_mean=0.415][A
+Train step of epoch 0:   3%|▎         | 207/6434 [29:30<13:58:25,  8.08s/it, gpt_loss=0.351, loss_mean=0.408][A
+Train step of epoch 0:   3%|▎         | 208/6434 [29:30<14:08:59,  8.18s/it, gpt_loss=0.351, loss_mean=0.408][A
+Train step of epoch 0:   3%|▎         | 208/6434 [29:40<14:08:59,  8.18s/it, gpt_loss=0.33, loss_mean=0.401] [A
+Train step of epoch 0:   3%|▎         | 209/6434 [29:40<15:11:06,  8.78s/it, gpt_loss=0.33, loss_mean=0.401][A
+[LID Router Debug] Step: 210
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [0, 4, 1, 5, 2, 5, 2, 0, 0, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+Train step of epoch 0:   3%|▎         | 209/6434 [29:48<15:11:06,  8.78s/it, gpt_loss=0.387, loss_mean=0.399][A
+Train step of epoch 0:   3%|▎         | 210/6434 [29:48<14:30:42,  8.39s/it, gpt_loss=0.387, loss_mean=0.399][A
+Train step of epoch 0:   3%|▎         | 210/6434 [29:56<14:30:42,  8.39s/it, gpt_loss=0.384, loss_mean=0.398][A
+Train step of epoch 0:   3%|▎         | 211/6434 [29:56<14:31:34,  8.40s/it, gpt_loss=0.384, loss_mean=0.398][A
+Train step of epoch 0:   3%|▎         | 211/6434 [30:06<14:31:34,  8.40s/it, gpt_loss=0.377, loss_mean=0.396][A
+Train step of epoch 0:   3%|▎         | 212/6434 [30:06<15:05:22,  8.73s/it, gpt_loss=0.377, loss_mean=0.396][A
+Train step of epoch 0:   3%|▎         | 212/6434 [30:15<15:05:22,  8.73s/it, gpt_loss=0.469, loss_mean=0.403][A
+Train step of epoch 0:   3%|▎         | 213/6434 [30:15<15:11:11,  8.79s/it, gpt_loss=0.469, loss_mean=0.403][A
+Train step of epoch 0:   3%|▎         | 213/6434 [30:23<15:11:11,  8.79s/it, gpt_loss=0.418, loss_mean=0.404][A
+Train step of epoch 0:   3%|▎         | 214/6434 [30:23<15:15:45,  8.83s/it, gpt_loss=0.418, loss_mean=0.404][A
+Train step of epoch 0:   3%|▎         | 214/6434 [30:31<15:15:45,  8.83s/it, gpt_loss=0.499, loss_mean=0.414][A
+Train step of epoch 0:   3%|▎         | 215/6434 [30:31<14:41:23,  8.50s/it, gpt_loss=0.499, loss_mean=0.414][A
+Train step of epoch 0:   3%|▎         | 215/6434 [30:39<14:41:23,  8.50s/it, gpt_loss=0.405, loss_mean=0.413][A
+Train step of epoch 0:   3%|▎         | 216/6434 [30:39<14:25:07,  8.35s/it, gpt_loss=0.405, loss_mean=0.413][A
+Train step of epoch 0:   3%|▎         | 216/6434 [30:48<14:25:07,  8.35s/it, gpt_loss=0.421, loss_mean=0.414][A
+Train step of epoch 0:   3%|▎         | 217/6434 [30:48<14:26:42,  8.36s/it, gpt_loss=0.421, loss_mean=0.414][A
+Train step of epoch 0:   3%|▎         | 217/6434 [30:55<14:26:42,  8.36s/it, gpt_loss=0.395, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 218/6434 [30:55<14:11:00,  8.21s/it, gpt_loss=0.395, loss_mean=0.412][A
+Train step of epoch 0:   3%|▎         | 218/6434 [31:04<14:11:00,  8.21s/it, gpt_loss=0.458, loss_mean=0.417][A
+Train step of epoch 0:   3%|▎         | 219/6434 [31:04<14:12:09,  8.23s/it, gpt_loss=0.458, loss_mean=0.417][A
+[LID Router Debug] Step: 220
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [2, 3, 5, 4, 9, 3, 3, 1, 3, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   3%|▎         | 219/6434 [31:12<14:12:09,  8.23s/it, gpt_loss=0.419, loss_mean=0.417][A
+Train step of epoch 0:   3%|▎         | 220/6434 [31:12<14:27:33,  8.38s/it, gpt_loss=0.419, loss_mean=0.417][A
+Train step of epoch 0:   3%|▎         | 220/6434 [31:23<14:27:33,  8.38s/it, gpt_loss=0.47, loss_mean=0.422] [A
+Train step of epoch 0:   3%|▎         | 221/6434 [31:23<15:35:45,  9.04s/it, gpt_loss=0.47, loss_mean=0.422][A
+Train step of epoch 0:   3%|▎         | 221/6434 [31:31<15:35:45,  9.04s/it, gpt_loss=0.417, loss_mean=0.422][A
+Train step of epoch 0:   3%|▎         | 222/6434 [31:31<14:59:56,  8.69s/it, gpt_loss=0.417, loss_mean=0.422][A
+Train step of epoch 0:   3%|▎         | 222/6434 [31:39<14:59:56,  8.69s/it, gpt_loss=0.468, loss_mean=0.426][A
+Train step of epoch 0:   3%|▎         | 223/6434 [31:39<14:32:00,  8.42s/it, gpt_loss=0.468, loss_mean=0.426][A
+Train step of epoch 0:   3%|▎         | 223/6434 [31:47<14:32:00,  8.42s/it, gpt_loss=0.412, loss_mean=0.425][A
+Train step of epoch 0:   3%|▎         | 224/6434 [31:47<14:21:35,  8.32s/it, gpt_loss=0.412, loss_mean=0.425][A
+Train step of epoch 0:   3%|▎         | 224/6434 [31:55<14:21:35,  8.32s/it, gpt_loss=0.402, loss_mean=0.423][A
+Train step of epoch 0:   3%|▎         | 225/6434 [31:55<14:13:45,  8.25s/it, gpt_loss=0.402, loss_mean=0.423][A
+Train step of epoch 0:   3%|▎         | 225/6434 [32:04<14:13:45,  8.25s/it, gpt_loss=0.409, loss_mean=0.421][A
+Train step of epoch 0:   4%|▎         | 226/6434 [32:04<14:34:28,  8.45s/it, gpt_loss=0.409, loss_mean=0.421][A
+Train step of epoch 0:   4%|▎         | 226/6434 [32:13<14:34:28,  8.45s/it, gpt_loss=0.404, loss_mean=0.419][A
+Train step of epoch 0:   4%|▎         | 227/6434 [32:13<14:43:22,  8.54s/it, gpt_loss=0.404, loss_mean=0.419][A
+Train step of epoch 0:   4%|▎         | 227/6434 [32:21<14:43:22,  8.54s/it, gpt_loss=0.426, loss_mean=0.42] [A
+Train step of epoch 0:   4%|▎         | 228/6434 [32:21<14:38:47,  8.50s/it, gpt_loss=0.426, loss_mean=0.42][A
+Train step of epoch 0:   4%|▎         | 228/6434 [32:30<14:38:47,  8.50s/it, gpt_loss=0.36, loss_mean=0.414][A
+Train step of epoch 0:   4%|▎         | 229/6434 [32:30<15:03:24,  8.74s/it, gpt_loss=0.36, loss_mean=0.414][A
+[LID Router Debug] Step: 230
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 0, 0, 3, 5, 2, 2, 5, 5, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:   4%|▎         | 229/6434 [32:38<15:03:24,  8.74s/it, gpt_loss=0.527, loss_mean=0.425][A
+Train step of epoch 0:   4%|▎         | 230/6434 [32:38<14:46:12,  8.57s/it, gpt_loss=0.527, loss_mean=0.425][A
+Train step of epoch 0:   4%|▎         | 230/6434 [32:47<14:46:12,  8.57s/it, gpt_loss=0.512, loss_mean=0.434][A
+Train step of epoch 0:   4%|▎         | 231/6434 [32:47<14:36:52,  8.48s/it, gpt_loss=0.512, loss_mean=0.434][A
+Train step of epoch 0:   4%|▎         | 231/6434 [32:55<14:36:52,  8.48s/it, gpt_loss=0.393, loss_mean=0.43] [A
+Train step of epoch 0:   4%|▎         | 232/6434 [32:55<14:28:22,  8.40s/it, gpt_loss=0.393, loss_mean=0.43][A
+Train step of epoch 0:   4%|▎         | 232/6434 [33:04<14:28:22,  8.40s/it, gpt_loss=0.391, loss_mean=0.426][A
+Train step of epoch 0:   4%|▎         | 233/6434 [33:04<14:36:53,  8.48s/it, gpt_loss=0.391, loss_mean=0.426][A
+Train step of epoch 0:   4%|▎         | 233/6434 [33:12<14:36:53,  8.48s/it, gpt_loss=0.399, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 234/6434 [33:12<14:44:33,  8.56s/it, gpt_loss=0.399, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 234/6434 [33:20<14:44:33,  8.56s/it, gpt_loss=0.417, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 235/6434 [33:20<14:28:36,  8.41s/it, gpt_loss=0.417, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 235/6434 [33:30<14:28:36,  8.41s/it, gpt_loss=0.486, loss_mean=0.429][A
+Train step of epoch 0:   4%|▎         | 236/6434 [33:30<15:14:44,  8.86s/it, gpt_loss=0.486, loss_mean=0.429][A
+Train step of epoch 0:   4%|▎         | 236/6434 [33:39<15:14:44,  8.86s/it, gpt_loss=0.409, loss_mean=0.427][A
+Train step of epoch 0:   4%|▎         | 237/6434 [33:39<15:12:40,  8.84s/it, gpt_loss=0.409, loss_mean=0.427][A
+Train step of epoch 0:   4%|▎         | 237/6434 [33:49<15:12:40,  8.84s/it, gpt_loss=0.388, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 238/6434 [33:49<15:37:16,  9.08s/it, gpt_loss=0.388, loss_mean=0.423][A
+Train step of epoch 0:   4%|▎         | 238/6434 [33:57<15:37:16,  9.08s/it, gpt_loss=0.32, loss_mean=0.413] [A
+Train step of epoch 0:   4%|▎         | 239/6434 [33:57<15:20:05,  8.91s/it, gpt_loss=0.32, loss_mean=0.413][A
+[LID Router Debug] Step: 240
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [5, 4, 3, 4, 1, 2, 9, 3, 0, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   4%|▎         | 239/6434 [34:06<15:20:05,  8.91s/it, gpt_loss=0.453, loss_mean=0.417][A
+Train step of epoch 0:   4%|▎         | 240/6434 [34:06<15:14:18,  8.86s/it, gpt_loss=0.453, loss_mean=0.417][A
+Train step of epoch 0:   4%|▎         | 240/6434 [34:14<15:14:18,  8.86s/it, gpt_loss=0.4, loss_mean=0.415]  [A
+Train step of epoch 0:   4%|▎         | 241/6434 [34:14<14:52:44,  8.65s/it, gpt_loss=0.4, loss_mean=0.415][A
+Train step of epoch 0:   4%|▎         | 241/6434 [34:22<14:52:44,  8.65s/it, gpt_loss=0.459, loss_mean=0.42][A
+Train step of epoch 0:   4%|▍         | 242/6434 [34:22<14:33:14,  8.46s/it, gpt_loss=0.459, loss_mean=0.42][A
+Train step of epoch 0:   4%|▍         | 242/6434 [34:30<14:33:14,  8.46s/it, gpt_loss=0.43, loss_mean=0.421][A
+Train step of epoch 0:   4%|▍         | 243/6434 [34:30<14:22:40,  8.36s/it, gpt_loss=0.43, loss_mean=0.421][A
+Train step of epoch 0:   4%|▍         | 243/6434 [34:39<14:22:40,  8.36s/it, gpt_loss=0.359, loss_mean=0.414][A
+Train step of epoch 0:   4%|▍         | 244/6434 [34:39<14:27:50,  8.41s/it, gpt_loss=0.359, loss_mean=0.414][A
+Train step of epoch 0:   4%|▍         | 244/6434 [34:48<14:27:50,  8.41s/it, gpt_loss=0.438, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 245/6434 [34:48<14:41:08,  8.54s/it, gpt_loss=0.438, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 245/6434 [34:57<14:41:08,  8.54s/it, gpt_loss=0.417, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 246/6434 [34:57<15:04:45,  8.77s/it, gpt_loss=0.417, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 246/6434 [35:04<15:04:45,  8.77s/it, gpt_loss=0.549, loss_mean=0.43] [A
+Train step of epoch 0:   4%|▍         | 247/6434 [35:04<14:26:46,  8.41s/it, gpt_loss=0.549, loss_mean=0.43][A
+Train step of epoch 0:   4%|▍         | 247/6434 [35:12<14:26:46,  8.41s/it, gpt_loss=0.462, loss_mean=0.433][A
+Train step of epoch 0:   4%|▍         | 248/6434 [35:12<14:05:25,  8.20s/it, gpt_loss=0.462, loss_mean=0.433][A
+Train step of epoch 0:   4%|▍         | 248/6434 [35:21<14:05:25,  8.20s/it, gpt_loss=0.374, loss_mean=0.427][A
+Train step of epoch 0:   4%|▍         | 249/6434 [35:21<14:28:57,  8.43s/it, gpt_loss=0.374, loss_mean=0.427][A
+[LID Router Debug] Step: 250
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [6, 1, 0, 2, 2, 0, 7, 0, 9, 5]
+Active Experts in Batch: {0, 1, 2, 5, 6, 7, 9}
+
+Train step of epoch 0:   4%|▍         | 249/6434 [35:30<14:28:57,  8.43s/it, gpt_loss=0.41, loss_mean=0.426] [A
+Train step of epoch 0:   4%|▍         | 250/6434 [35:30<14:35:16,  8.49s/it, gpt_loss=0.41, loss_mean=0.426][A
+Train step of epoch 0:   4%|▍         | 250/6434 [35:39<14:35:16,  8.49s/it, gpt_loss=0.408, loss_mean=0.424][A
+Train step of epoch 0:   4%|▍         | 251/6434 [35:39<15:01:43,  8.75s/it, gpt_loss=0.408, loss_mean=0.424][A
+Train step of epoch 0:   4%|▍         | 251/6434 [35:49<15:01:43,  8.75s/it, gpt_loss=0.405, loss_mean=0.422][A
+Train step of epoch 0:   4%|▍         | 252/6434 [35:49<15:20:58,  8.94s/it, gpt_loss=0.405, loss_mean=0.422][A
+Train step of epoch 0:   4%|▍         | 252/6434 [35:57<15:20:58,  8.94s/it, gpt_loss=0.527, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 253/6434 [35:57<14:52:38,  8.67s/it, gpt_loss=0.527, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 253/6434 [36:05<14:52:38,  8.67s/it, gpt_loss=0.503, loss_mean=0.44] [A
+Train step of epoch 0:   4%|▍         | 254/6434 [36:05<14:54:38,  8.69s/it, gpt_loss=0.503, loss_mean=0.44][A
+Train step of epoch 0:   4%|▍         | 254/6434 [36:13<14:54:38,  8.69s/it, gpt_loss=0.366, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 255/6434 [36:13<14:21:07,  8.36s/it, gpt_loss=0.366, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 255/6434 [36:21<14:21:07,  8.36s/it, gpt_loss=0.33, loss_mean=0.422] [A
+Train step of epoch 0:   4%|▍         | 256/6434 [36:21<14:22:00,  8.37s/it, gpt_loss=0.33, loss_mean=0.422][A
+Train step of epoch 0:   4%|▍         | 256/6434 [36:32<14:22:00,  8.37s/it, gpt_loss=0.424, loss_mean=0.422][A
+Train step of epoch 0:   4%|▍         | 257/6434 [36:32<15:22:11,  8.96s/it, gpt_loss=0.424, loss_mean=0.422][A
+Train step of epoch 0:   4%|▍         | 257/6434 [36:41<15:22:11,  8.96s/it, gpt_loss=0.385, loss_mean=0.418][A
+Train step of epoch 0:   4%|▍         | 258/6434 [36:41<15:38:07,  9.11s/it, gpt_loss=0.385, loss_mean=0.418][A
+Train step of epoch 0:   4%|▍         | 258/6434 [36:51<15:38:07,  9.11s/it, gpt_loss=0.424, loss_mean=0.419][A
+Train step of epoch 0:   4%|▍         | 259/6434 [36:51<15:56:25,  9.29s/it, gpt_loss=0.424, loss_mean=0.419][A
+[LID Router Debug] Step: 260
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 5, 5, 3, 5, 2, 2, 6, 5, 5]
+Active Experts in Batch: {2, 3, 4, 5, 6}
+
+Train step of epoch 0:   4%|▍         | 259/6434 [36:59<15:56:25,  9.29s/it, gpt_loss=0.361, loss_mean=0.413][A
+Train step of epoch 0:   4%|▍         | 260/6434 [36:59<15:07:33,  8.82s/it, gpt_loss=0.361, loss_mean=0.413][A
+Train step of epoch 0:   4%|▍         | 260/6434 [37:06<15:07:33,  8.82s/it, gpt_loss=0.43, loss_mean=0.415] [A
+Train step of epoch 0:   4%|▍         | 261/6434 [37:06<14:24:10,  8.40s/it, gpt_loss=0.43, loss_mean=0.415][A
+Train step of epoch 0:   4%|▍         | 261/6434 [37:15<14:24:10,  8.40s/it, gpt_loss=0.501, loss_mean=0.424][A
+Train step of epoch 0:   4%|▍         | 262/6434 [37:15<14:54:27,  8.70s/it, gpt_loss=0.501, loss_mean=0.424][A
+Train step of epoch 0:   4%|▍         | 262/6434 [37:23<14:54:27,  8.70s/it, gpt_loss=0.642, loss_mean=0.445][A
+Train step of epoch 0:   4%|▍         | 263/6434 [37:23<14:30:50,  8.47s/it, gpt_loss=0.642, loss_mean=0.445][A
+Train step of epoch 0:   4%|▍         | 263/6434 [37:31<14:30:50,  8.47s/it, gpt_loss=0.382, loss_mean=0.439][A
+Train step of epoch 0:   4%|▍         | 264/6434 [37:31<13:59:50,  8.17s/it, gpt_loss=0.382, loss_mean=0.439][A
+Train step of epoch 0:   4%|▍         | 264/6434 [37:39<13:59:50,  8.17s/it, gpt_loss=0.418, loss_mean=0.437][A
+Train step of epoch 0:   4%|▍         | 265/6434 [37:39<14:10:58,  8.28s/it, gpt_loss=0.418, loss_mean=0.437][A
+Train step of epoch 0:   4%|▍         | 265/6434 [37:47<14:10:58,  8.28s/it, gpt_loss=0.387, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 266/6434 [37:47<14:06:48,  8.24s/it, gpt_loss=0.387, loss_mean=0.432][A
+Train step of epoch 0:   4%|▍         | 266/6434 [37:55<14:06:48,  8.24s/it, gpt_loss=0.465, loss_mean=0.435][A
+Train step of epoch 0:   4%|▍         | 267/6434 [37:55<13:39:44,  7.98s/it, gpt_loss=0.465, loss_mean=0.435][A
+Train step of epoch 0:   4%|▍         | 267/6434 [38:04<13:39:44,  7.98s/it, gpt_loss=0.419, loss_mean=0.434][A
+Train step of epoch 0:   4%|▍         | 268/6434 [38:04<14:08:31,  8.26s/it, gpt_loss=0.419, loss_mean=0.434][A
+Train step of epoch 0:   4%|▍         | 268/6434 [38:12<14:08:31,  8.26s/it, gpt_loss=0.355, loss_mean=0.426][A
+Train step of epoch 0:   4%|▍         | 269/6434 [38:12<14:21:10,  8.38s/it, gpt_loss=0.355, loss_mean=0.426][A
+[LID Router Debug] Step: 270
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [1, 4, 0, 6, 9, 1, 5, 8, 1, 4]
+Active Experts in Batch: {0, 1, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:   4%|▍         | 269/6434 [38:20<14:21:10,  8.38s/it, gpt_loss=0.41, loss_mean=0.424] [A
+Train step of epoch 0:   4%|▍         | 270/6434 [38:20<14:09:07,  8.27s/it, gpt_loss=0.41, loss_mean=0.424][A
+Train step of epoch 0:   4%|▍         | 270/6434 [38:28<14:09:07,  8.27s/it, gpt_loss=0.37, loss_mean=0.419][A
+Train step of epoch 0:   4%|▍         | 271/6434 [38:28<13:57:50,  8.16s/it, gpt_loss=0.37, loss_mean=0.419][A
+Train step of epoch 0:   4%|▍         | 271/6434 [38:36<13:57:50,  8.16s/it, gpt_loss=0.409, loss_mean=0.418][A
+Train step of epoch 0:   4%|▍         | 272/6434 [38:36<13:53:04,  8.11s/it, gpt_loss=0.409, loss_mean=0.418][A
+Train step of epoch 0:   4%|▍         | 272/6434 [38:44<13:53:04,  8.11s/it, gpt_loss=0.436, loss_mean=0.42] [A
+Train step of epoch 0:   4%|▍         | 273/6434 [38:44<13:49:15,  8.08s/it, gpt_loss=0.436, loss_mean=0.42][A
+Train step of epoch 0:   4%|▍         | 273/6434 [38:52<13:49:15,  8.08s/it, gpt_loss=0.303, loss_mean=0.408][A
+Train step of epoch 0:   4%|▍         | 274/6434 [38:52<13:43:55,  8.03s/it, gpt_loss=0.303, loss_mean=0.408][A
+Train step of epoch 0:   4%|▍         | 274/6434 [39:01<13:43:55,  8.03s/it, gpt_loss=0.333, loss_mean=0.4]  [A
+Train step of epoch 0:   4%|▍         | 275/6434 [39:01<14:09:44,  8.28s/it, gpt_loss=0.333, loss_mean=0.4][A
+Train step of epoch 0:   4%|▍         | 275/6434 [39:10<14:09:44,  8.28s/it, gpt_loss=0.401, loss_mean=0.4][A
+Train step of epoch 0:   4%|▍         | 276/6434 [39:10<14:27:49,  8.46s/it, gpt_loss=0.401, loss_mean=0.4][A
+Train step of epoch 0:   4%|▍         | 276/6434 [39:18<14:27:49,  8.46s/it, gpt_loss=0.375, loss_mean=0.398][A
+Train step of epoch 0:   4%|▍         | 277/6434 [39:18<14:18:51,  8.37s/it, gpt_loss=0.375, loss_mean=0.398][A
+Train step of epoch 0:   4%|▍         | 277/6434 [39:26<14:18:51,  8.37s/it, gpt_loss=0.437, loss_mean=0.402][A
+Train step of epoch 0:   4%|▍         | 278/6434 [39:26<14:10:49,  8.29s/it, gpt_loss=0.437, loss_mean=0.402][A
+Train step of epoch 0:   4%|▍         | 278/6434 [39:35<14:10:49,  8.29s/it, gpt_loss=0.465, loss_mean=0.408][A
+Train step of epoch 0:   4%|▍         | 279/6434 [39:35<14:34:01,  8.52s/it, gpt_loss=0.465, loss_mean=0.408][A
+[LID Router Debug] Step: 280
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [6, 4, 3, 9, 2, 2, 3, 5, 4, 5]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   4%|▍         | 279/6434 [39:44<14:34:01,  8.52s/it, gpt_loss=0.475, loss_mean=0.415][A
+Train step of epoch 0:   4%|▍         | 280/6434 [39:44<14:36:43,  8.55s/it, gpt_loss=0.475, loss_mean=0.415][A
+Train step of epoch 0:   4%|▍         | 280/6434 [39:52<14:36:43,  8.55s/it, gpt_loss=0.406, loss_mean=0.414][A
+Train step of epoch 0:   4%|▍         | 281/6434 [39:52<14:16:54,  8.36s/it, gpt_loss=0.406, loss_mean=0.414][A
+Train step of epoch 0:   4%|▍         | 281/6434 [40:01<14:16:54,  8.36s/it, gpt_loss=0.386, loss_mean=0.411][A
+Train step of epoch 0:   4%|▍         | 282/6434 [40:01<14:29:13,  8.48s/it, gpt_loss=0.386, loss_mean=0.411][A
+Train step of epoch 0:   4%|▍         | 282/6434 [40:09<14:29:13,  8.48s/it, gpt_loss=0.369, loss_mean=0.407][A
+Train step of epoch 0:   4%|▍         | 283/6434 [40:09<14:39:25,  8.58s/it, gpt_loss=0.369, loss_mean=0.407][A
+Train step of epoch 0:   4%|▍         | 283/6434 [40:17<14:39:25,  8.58s/it, gpt_loss=0.367, loss_mean=0.403][A
+Train step of epoch 0:   4%|▍         | 284/6434 [40:17<14:10:12,  8.29s/it, gpt_loss=0.367, loss_mean=0.403][A
+Train step of epoch 0:   4%|▍         | 284/6434 [40:26<14:10:12,  8.29s/it, gpt_loss=0.402, loss_mean=0.403][A
+Train step of epoch 0:   4%|▍         | 285/6434 [40:26<14:33:51,  8.53s/it, gpt_loss=0.402, loss_mean=0.403][A
+Train step of epoch 0:   4%|▍         | 285/6434 [40:35<14:33:51,  8.53s/it, gpt_loss=0.416, loss_mean=0.404][A
+Train step of epoch 0:   4%|▍         | 286/6434 [40:35<14:39:51,  8.59s/it, gpt_loss=0.416, loss_mean=0.404][A
+Train step of epoch 0:   4%|▍         | 286/6434 [40:43<14:39:51,  8.59s/it, gpt_loss=0.533, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 287/6434 [40:43<14:28:47,  8.48s/it, gpt_loss=0.533, loss_mean=0.417][A
+Train step of epoch 0:   4%|▍         | 287/6434 [40:52<14:28:47,  8.48s/it, gpt_loss=0.498, loss_mean=0.425][A
+Train step of epoch 0:   4%|▍         | 288/6434 [40:52<14:29:52,  8.49s/it, gpt_loss=0.498, loss_mean=0.425][A
+Train step of epoch 0:   4%|▍         | 288/6434 [41:00<14:29:52,  8.49s/it, gpt_loss=0.428, loss_mean=0.425][A
+Train step of epoch 0:   4%|▍         | 289/6434 [41:00<14:23:09,  8.43s/it, gpt_loss=0.428, loss_mean=0.425][A
+[LID Router Debug] Step: 290
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [4, 4, 3, 5, 6, 9, 4, 9, 4, 4]
+Active Experts in Batch: {3, 4, 5, 6, 9}
+
+Train step of epoch 0:   4%|▍         | 289/6434 [41:08<14:23:09,  8.43s/it, gpt_loss=0.391, loss_mean=0.422][A
+Train step of epoch 0:   5%|▍         | 290/6434 [41:08<14:11:58,  8.32s/it, gpt_loss=0.391, loss_mean=0.422][A
+Train step of epoch 0:   5%|▍         | 290/6434 [41:17<14:11:58,  8.32s/it, gpt_loss=0.335, loss_mean=0.413][A
+Train step of epoch 0:   5%|▍         | 291/6434 [41:17<14:27:30,  8.47s/it, gpt_loss=0.335, loss_mean=0.413][A
+Train step of epoch 0:   5%|▍         | 291/6434 [41:25<14:27:30,  8.47s/it, gpt_loss=0.332, loss_mean=0.405][A
+Train step of epoch 0:   5%|▍         | 292/6434 [41:25<14:33:16,  8.53s/it, gpt_loss=0.332, loss_mean=0.405][A
+Train step of epoch 0:   5%|▍         | 292/6434 [41:34<14:33:16,  8.53s/it, gpt_loss=0.393, loss_mean=0.404][A
+Train step of epoch 0:   5%|▍         | 293/6434 [41:34<14:21:49,  8.42s/it, gpt_loss=0.393, loss_mean=0.404][A
+Train step of epoch 0:   5%|▍         | 293/6434 [41:43<14:21:49,  8.42s/it, gpt_loss=0.384, loss_mean=0.402][A
+Train step of epoch 0:   5%|▍         | 294/6434 [41:43<15:01:12,  8.81s/it, gpt_loss=0.384, loss_mean=0.402][A
+Train step of epoch 0:   5%|▍         | 294/6434 [41:52<15:01:12,  8.81s/it, gpt_loss=0.389, loss_mean=0.401][A
+Train step of epoch 0:   5%|▍         | 295/6434 [41:52<15:10:35,  8.90s/it, gpt_loss=0.389, loss_mean=0.401][A
+Train step of epoch 0:   5%|▍         | 295/6434 [42:01<15:10:35,  8.90s/it, gpt_loss=0.353, loss_mean=0.396][A
+Train step of epoch 0:   5%|▍         | 296/6434 [42:01<15:00:27,  8.80s/it, gpt_loss=0.353, loss_mean=0.396][A
+Train step of epoch 0:   5%|▍         | 296/6434 [42:09<15:00:27,  8.80s/it, gpt_loss=0.365, loss_mean=0.393][A
+Train step of epoch 0:   5%|▍         | 297/6434 [42:09<14:42:56,  8.63s/it, gpt_loss=0.365, loss_mean=0.393][A
+Train step of epoch 0:   5%|▍         | 297/6434 [42:18<14:42:56,  8.63s/it, gpt_loss=0.316, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 298/6434 [42:18<15:03:24,  8.83s/it, gpt_loss=0.316, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 298/6434 [42:28<15:03:24,  8.83s/it, gpt_loss=0.381, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 299/6434 [42:28<15:28:51,  9.08s/it, gpt_loss=0.381, loss_mean=0.385][A
+[LID Router Debug] Step: 300
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [5, 1, 6, 2, 0, 1, 2, 1, 4, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:   5%|▍         | 299/6434 [42:37<15:28:51,  9.08s/it, gpt_loss=0.382, loss_mean=0.384][A
+Train step of epoch 0:   5%|▍         | 300/6434 [42:37<15:14:43,  8.95s/it, gpt_loss=0.382, loss_mean=0.384][A
+Train step of epoch 0:   5%|▍         | 300/6434 [42:45<15:14:43,  8.95s/it, gpt_loss=0.394, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 301/6434 [42:45<14:44:55,  8.66s/it, gpt_loss=0.394, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 301/6434 [42:54<14:44:55,  8.66s/it, gpt_loss=0.356, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 302/6434 [42:54<15:00:43,  8.81s/it, gpt_loss=0.356, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 302/6434 [43:03<15:00:43,  8.81s/it, gpt_loss=0.417, loss_mean=0.386][A
+Train step of epoch 0:   5%|▍         | 303/6434 [43:03<15:03:35,  8.84s/it, gpt_loss=0.417, loss_mean=0.386][A
+Train step of epoch 0:   5%|▍         | 303/6434 [43:12<15:03:35,  8.84s/it, gpt_loss=0.349, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 304/6434 [43:12<15:14:54,  8.96s/it, gpt_loss=0.349, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 304/6434 [43:20<15:14:54,  8.96s/it, gpt_loss=0.352, loss_mean=0.379][A
+Train step of epoch 0:   5%|▍         | 305/6434 [43:20<14:42:59,  8.64s/it, gpt_loss=0.352, loss_mean=0.379][A
+Train step of epoch 0:   5%|▍         | 305/6434 [43:27<14:42:59,  8.64s/it, gpt_loss=0.4, loss_mean=0.381]  [A
+Train step of epoch 0:   5%|▍         | 306/6434 [43:27<13:58:46,  8.21s/it, gpt_loss=0.4, loss_mean=0.381][A
+Train step of epoch 0:   5%|▍         | 306/6434 [43:35<13:58:46,  8.21s/it, gpt_loss=0.404, loss_mean=0.383][A
+Train step of epoch 0:   5%|▍         | 307/6434 [43:35<13:51:31,  8.14s/it, gpt_loss=0.404, loss_mean=0.383][A
+Train step of epoch 0:   5%|▍         | 307/6434 [43:44<13:51:31,  8.14s/it, gpt_loss=0.356, loss_mean=0.381][A
+Train step of epoch 0:   5%|▍         | 308/6434 [43:44<14:00:57,  8.24s/it, gpt_loss=0.356, loss_mean=0.381][A
+Train step of epoch 0:   5%|▍         | 308/6434 [43:52<14:00:57,  8.24s/it, gpt_loss=0.35, loss_mean=0.378] [A
+Train step of epoch 0:   5%|▍         | 309/6434 [43:52<14:09:40,  8.32s/it, gpt_loss=0.35, loss_mean=0.378][A
+[LID Router Debug] Step: 310
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [0, 2, 3, 1, 1, 0, 2, 9, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:   5%|▍         | 309/6434 [44:01<14:09:40,  8.32s/it, gpt_loss=0.352, loss_mean=0.375][A
+Train step of epoch 0:   5%|▍         | 310/6434 [44:01<14:21:58,  8.45s/it, gpt_loss=0.352, loss_mean=0.375][A
+Train step of epoch 0:   5%|▍         | 310/6434 [44:10<14:21:58,  8.45s/it, gpt_loss=0.399, loss_mean=0.377][A
+Train step of epoch 0:   5%|▍         | 311/6434 [44:10<14:37:23,  8.60s/it, gpt_loss=0.399, loss_mean=0.377][A
+Train step of epoch 0:   5%|▍         | 311/6434 [44:18<14:37:23,  8.60s/it, gpt_loss=0.422, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 312/6434 [44:18<14:13:03,  8.36s/it, gpt_loss=0.422, loss_mean=0.382][A
+Train step of epoch 0:   5%|▍         | 312/6434 [44:26<14:13:03,  8.36s/it, gpt_loss=0.329, loss_mean=0.377][A
+Train step of epoch 0:   5%|▍         | 313/6434 [44:26<14:18:13,  8.41s/it, gpt_loss=0.329, loss_mean=0.377][A
+Train step of epoch 0:   5%|▍         | 313/6434 [44:34<14:18:13,  8.41s/it, gpt_loss=0.406, loss_mean=0.38] [A
+Train step of epoch 0:   5%|▍         | 314/6434 [44:34<13:57:54,  8.21s/it, gpt_loss=0.406, loss_mean=0.38][A
+Train step of epoch 0:   5%|▍         | 314/6434 [44:43<13:57:54,  8.21s/it, gpt_loss=0.378, loss_mean=0.379][A
+Train step of epoch 0:   5%|▍         | 315/6434 [44:43<14:11:07,  8.35s/it, gpt_loss=0.378, loss_mean=0.379][A
+Train step of epoch 0:   5%|▍         | 315/6434 [44:51<14:11:07,  8.35s/it, gpt_loss=0.313, loss_mean=0.373][A
+Train step of epoch 0:   5%|▍         | 316/6434 [44:51<14:23:37,  8.47s/it, gpt_loss=0.313, loss_mean=0.373][A
+Train step of epoch 0:   5%|▍         | 316/6434 [45:00<14:23:37,  8.47s/it, gpt_loss=0.48, loss_mean=0.384] [A
+Train step of epoch 0:   5%|▍         | 317/6434 [45:00<14:25:45,  8.49s/it, gpt_loss=0.48, loss_mean=0.384][A
+Train step of epoch 0:   5%|▍         | 317/6434 [45:10<14:25:45,  8.49s/it, gpt_loss=0.398, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 318/6434 [45:10<15:04:55,  8.88s/it, gpt_loss=0.398, loss_mean=0.385][A
+Train step of epoch 0:   5%|▍         | 318/6434 [45:18<15:04:55,  8.88s/it, gpt_loss=0.454, loss_mean=0.392][A
+Train step of epoch 0:   5%|▍         | 319/6434 [45:18<14:49:07,  8.72s/it, gpt_loss=0.454, loss_mean=0.392][A
+[LID Router Debug] Step: 320
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [0, 2, 3, 5, 9, 6, 4, 1, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   5%|▍         | 319/6434 [45:25<14:49:07,  8.72s/it, gpt_loss=0.414, loss_mean=0.394][A
+Train step of epoch 0:   5%|▍         | 320/6434 [45:25<14:01:55,  8.26s/it, gpt_loss=0.414, loss_mean=0.394][A
+Train step of epoch 0:   5%|▍         | 320/6434 [45:34<14:01:55,  8.26s/it, gpt_loss=0.334, loss_mean=0.388][A
+Train step of epoch 0:   5%|▍         | 321/6434 [45:34<14:19:43,  8.44s/it, gpt_loss=0.334, loss_mean=0.388][A
+Train step of epoch 0:   5%|▍         | 321/6434 [45:43<14:19:43,  8.44s/it, gpt_loss=0.405, loss_mean=0.39] [A
+Train step of epoch 0:   5%|▌         | 322/6434 [45:43<14:33:13,  8.57s/it, gpt_loss=0.405, loss_mean=0.39][A
+Train step of epoch 0:   5%|▌         | 322/6434 [45:51<14:33:13,  8.57s/it, gpt_loss=0.356, loss_mean=0.386][A
+Train step of epoch 0:   5%|▌         | 323/6434 [45:51<14:14:32,  8.39s/it, gpt_loss=0.356, loss_mean=0.386][A
+Train step of epoch 0:   5%|▌         | 323/6434 [46:00<14:14:32,  8.39s/it, gpt_loss=0.389, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 324/6434 [46:00<14:31:54,  8.56s/it, gpt_loss=0.389, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 324/6434 [46:08<14:31:54,  8.56s/it, gpt_loss=0.414, loss_mean=0.389][A
+Train step of epoch 0:   5%|▌         | 325/6434 [46:08<14:11:01,  8.36s/it, gpt_loss=0.414, loss_mean=0.389][A
+Train step of epoch 0:   5%|▌         | 325/6434 [46:16<14:11:01,  8.36s/it, gpt_loss=0.454, loss_mean=0.396][A
+Train step of epoch 0:   5%|▌         | 326/6434 [46:16<14:18:14,  8.43s/it, gpt_loss=0.454, loss_mean=0.396][A
+Train step of epoch 0:   5%|▌         | 326/6434 [46:24<14:18:14,  8.43s/it, gpt_loss=0.33, loss_mean=0.389] [A
+Train step of epoch 0:   5%|▌         | 327/6434 [46:24<14:04:34,  8.30s/it, gpt_loss=0.33, loss_mean=0.389][A
+Train step of epoch 0:   5%|▌         | 327/6434 [46:32<14:04:34,  8.30s/it, gpt_loss=0.323, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 328/6434 [46:32<13:49:38,  8.15s/it, gpt_loss=0.323, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 328/6434 [46:40<13:49:38,  8.15s/it, gpt_loss=0.368, loss_mean=0.381][A
+Train step of epoch 0:   5%|▌         | 329/6434 [46:40<13:27:50,  7.94s/it, gpt_loss=0.368, loss_mean=0.381][A
+[LID Router Debug] Step: 330
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [2, 0, 2, 6, 1, 5, 5, 3, 5, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+Train step of epoch 0:   5%|▌         | 329/6434 [46:48<13:27:50,  7.94s/it, gpt_loss=0.321, loss_mean=0.375][A
+Train step of epoch 0:   5%|▌         | 330/6434 [46:48<13:35:12,  8.01s/it, gpt_loss=0.321, loss_mean=0.375][A
+Train step of epoch 0:   5%|▌         | 330/6434 [46:56<13:35:12,  8.01s/it, gpt_loss=0.404, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 331/6434 [46:56<13:34:22,  8.01s/it, gpt_loss=0.404, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 331/6434 [47:04<13:34:22,  8.01s/it, gpt_loss=0.375, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 332/6434 [47:04<13:37:01,  8.03s/it, gpt_loss=0.375, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 332/6434 [47:12<13:37:01,  8.03s/it, gpt_loss=0.427, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 333/6434 [47:12<13:28:03,  7.95s/it, gpt_loss=0.427, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 333/6434 [47:19<13:28:03,  7.95s/it, gpt_loss=0.339, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 334/6434 [47:19<13:24:30,  7.91s/it, gpt_loss=0.339, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 334/6434 [47:29<13:24:30,  7.91s/it, gpt_loss=0.464, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 335/6434 [47:29<14:05:04,  8.31s/it, gpt_loss=0.464, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 335/6434 [47:38<14:05:04,  8.31s/it, gpt_loss=0.345, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 336/6434 [47:38<14:25:29,  8.52s/it, gpt_loss=0.345, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 336/6434 [47:46<14:25:29,  8.52s/it, gpt_loss=0.348, loss_mean=0.379][A
+Train step of epoch 0:   5%|▌         | 337/6434 [47:46<14:31:51,  8.58s/it, gpt_loss=0.348, loss_mean=0.379][A
+Train step of epoch 0:   5%|▌         | 337/6434 [47:54<14:31:51,  8.58s/it, gpt_loss=0.377, loss_mean=0.379][A
+Train step of epoch 0:   5%|▌         | 338/6434 [47:54<13:55:32,  8.22s/it, gpt_loss=0.377, loss_mean=0.379][A
+Train step of epoch 0:   5%|▌         | 338/6434 [48:02<13:55:32,  8.22s/it, gpt_loss=0.535, loss_mean=0.395][A
+Train step of epoch 0:   5%|▌         | 339/6434 [48:02<13:43:17,  8.10s/it, gpt_loss=0.535, loss_mean=0.395][A
+[LID Router Debug] Step: 340
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [0, 1, 3, 2, 0, 1, 6, 2, 5, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+Train step of epoch 0:   5%|▌         | 339/6434 [48:11<13:43:17,  8.10s/it, gpt_loss=0.363, loss_mean=0.392][A
+Train step of epoch 0:   5%|▌         | 340/6434 [48:11<14:21:49,  8.49s/it, gpt_loss=0.363, loss_mean=0.392][A
+Train step of epoch 0:   5%|▌         | 340/6434 [48:18<14:21:49,  8.49s/it, gpt_loss=0.349, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 341/6434 [48:18<13:49:14,  8.17s/it, gpt_loss=0.349, loss_mean=0.387][A
+Train step of epoch 0:   5%|▌         | 341/6434 [48:27<13:49:14,  8.17s/it, gpt_loss=0.345, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 342/6434 [48:27<14:00:21,  8.28s/it, gpt_loss=0.345, loss_mean=0.383][A
+Train step of epoch 0:   5%|▌         | 342/6434 [48:35<14:00:21,  8.28s/it, gpt_loss=0.398, loss_mean=0.385][A
+Train step of epoch 0:   5%|▌         | 343/6434 [48:35<13:56:30,  8.24s/it, gpt_loss=0.398, loss_mean=0.385][A
+Train step of epoch 0:   5%|▌         | 343/6434 [48:43<13:56:30,  8.24s/it, gpt_loss=0.29, loss_mean=0.375] [A
+Train step of epoch 0:   5%|▌         | 344/6434 [48:43<13:59:23,  8.27s/it, gpt_loss=0.29, loss_mean=0.375][A
+Train step of epoch 0:   5%|▌         | 344/6434 [48:53<13:59:23,  8.27s/it, gpt_loss=0.337, loss_mean=0.371][A
+Train step of epoch 0:   5%|▌         | 345/6434 [48:53<14:23:57,  8.51s/it, gpt_loss=0.337, loss_mean=0.371][A
+Train step of epoch 0:   5%|▌         | 345/6434 [49:02<14:23:57,  8.51s/it, gpt_loss=0.443, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 346/6434 [49:02<14:54:27,  8.82s/it, gpt_loss=0.443, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 346/6434 [49:11<14:54:27,  8.82s/it, gpt_loss=0.37, loss_mean=0.378] [A
+Train step of epoch 0:   5%|▌         | 347/6434 [49:11<14:49:11,  8.76s/it, gpt_loss=0.37, loss_mean=0.378][A
+Train step of epoch 0:   5%|▌         | 347/6434 [49:18<14:49:11,  8.76s/it, gpt_loss=0.356, loss_mean=0.375][A
+Train step of epoch 0:   5%|▌         | 348/6434 [49:18<14:13:01,  8.41s/it, gpt_loss=0.356, loss_mean=0.375][A
+Train step of epoch 0:   5%|▌         | 348/6434 [49:26<14:13:01,  8.41s/it, gpt_loss=0.347, loss_mean=0.373][A
+Train step of epoch 0:   5%|▌         | 349/6434 [49:26<13:44:39,  8.13s/it, gpt_loss=0.347, loss_mean=0.373][A
+[LID Router Debug] Step: 350
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [1, 1, 4, 0, 0, 4, 3, 1, 3, 1]
+Active Experts in Batch: {0, 1, 3, 4}
+
+Train step of epoch 0:   5%|▌         | 349/6434 [49:34<13:44:39,  8.13s/it, gpt_loss=0.458, loss_mean=0.381][A
+Train step of epoch 0:   5%|▌         | 350/6434 [49:34<13:48:13,  8.17s/it, gpt_loss=0.458, loss_mean=0.381][A
+Train step of epoch 0:   5%|▌         | 350/6434 [49:42<13:48:13,  8.17s/it, gpt_loss=0.286, loss_mean=0.372][A
+Train step of epoch 0:   5%|▌         | 351/6434 [49:42<13:45:24,  8.14s/it, gpt_loss=0.286, loss_mean=0.372][A
+Train step of epoch 0:   5%|▌         | 351/6434 [49:50<13:45:24,  8.14s/it, gpt_loss=0.424, loss_mean=0.377][A
+Train step of epoch 0:   5%|▌         | 352/6434 [49:50<13:44:28,  8.13s/it, gpt_loss=0.424, loss_mean=0.377][A
+Train step of epoch 0:   5%|▌         | 352/6434 [49:58<13:44:28,  8.13s/it, gpt_loss=0.43, loss_mean=0.382] [A
+Train step of epoch 0:   5%|▌         | 353/6434 [49:58<13:46:44,  8.16s/it, gpt_loss=0.43, loss_mean=0.382][A
+Train step of epoch 0:   5%|▌         | 353/6434 [50:06<13:46:44,  8.16s/it, gpt_loss=0.367, loss_mean=0.381][A
+Train step of epoch 0:   6%|▌         | 354/6434 [50:06<13:30:07,  7.99s/it, gpt_loss=0.367, loss_mean=0.381][A
+Train step of epoch 0:   6%|▌         | 354/6434 [50:13<13:30:07,  7.99s/it, gpt_loss=0.411, loss_mean=0.384][A
+Train step of epoch 0:   6%|▌         | 355/6434 [50:13<13:13:29,  7.83s/it, gpt_loss=0.411, loss_mean=0.384][A
+Train step of epoch 0:   6%|▌         | 355/6434 [50:22<13:13:29,  7.83s/it, gpt_loss=0.351, loss_mean=0.38] [A
+Train step of epoch 0:   6%|▌         | 356/6434 [50:22<13:32:14,  8.02s/it, gpt_loss=0.351, loss_mean=0.38][A
+Train step of epoch 0:   6%|▌         | 356/6434 [50:30<13:32:14,  8.02s/it, gpt_loss=0.43, loss_mean=0.385][A
+Train step of epoch 0:   6%|▌         | 357/6434 [50:30<13:34:19,  8.04s/it, gpt_loss=0.43, loss_mean=0.385][A
+Train step of epoch 0:   6%|▌         | 357/6434 [50:38<13:34:19,  8.04s/it, gpt_loss=0.373, loss_mean=0.384][A
+Train step of epoch 0:   6%|▌         | 358/6434 [50:38<13:29:05,  7.99s/it, gpt_loss=0.373, loss_mean=0.384][A
+Train step of epoch 0:   6%|▌         | 358/6434 [50:46<13:29:05,  7.99s/it, gpt_loss=0.352, loss_mean=0.381][A
+Train step of epoch 0:   6%|▌         | 359/6434 [50:46<13:32:41,  8.03s/it, gpt_loss=0.352, loss_mean=0.381][A
+[LID Router Debug] Step: 360
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 9, 4, 9, 4, 1, 4, 6, 1, 4]
+Active Experts in Batch: {9, 4, 1, 6}
+
+Train step of epoch 0:   6%|▌         | 359/6434 [50:54<13:32:41,  8.03s/it, gpt_loss=0.473, loss_mean=0.39] [A
+Train step of epoch 0:   6%|▌         | 360/6434 [50:54<13:35:22,  8.05s/it, gpt_loss=0.473, loss_mean=0.39][A
+Train step of epoch 0:   6%|▌         | 360/6434 [51:04<13:35:22,  8.05s/it, gpt_loss=0.388, loss_mean=0.39][A
+Train step of epoch 0:   6%|▌         | 361/6434 [51:04<14:35:44,  8.65s/it, gpt_loss=0.388, loss_mean=0.39][A
+Train step of epoch 0:   6%|▌         | 361/6434 [51:12<14:35:44,  8.65s/it, gpt_loss=0.37, loss_mean=0.388][A
+Train step of epoch 0:   6%|▌         | 362/6434 [51:12<14:25:19,  8.55s/it, gpt_loss=0.37, loss_mean=0.388][A
+Train step of epoch 0:   6%|▌         | 362/6434 [51:20<14:25:19,  8.55s/it, gpt_loss=0.344, loss_mean=0.383][A
+Train step of epoch 0:   6%|▌         | 363/6434 [51:20<13:56:44,  8.27s/it, gpt_loss=0.344, loss_mean=0.383][A
+Train step of epoch 0:   6%|▌         | 363/6434 [51:28<13:56:44,  8.27s/it, gpt_loss=0.359, loss_mean=0.381][A
+Train step of epoch 0:   6%|▌         | 364/6434 [51:28<13:30:44,  8.01s/it, gpt_loss=0.359, loss_mean=0.381][A
+Train step of epoch 0:   6%|▌         | 364/6434 [51:35<13:30:44,  8.01s/it, gpt_loss=0.431, loss_mean=0.386][A
+Train step of epoch 0:   6%|▌         | 365/6434 [51:35<13:06:57,  7.78s/it, gpt_loss=0.431, loss_mean=0.386][A
+Train step of epoch 0:   6%|▌         | 365/6434 [51:43<13:06:57,  7.78s/it, gpt_loss=0.348, loss_mean=0.382][A
+Train step of epoch 0:   6%|▌         | 366/6434 [51:43<13:21:47,  7.93s/it, gpt_loss=0.348, loss_mean=0.382][A
+Train step of epoch 0:   6%|▌         | 366/6434 [51:52<13:21:47,  7.93s/it, gpt_loss=0.34, loss_mean=0.378] [A
+Train step of epoch 0:   6%|▌         | 367/6434 [51:52<13:48:05,  8.19s/it, gpt_loss=0.34, loss_mean=0.378][A
+Train step of epoch 0:   6%|▌         | 367/6434 [52:00<13:48:05,  8.19s/it, gpt_loss=0.388, loss_mean=0.379][A
+Train step of epoch 0:   6%|▌         | 368/6434 [52:00<13:57:19,  8.28s/it, gpt_loss=0.388, loss_mean=0.379][A
+Train step of epoch 0:   6%|▌         | 368/6434 [52:08<13:57:19,  8.28s/it, gpt_loss=0.393, loss_mean=0.38] [A
+Train step of epoch 0:   6%|▌         | 369/6434 [52:08<13:47:58,  8.19s/it, gpt_loss=0.393, loss_mean=0.38][A
+[LID Router Debug] Step: 370
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 3, 5, 1, 1, 3, 5, 2, 4, 6]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:   6%|▌         | 369/6434 [52:17<13:47:58,  8.19s/it, gpt_loss=0.345, loss_mean=0.377][A
+Train step of epoch 0:   6%|▌         | 370/6434 [52:17<14:05:53,  8.37s/it, gpt_loss=0.345, loss_mean=0.377][A
+Train step of epoch 0:   6%|▌         | 370/6434 [52:25<14:05:53,  8.37s/it, gpt_loss=0.311, loss_mean=0.37] [A
+Train step of epoch 0:   6%|▌         | 371/6434 [52:25<14:05:09,  8.36s/it, gpt_loss=0.311, loss_mean=0.37][A
+Train step of epoch 0:   6%|▌         | 371/6434 [52:34<14:05:09,  8.36s/it, gpt_loss=0.345, loss_mean=0.368][A
+Train step of epoch 0:   6%|▌         | 372/6434 [52:34<14:11:06,  8.42s/it, gpt_loss=0.345, loss_mean=0.368][A
+Train step of epoch 0:   6%|▌         | 372/6434 [52:42<14:11:06,  8.42s/it, gpt_loss=0.362, loss_mean=0.367][A
+Train step of epoch 0:   6%|▌         | 373/6434 [52:42<14:09:33,  8.41s/it, gpt_loss=0.362, loss_mean=0.367][A
+Train step of epoch 0:   6%|▌         | 373/6434 [52:51<14:09:33,  8.41s/it, gpt_loss=0.476, loss_mean=0.378][A
+Train step of epoch 0:   6%|▌         | 374/6434 [52:51<14:04:25,  8.36s/it, gpt_loss=0.476, loss_mean=0.378][A
+Train step of epoch 0:   6%|▌         | 374/6434 [53:01<14:04:25,  8.36s/it, gpt_loss=0.395, loss_mean=0.38] [A
+Train step of epoch 0:   6%|▌         | 375/6434 [53:01<15:08:22,  9.00s/it, gpt_loss=0.395, loss_mean=0.38][A
+Train step of epoch 0:   6%|▌         | 375/6434 [53:09<15:08:22,  9.00s/it, gpt_loss=0.338, loss_mean=0.375][A
+Train step of epoch 0:   6%|▌         | 376/6434 [53:09<14:47:51,  8.79s/it, gpt_loss=0.338, loss_mean=0.375][A
+Train step of epoch 0:   6%|▌         | 376/6434 [53:18<14:47:51,  8.79s/it, gpt_loss=0.316, loss_mean=0.37] [A
+Train step of epoch 0:   6%|▌         | 377/6434 [53:18<14:27:40,  8.60s/it, gpt_loss=0.316, loss_mean=0.37][A
+Train step of epoch 0:   6%|▌         | 377/6434 [53:26<14:27:40,  8.60s/it, gpt_loss=0.444, loss_mean=0.377][A
+Train step of epoch 0:   6%|▌         | 378/6434 [53:26<14:19:19,  8.51s/it, gpt_loss=0.444, loss_mean=0.377][A
+Train step of epoch 0:   6%|▌         | 378/6434 [53:35<14:19:19,  8.51s/it, gpt_loss=0.36, loss_mean=0.375] [A
+Train step of epoch 0:   6%|▌         | 379/6434 [53:35<14:48:28,  8.80s/it, gpt_loss=0.36, loss_mean=0.375][A
+[LID Router Debug] Step: 380
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [3, 3, 4, 9, 2, 4, 9, 2, 7, 5]
+Active Experts in Batch: {2, 3, 4, 5, 7, 9}
+
+Train step of epoch 0:   6%|▌         | 379/6434 [53:44<14:48:28,  8.80s/it, gpt_loss=0.354, loss_mean=0.373][A
+Train step of epoch 0:   6%|▌         | 380/6434 [53:44<14:49:21,  8.81s/it, gpt_loss=0.354, loss_mean=0.373][A
+Train step of epoch 0:   6%|▌         | 380/6434 [53:53<14:49:21,  8.81s/it, gpt_loss=0.318, loss_mean=0.368][A
+Train step of epoch 0:   6%|▌         | 381/6434 [53:53<15:01:05,  8.93s/it, gpt_loss=0.318, loss_mean=0.368][A
+Train step of epoch 0:   6%|▌         | 381/6434 [54:01<15:01:05,  8.93s/it, gpt_loss=0.411, loss_mean=0.372][A
+Train step of epoch 0:   6%|▌         | 382/6434 [54:01<14:19:12,  8.52s/it, gpt_loss=0.411, loss_mean=0.372][A
+Train step of epoch 0:   6%|▌         | 382/6434 [54:08<14:19:12,  8.52s/it, gpt_loss=0.402, loss_mean=0.375][A
+Train step of epoch 0:   6%|▌         | 383/6434 [54:08<13:38:06,  8.11s/it, gpt_loss=0.402, loss_mean=0.375][A
+Train step of epoch 0:   6%|▌         | 383/6434 [54:18<13:38:06,  8.11s/it, gpt_loss=0.459, loss_mean=0.383][A
+Train step of epoch 0:   6%|▌         | 384/6434 [54:18<14:31:30,  8.64s/it, gpt_loss=0.459, loss_mean=0.383][A
+Train step of epoch 0:   6%|▌         | 384/6434 [54:27<14:31:30,  8.64s/it, gpt_loss=0.461, loss_mean=0.391][A
+Train step of epoch 0:   6%|▌         | 385/6434 [54:27<14:48:24,  8.81s/it, gpt_loss=0.461, loss_mean=0.391][A
+Train step of epoch 0:   6%|▌         | 385/6434 [54:36<14:48:24,  8.81s/it, gpt_loss=0.392, loss_mean=0.391][A
+Train step of epoch 0:   6%|▌         | 386/6434 [54:36<15:01:00,  8.94s/it, gpt_loss=0.392, loss_mean=0.391][A
+Train step of epoch 0:   6%|▌         | 386/6434 [54:45<15:01:00,  8.94s/it, gpt_loss=0.378, loss_mean=0.39] [A
+Train step of epoch 0:   6%|▌         | 387/6434 [54:45<14:42:44,  8.76s/it, gpt_loss=0.378, loss_mean=0.39][A
+Train step of epoch 0:   6%|▌         | 387/6434 [54:53<14:42:44,  8.76s/it, gpt_loss=0.47, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 388/6434 [54:53<14:20:07,  8.54s/it, gpt_loss=0.47, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 388/6434 [55:02<14:20:07,  8.54s/it, gpt_loss=0.321, loss_mean=0.39][A
+Train step of epoch 0:   6%|▌         | 389/6434 [55:02<14:26:22,  8.60s/it, gpt_loss=0.321, loss_mean=0.39][A
+[LID Router Debug] Step: 390
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [1, 1, 3, 5, 9, 1, 3, 0, 9, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:   6%|▌         | 389/6434 [55:10<14:26:22,  8.60s/it, gpt_loss=0.479, loss_mean=0.399][A
+Train step of epoch 0:   6%|▌         | 390/6434 [55:10<14:12:45,  8.47s/it, gpt_loss=0.479, loss_mean=0.399][A
+Train step of epoch 0:   6%|▌         | 390/6434 [55:18<14:12:45,  8.47s/it, gpt_loss=0.352, loss_mean=0.394][A
+Train step of epoch 0:   6%|▌         | 391/6434 [55:18<13:55:09,  8.29s/it, gpt_loss=0.352, loss_mean=0.394][A
+Train step of epoch 0:   6%|▌         | 391/6434 [55:26<13:55:09,  8.29s/it, gpt_loss=0.446, loss_mean=0.4]  [A
+Train step of epoch 0:   6%|▌         | 392/6434 [55:26<13:53:32,  8.28s/it, gpt_loss=0.446, loss_mean=0.4][A
+Train step of epoch 0:   6%|▌         | 392/6434 [55:35<13:53:32,  8.28s/it, gpt_loss=0.44, loss_mean=0.404][A
+Train step of epoch 0:   6%|▌         | 393/6434 [55:35<14:24:48,  8.59s/it, gpt_loss=0.44, loss_mean=0.404][A
+Train step of epoch 0:   6%|▌         | 393/6434 [55:44<14:24:48,  8.59s/it, gpt_loss=0.344, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 394/6434 [55:44<14:33:57,  8.68s/it, gpt_loss=0.344, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 394/6434 [55:52<14:33:57,  8.68s/it, gpt_loss=0.429, loss_mean=0.401][A
+Train step of epoch 0:   6%|▌         | 395/6434 [55:52<14:03:03,  8.38s/it, gpt_loss=0.429, loss_mean=0.401][A
+Train step of epoch 0:   6%|▌         | 395/6434 [56:00<14:03:03,  8.38s/it, gpt_loss=0.392, loss_mean=0.4]  [A
+Train step of epoch 0:   6%|▌         | 396/6434 [56:00<14:07:50,  8.43s/it, gpt_loss=0.392, loss_mean=0.4][A
+Train step of epoch 0:   6%|▌         | 396/6434 [56:08<14:07:50,  8.43s/it, gpt_loss=0.44, loss_mean=0.404][A
+Train step of epoch 0:   6%|▌         | 397/6434 [56:08<13:46:52,  8.22s/it, gpt_loss=0.44, loss_mean=0.404][A
+Train step of epoch 0:   6%|▌         | 397/6434 [56:15<13:46:52,  8.22s/it, gpt_loss=0.426, loss_mean=0.406][A
+Train step of epoch 0:   6%|▌         | 398/6434 [56:15<13:24:25,  8.00s/it, gpt_loss=0.426, loss_mean=0.406][A
+Train step of epoch 0:   6%|▌         | 398/6434 [56:24<13:24:25,  8.00s/it, gpt_loss=0.415, loss_mean=0.407][A
+Train step of epoch 0:   6%|▌         | 399/6434 [56:24<13:36:11,  8.11s/it, gpt_loss=0.415, loss_mean=0.407][A
+[LID Router Debug] Step: 400
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [5, 1, 4, 4, 3, 0, 5, 2, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+[2026-02-06 16:52:36,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[1.999426501601224e-05, 1.999426501601224e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 16:52:36,124] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=200, RunningAvgSamplesPerSec=4.7420941127830565, CurrSamplesPerSec=4.950035512366853, MemAllocated=12.71GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:   6%|▌         | 399/6434 [56:32<13:36:11,  8.11s/it, gpt_loss=0.321, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 400/6434 [56:32<13:26:32,  8.02s/it, gpt_loss=0.321, loss_mean=0.398][A
+Train step of epoch 0:   6%|▌         | 400/6434 [56:41<13:26:32,  8.02s/it, gpt_loss=0.492, loss_mean=0.408][A
+Train step of epoch 0:   6%|▌         | 401/6434 [56:41<14:04:44,  8.40s/it, gpt_loss=0.492, loss_mean=0.408][A
+Train step of epoch 0:   6%|▌         | 401/6434 [56:51<14:04:44,  8.40s/it, gpt_loss=0.256, loss_mean=0.393][A
+Train step of epoch 0:   6%|▌         | 402/6434 [56:51<14:51:57,  8.87s/it, gpt_loss=0.256, loss_mean=0.393][A
+Train step of epoch 0:   6%|▌         | 402/6434 [56:59<14:51:57,  8.87s/it, gpt_loss=0.358, loss_mean=0.389][A
+Train step of epoch 0:   6%|▋         | 403/6434 [56:59<14:27:12,  8.63s/it, gpt_loss=0.358, loss_mean=0.389][A
+Train step of epoch 0:   6%|▋         | 403/6434 [57:07<14:27:12,  8.63s/it, gpt_loss=0.398, loss_mean=0.39] [A
+Train step of epoch 0:   6%|▋         | 404/6434 [57:07<13:57:09,  8.33s/it, gpt_loss=0.398, loss_mean=0.39][A
+Train step of epoch 0:   6%|▋         | 404/6434 [57:15<13:57:09,  8.33s/it, gpt_loss=0.363, loss_mean=0.387][A
+Train step of epoch 0:   6%|▋         | 405/6434 [57:15<14:08:36,  8.45s/it, gpt_loss=0.363, loss_mean=0.387][A
+Train step of epoch 0:   6%|▋         | 405/6434 [57:24<14:08:36,  8.45s/it, gpt_loss=0.418, loss_mean=0.39] [A
+Train step of epoch 0:   6%|▋         | 406/6434 [57:24<14:12:51,  8.49s/it, gpt_loss=0.418, loss_mean=0.39][A
+Train step of epoch 0:   6%|▋         | 406/6434 [57:33<14:12:51,  8.49s/it, gpt_loss=0.507, loss_mean=0.402][A
+Train step of epoch 0:   6%|▋         | 407/6434 [57:33<14:32:44,  8.69s/it, gpt_loss=0.507, loss_mean=0.402][A
+Train step of epoch 0:   6%|▋         | 407/6434 [57:42<14:32:44,  8.69s/it, gpt_loss=0.421, loss_mean=0.404][A
+Train step of epoch 0:   6%|▋         | 408/6434 [57:42<14:41:19,  8.78s/it, gpt_loss=0.421, loss_mean=0.404][A
+Train step of epoch 0:   6%|▋         | 408/6434 [57:50<14:41:19,  8.78s/it, gpt_loss=0.474, loss_mean=0.411][A
+Train step of epoch 0:   6%|▋         | 409/6434 [57:50<14:12:49,  8.49s/it, gpt_loss=0.474, loss_mean=0.411][A
+[LID Router Debug] Step: 410
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [3, 1, 0, 1, 5, 6, 0, 4, 5, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6}
+
+Train step of epoch 0:   6%|▋         | 409/6434 [57:58<14:12:49,  8.49s/it, gpt_loss=0.414, loss_mean=0.411][A
+Train step of epoch 0:   6%|▋         | 410/6434 [57:58<13:57:14,  8.34s/it, gpt_loss=0.414, loss_mean=0.411][A
+Train step of epoch 0:   6%|▋         | 410/6434 [58:05<13:57:14,  8.34s/it, gpt_loss=0.377, loss_mean=0.408][A
+Train step of epoch 0:   6%|▋         | 411/6434 [58:05<13:34:18,  8.11s/it, gpt_loss=0.377, loss_mean=0.408][A
+Train step of epoch 0:   6%|▋         | 411/6434 [58:15<13:34:18,  8.11s/it, gpt_loss=0.443, loss_mean=0.411][A
+Train step of epoch 0:   6%|▋         | 412/6434 [58:15<14:10:23,  8.47s/it, gpt_loss=0.443, loss_mean=0.411][A
+Train step of epoch 0:   6%|▋         | 412/6434 [58:23<14:10:23,  8.47s/it, gpt_loss=0.415, loss_mean=0.412][A
+Train step of epoch 0:   6%|▋         | 413/6434 [58:23<13:56:01,  8.33s/it, gpt_loss=0.415, loss_mean=0.412][A
+Train step of epoch 0:   6%|▋         | 413/6434 [58:32<13:56:01,  8.33s/it, gpt_loss=0.427, loss_mean=0.413][A
+Train step of epoch 0:   6%|▋         | 414/6434 [58:32<14:08:29,  8.46s/it, gpt_loss=0.427, loss_mean=0.413][A
+Train step of epoch 0:   6%|▋         | 414/6434 [58:40<14:08:29,  8.46s/it, gpt_loss=0.313, loss_mean=0.403][A
+Train step of epoch 0:   6%|▋         | 415/6434 [58:40<14:11:23,  8.49s/it, gpt_loss=0.313, loss_mean=0.403][A
+Train step of epoch 0:   6%|▋         | 415/6434 [58:48<14:11:23,  8.49s/it, gpt_loss=0.413, loss_mean=0.404][A
+Train step of epoch 0:   6%|▋         | 416/6434 [58:48<14:04:50,  8.42s/it, gpt_loss=0.413, loss_mean=0.404][A
+Train step of epoch 0:   6%|▋         | 416/6434 [58:58<14:04:50,  8.42s/it, gpt_loss=0.438, loss_mean=0.408][A
+Train step of epoch 0:   6%|▋         | 417/6434 [58:58<14:47:27,  8.85s/it, gpt_loss=0.438, loss_mean=0.408][A
+Train step of epoch 0:   6%|▋         | 417/6434 [59:06<14:47:27,  8.85s/it, gpt_loss=0.344, loss_mean=0.401][A
+Train step of epoch 0:   6%|▋         | 418/6434 [59:06<14:24:26,  8.62s/it, gpt_loss=0.344, loss_mean=0.401][A
+Train step of epoch 0:   6%|▋         | 418/6434 [59:14<14:24:26,  8.62s/it, gpt_loss=0.364, loss_mean=0.398][A
+Train step of epoch 0:   7%|▋         | 419/6434 [59:14<14:07:24,  8.45s/it, gpt_loss=0.364, loss_mean=0.398][A
+[LID Router Debug] Step: 420
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [2, 0, 3, 1, 2, 2, 4, 9, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:   7%|▋         | 419/6434 [59:23<14:07:24,  8.45s/it, gpt_loss=0.431, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 420/6434 [59:23<14:09:53,  8.48s/it, gpt_loss=0.431, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 420/6434 [59:32<14:09:53,  8.48s/it, gpt_loss=0.407, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 421/6434 [59:32<14:42:31,  8.81s/it, gpt_loss=0.407, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 421/6434 [59:42<14:42:31,  8.81s/it, gpt_loss=0.431, loss_mean=0.404][A
+Train step of epoch 0:   7%|▋         | 422/6434 [59:42<14:57:53,  8.96s/it, gpt_loss=0.431, loss_mean=0.404][A
+Train step of epoch 0:   7%|▋         | 422/6434 [59:50<14:57:53,  8.96s/it, gpt_loss=0.354, loss_mean=0.399][A
+Train step of epoch 0:   7%|▋         | 423/6434 [59:50<14:22:25,  8.61s/it, gpt_loss=0.354, loss_mean=0.399][A
+Train step of epoch 0:   7%|▋         | 423/6434 [59:58<14:22:25,  8.61s/it, gpt_loss=0.503, loss_mean=0.41] [A
+Train step of epoch 0:   7%|▋         | 424/6434 [59:58<14:08:11,  8.47s/it, gpt_loss=0.503, loss_mean=0.41][A
+Train step of epoch 0:   7%|▋         | 424/6434 [1:00:05<14:08:11,  8.47s/it, gpt_loss=0.384, loss_mean=0.407][A
+Train step of epoch 0:   7%|▋         | 425/6434 [1:00:05<13:42:25,  8.21s/it, gpt_loss=0.384, loss_mean=0.407][A
+Train step of epoch 0:   7%|▋         | 425/6434 [1:00:14<13:42:25,  8.21s/it, gpt_loss=0.266, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 426/6434 [1:00:14<13:55:19,  8.34s/it, gpt_loss=0.266, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 426/6434 [1:00:22<13:55:19,  8.34s/it, gpt_loss=0.375, loss_mean=0.391][A
+Train step of epoch 0:   7%|▋         | 427/6434 [1:00:22<13:41:01,  8.20s/it, gpt_loss=0.375, loss_mean=0.391][A
+Train step of epoch 0:   7%|▋         | 427/6434 [1:00:30<13:41:01,  8.20s/it, gpt_loss=0.453, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 428/6434 [1:00:30<13:45:25,  8.25s/it, gpt_loss=0.453, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 428/6434 [1:00:39<13:45:25,  8.25s/it, gpt_loss=0.464, loss_mean=0.404][A
+Train step of epoch 0:   7%|▋         | 429/6434 [1:00:39<13:50:26,  8.30s/it, gpt_loss=0.464, loss_mean=0.404][A
+[LID Router Debug] Step: 430
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 1, 2, 5, 0, 4, 9, 1, 9, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:   7%|▋         | 429/6434 [1:00:46<13:50:26,  8.30s/it, gpt_loss=0.481, loss_mean=0.412][A
+Train step of epoch 0:   7%|▋         | 430/6434 [1:00:46<13:38:17,  8.18s/it, gpt_loss=0.481, loss_mean=0.412][A
+Train step of epoch 0:   7%|▋         | 430/6434 [1:00:56<13:38:17,  8.18s/it, gpt_loss=0.3, loss_mean=0.401]  [A
+Train step of epoch 0:   7%|▋         | 431/6434 [1:00:56<14:04:00,  8.44s/it, gpt_loss=0.3, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 431/6434 [1:01:04<14:04:00,  8.44s/it, gpt_loss=0.324, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 432/6434 [1:01:04<13:54:51,  8.35s/it, gpt_loss=0.324, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 432/6434 [1:01:12<13:54:51,  8.35s/it, gpt_loss=0.348, loss_mean=0.389][A
+Train step of epoch 0:   7%|▋         | 433/6434 [1:01:12<13:57:26,  8.37s/it, gpt_loss=0.348, loss_mean=0.389][A
+Train step of epoch 0:   7%|▋         | 433/6434 [1:01:20<13:57:26,  8.37s/it, gpt_loss=0.381, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 434/6434 [1:01:20<13:47:18,  8.27s/it, gpt_loss=0.381, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 434/6434 [1:01:29<13:47:18,  8.27s/it, gpt_loss=0.425, loss_mean=0.391][A
+Train step of epoch 0:   7%|▋         | 435/6434 [1:01:29<13:57:24,  8.38s/it, gpt_loss=0.425, loss_mean=0.391][A
+Train step of epoch 0:   7%|▋         | 435/6434 [1:01:36<13:57:24,  8.38s/it, gpt_loss=0.396, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 436/6434 [1:01:36<13:30:02,  8.10s/it, gpt_loss=0.396, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 436/6434 [1:01:44<13:30:02,  8.10s/it, gpt_loss=0.339, loss_mean=0.387][A
+Train step of epoch 0:   7%|▋         | 437/6434 [1:01:44<13:33:19,  8.14s/it, gpt_loss=0.339, loss_mean=0.387][A
+Train step of epoch 0:   7%|▋         | 437/6434 [1:01:52<13:33:19,  8.14s/it, gpt_loss=0.349, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 438/6434 [1:01:52<13:30:50,  8.11s/it, gpt_loss=0.349, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 438/6434 [1:02:01<13:30:50,  8.11s/it, gpt_loss=0.382, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 439/6434 [1:02:01<13:56:54,  8.38s/it, gpt_loss=0.382, loss_mean=0.383][A
+[LID Router Debug] Step: 440
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [0, 5, 4, 9, 2, 6, 5, 4, 0, 0]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:   7%|▋         | 439/6434 [1:02:09<13:56:54,  8.38s/it, gpt_loss=0.442, loss_mean=0.389][A
+Train step of epoch 0:   7%|▋         | 440/6434 [1:02:09<13:44:52,  8.26s/it, gpt_loss=0.442, loss_mean=0.389][A
+Train step of epoch 0:   7%|▋         | 440/6434 [1:02:17<13:44:52,  8.26s/it, gpt_loss=0.438, loss_mean=0.394][A
+Train step of epoch 0:   7%|▋         | 441/6434 [1:02:17<13:33:00,  8.14s/it, gpt_loss=0.438, loss_mean=0.394][A
+Train step of epoch 0:   7%|▋         | 441/6434 [1:02:26<13:33:00,  8.14s/it, gpt_loss=0.283, loss_mean=0.382][A
+Train step of epoch 0:   7%|▋         | 442/6434 [1:02:26<13:43:22,  8.24s/it, gpt_loss=0.283, loss_mean=0.382][A
+Train step of epoch 0:   7%|▋         | 442/6434 [1:02:36<13:43:22,  8.24s/it, gpt_loss=0.392, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 443/6434 [1:02:36<14:50:22,  8.92s/it, gpt_loss=0.392, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 443/6434 [1:02:44<14:50:22,  8.92s/it, gpt_loss=0.499, loss_mean=0.395][A
+Train step of epoch 0:   7%|▋         | 444/6434 [1:02:44<14:27:31,  8.69s/it, gpt_loss=0.499, loss_mean=0.395][A
+Train step of epoch 0:   7%|▋         | 444/6434 [1:02:53<14:27:31,  8.69s/it, gpt_loss=0.417, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 445/6434 [1:02:53<14:14:11,  8.56s/it, gpt_loss=0.417, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 445/6434 [1:03:01<14:14:11,  8.56s/it, gpt_loss=0.346, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 446/6434 [1:03:01<14:04:24,  8.46s/it, gpt_loss=0.346, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 446/6434 [1:03:09<14:04:24,  8.46s/it, gpt_loss=0.427, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 447/6434 [1:03:09<13:51:30,  8.33s/it, gpt_loss=0.427, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 447/6434 [1:03:17<13:51:30,  8.33s/it, gpt_loss=0.455, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 448/6434 [1:03:17<13:51:49,  8.34s/it, gpt_loss=0.455, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 448/6434 [1:03:26<13:51:49,  8.34s/it, gpt_loss=0.324, loss_mean=0.394][A
+Train step of epoch 0:   7%|▋         | 449/6434 [1:03:26<14:11:34,  8.54s/it, gpt_loss=0.324, loss_mean=0.394][A
+[LID Router Debug] Step: 450
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [1, 0, 0, 5, 4, 3, 3, 3, 5, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+
+Train step of epoch 0:   7%|▋         | 449/6434 [1:03:35<14:11:34,  8.54s/it, gpt_loss=0.383, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 450/6434 [1:03:35<14:06:43,  8.49s/it, gpt_loss=0.383, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 450/6434 [1:03:42<14:06:43,  8.49s/it, gpt_loss=0.346, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 451/6434 [1:03:42<13:44:17,  8.27s/it, gpt_loss=0.346, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 451/6434 [1:03:51<13:44:17,  8.27s/it, gpt_loss=0.323, loss_mean=0.382][A
+Train step of epoch 0:   7%|▋         | 452/6434 [1:03:51<13:52:24,  8.35s/it, gpt_loss=0.323, loss_mean=0.382][A
+Train step of epoch 0:   7%|▋         | 452/6434 [1:04:00<13:52:24,  8.35s/it, gpt_loss=0.37, loss_mean=0.38]  [A
+Train step of epoch 0:   7%|▋         | 453/6434 [1:04:00<14:14:36,  8.57s/it, gpt_loss=0.37, loss_mean=0.38][A
+Train step of epoch 0:   7%|▋         | 453/6434 [1:04:08<14:14:36,  8.57s/it, gpt_loss=0.35, loss_mean=0.377][A
+Train step of epoch 0:   7%|▋         | 454/6434 [1:04:08<14:06:34,  8.49s/it, gpt_loss=0.35, loss_mean=0.377][A
+Train step of epoch 0:   7%|▋         | 454/6434 [1:04:16<14:06:34,  8.49s/it, gpt_loss=0.357, loss_mean=0.375][A
+Train step of epoch 0:   7%|▋         | 455/6434 [1:04:16<13:33:22,  8.16s/it, gpt_loss=0.357, loss_mean=0.375][A
+Train step of epoch 0:   7%|▋         | 455/6434 [1:04:24<13:33:22,  8.16s/it, gpt_loss=0.43, loss_mean=0.381] [A
+Train step of epoch 0:   7%|▋         | 456/6434 [1:04:24<13:48:40,  8.32s/it, gpt_loss=0.43, loss_mean=0.381][A
+Train step of epoch 0:   7%|▋         | 456/6434 [1:04:33<13:48:40,  8.32s/it, gpt_loss=0.376, loss_mean=0.38][A
+Train step of epoch 0:   7%|▋         | 457/6434 [1:04:33<13:42:59,  8.26s/it, gpt_loss=0.376, loss_mean=0.38][A
+Train step of epoch 0:   7%|▋         | 457/6434 [1:04:41<13:42:59,  8.26s/it, gpt_loss=0.375, loss_mean=0.38][A
+Train step of epoch 0:   7%|▋         | 458/6434 [1:04:41<13:55:45,  8.39s/it, gpt_loss=0.375, loss_mean=0.38][A
+Train step of epoch 0:   7%|▋         | 458/6434 [1:04:49<13:55:45,  8.39s/it, gpt_loss=0.39, loss_mean=0.381][A
+Train step of epoch 0:   7%|▋         | 459/6434 [1:04:49<13:38:11,  8.22s/it, gpt_loss=0.39, loss_mean=0.381][A
+[LID Router Debug] Step: 460
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [3, 6, 9, 3, 1, 6, 2, 1, 2, 3]
+Active Experts in Batch: {1, 2, 3, 6, 9}
+
+Train step of epoch 0:   7%|▋         | 459/6434 [1:04:58<13:38:11,  8.22s/it, gpt_loss=0.363, loss_mean=0.379][A
+Train step of epoch 0:   7%|▋         | 460/6434 [1:04:58<13:45:55,  8.30s/it, gpt_loss=0.363, loss_mean=0.379][A
+Train step of epoch 0:   7%|▋         | 460/6434 [1:05:07<13:45:55,  8.30s/it, gpt_loss=0.414, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 461/6434 [1:05:07<14:12:47,  8.57s/it, gpt_loss=0.414, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 461/6434 [1:05:15<14:12:47,  8.57s/it, gpt_loss=0.386, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 462/6434 [1:05:15<14:04:18,  8.48s/it, gpt_loss=0.386, loss_mean=0.383][A
+Train step of epoch 0:   7%|▋         | 462/6434 [1:05:23<14:04:18,  8.48s/it, gpt_loss=0.344, loss_mean=0.379][A
+Train step of epoch 0:   7%|▋         | 463/6434 [1:05:23<13:48:29,  8.33s/it, gpt_loss=0.344, loss_mean=0.379][A
+Train step of epoch 0:   7%|▋         | 463/6434 [1:05:30<13:48:29,  8.33s/it, gpt_loss=0.546, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 464/6434 [1:05:30<13:18:06,  8.02s/it, gpt_loss=0.546, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 464/6434 [1:05:39<13:18:06,  8.02s/it, gpt_loss=0.456, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 465/6434 [1:05:39<13:22:11,  8.06s/it, gpt_loss=0.456, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 465/6434 [1:05:48<13:22:11,  8.06s/it, gpt_loss=0.344, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 466/6434 [1:05:48<14:04:29,  8.49s/it, gpt_loss=0.344, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 466/6434 [1:05:56<14:04:29,  8.49s/it, gpt_loss=0.389, loss_mean=0.395][A
+Train step of epoch 0:   7%|▋         | 467/6434 [1:05:56<13:36:53,  8.21s/it, gpt_loss=0.389, loss_mean=0.395][A
+Train step of epoch 0:   7%|▋         | 467/6434 [1:06:04<13:36:53,  8.21s/it, gpt_loss=0.547, loss_mean=0.41] [A
+Train step of epoch 0:   7%|▋         | 468/6434 [1:06:04<13:34:54,  8.20s/it, gpt_loss=0.547, loss_mean=0.41][A
+Train step of epoch 0:   7%|▋         | 468/6434 [1:06:12<13:34:54,  8.20s/it, gpt_loss=0.314, loss_mean=0.401][A
+Train step of epoch 0:   7%|▋         | 469/6434 [1:06:12<13:46:57,  8.32s/it, gpt_loss=0.314, loss_mean=0.401][A
+[LID Router Debug] Step: 470
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 0, 0, 6, 1, 0, 2, 1, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 6}
+
+Train step of epoch 0:   7%|▋         | 469/6434 [1:06:21<13:46:57,  8.32s/it, gpt_loss=0.321, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 470/6434 [1:06:21<14:03:54,  8.49s/it, gpt_loss=0.321, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 470/6434 [1:06:29<14:03:54,  8.49s/it, gpt_loss=0.485, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 471/6434 [1:06:29<13:49:36,  8.35s/it, gpt_loss=0.485, loss_mean=0.402][A
+Train step of epoch 0:   7%|▋         | 471/6434 [1:06:39<13:49:36,  8.35s/it, gpt_loss=0.418, loss_mean=0.404][A
+Train step of epoch 0:   7%|▋         | 472/6434 [1:06:39<14:17:46,  8.63s/it, gpt_loss=0.418, loss_mean=0.404][A
+Train step of epoch 0:   7%|▋         | 472/6434 [1:06:47<14:17:46,  8.63s/it, gpt_loss=0.464, loss_mean=0.41] [A
+Train step of epoch 0:   7%|▋         | 473/6434 [1:06:47<14:17:51,  8.63s/it, gpt_loss=0.464, loss_mean=0.41][A
+Train step of epoch 0:   7%|▋         | 473/6434 [1:06:55<14:17:51,  8.63s/it, gpt_loss=0.358, loss_mean=0.405][A
+Train step of epoch 0:   7%|▋         | 474/6434 [1:06:55<14:05:00,  8.51s/it, gpt_loss=0.358, loss_mean=0.405][A
+Train step of epoch 0:   7%|▋         | 474/6434 [1:07:05<14:05:00,  8.51s/it, gpt_loss=0.323, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 475/6434 [1:07:05<14:31:27,  8.77s/it, gpt_loss=0.323, loss_mean=0.396][A
+Train step of epoch 0:   7%|▋         | 475/6434 [1:07:12<14:31:27,  8.77s/it, gpt_loss=0.361, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 476/6434 [1:07:12<13:36:49,  8.23s/it, gpt_loss=0.361, loss_mean=0.393][A
+Train step of epoch 0:   7%|▋         | 476/6434 [1:07:20<13:36:49,  8.23s/it, gpt_loss=0.457, loss_mean=0.399][A
+Train step of epoch 0:   7%|▋         | 477/6434 [1:07:20<13:46:51,  8.33s/it, gpt_loss=0.457, loss_mean=0.399][A
+Train step of epoch 0:   7%|▋         | 477/6434 [1:07:28<13:46:51,  8.33s/it, gpt_loss=0.372, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 478/6434 [1:07:28<13:37:52,  8.24s/it, gpt_loss=0.372, loss_mean=0.397][A
+Train step of epoch 0:   7%|▋         | 478/6434 [1:07:36<13:37:52,  8.24s/it, gpt_loss=0.368, loss_mean=0.394][A
+Train step of epoch 0:   7%|▋         | 479/6434 [1:07:36<13:30:26,  8.17s/it, gpt_loss=0.368, loss_mean=0.394][A
+[LID Router Debug] Step: 480
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [5, 4, 4, 1, 2, 9, 5, 1, 5, 6]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:   7%|▋         | 479/6434 [1:07:46<13:30:26,  8.17s/it, gpt_loss=0.374, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 480/6434 [1:07:46<14:18:40,  8.65s/it, gpt_loss=0.374, loss_mean=0.392][A
+Train step of epoch 0:   7%|▋         | 480/6434 [1:07:55<14:18:40,  8.65s/it, gpt_loss=0.357, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 481/6434 [1:07:55<14:24:27,  8.71s/it, gpt_loss=0.357, loss_mean=0.388][A
+Train step of epoch 0:   7%|▋         | 481/6434 [1:08:03<14:24:27,  8.71s/it, gpt_loss=0.375, loss_mean=0.387][A
+Train step of epoch 0:   7%|▋         | 482/6434 [1:08:03<14:10:42,  8.58s/it, gpt_loss=0.375, loss_mean=0.387][A
+Train step of epoch 0:   7%|▋         | 482/6434 [1:08:13<14:10:42,  8.58s/it, gpt_loss=0.433, loss_mean=0.391][A
+Train step of epoch 0:   8%|▊         | 483/6434 [1:08:13<14:40:48,  8.88s/it, gpt_loss=0.433, loss_mean=0.391][A
+Train step of epoch 0:   8%|▊         | 483/6434 [1:08:21<14:40:48,  8.88s/it, gpt_loss=0.34, loss_mean=0.386] [A
+Train step of epoch 0:   8%|▊         | 484/6434 [1:08:21<14:32:49,  8.80s/it, gpt_loss=0.34, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 484/6434 [1:08:30<14:32:49,  8.80s/it, gpt_loss=0.342, loss_mean=0.382][A
+Train step of epoch 0:   8%|▊         | 485/6434 [1:08:30<14:30:08,  8.78s/it, gpt_loss=0.342, loss_mean=0.382][A
+Train step of epoch 0:   8%|▊         | 485/6434 [1:08:38<14:30:08,  8.78s/it, gpt_loss=0.345, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 486/6434 [1:08:38<14:07:38,  8.55s/it, gpt_loss=0.345, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 486/6434 [1:08:46<14:07:38,  8.55s/it, gpt_loss=0.495, loss_mean=0.39] [A
+Train step of epoch 0:   8%|▊         | 487/6434 [1:08:46<13:58:27,  8.46s/it, gpt_loss=0.495, loss_mean=0.39][A
+Train step of epoch 0:   8%|▊         | 487/6434 [1:08:55<13:58:27,  8.46s/it, gpt_loss=0.429, loss_mean=0.394][A
+Train step of epoch 0:   8%|▊         | 488/6434 [1:08:55<14:13:04,  8.61s/it, gpt_loss=0.429, loss_mean=0.394][A
+Train step of epoch 0:   8%|▊         | 488/6434 [1:09:03<14:13:04,  8.61s/it, gpt_loss=0.301, loss_mean=0.384][A
+Train step of epoch 0:   8%|▊         | 489/6434 [1:09:03<13:53:55,  8.42s/it, gpt_loss=0.301, loss_mean=0.384][A
+[LID Router Debug] Step: 490
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [5, 6, 5, 1, 0, 9, 3, 4, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:   8%|▊         | 489/6434 [1:09:11<13:53:55,  8.42s/it, gpt_loss=0.407, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 490/6434 [1:09:11<13:36:27,  8.24s/it, gpt_loss=0.407, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 490/6434 [1:09:20<13:36:27,  8.24s/it, gpt_loss=0.393, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 491/6434 [1:09:20<13:43:31,  8.31s/it, gpt_loss=0.393, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 491/6434 [1:09:27<13:43:31,  8.31s/it, gpt_loss=0.328, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 492/6434 [1:09:27<13:27:54,  8.16s/it, gpt_loss=0.328, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 492/6434 [1:09:35<13:27:54,  8.16s/it, gpt_loss=0.322, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 493/6434 [1:09:35<13:04:27,  7.92s/it, gpt_loss=0.322, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 493/6434 [1:09:43<13:04:27,  7.92s/it, gpt_loss=0.374, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 494/6434 [1:09:43<13:17:02,  8.05s/it, gpt_loss=0.374, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 494/6434 [1:09:51<13:17:02,  8.05s/it, gpt_loss=0.364, loss_mean=0.374][A
+Train step of epoch 0:   8%|▊         | 495/6434 [1:09:51<13:19:53,  8.08s/it, gpt_loss=0.364, loss_mean=0.374][A
+Train step of epoch 0:   8%|▊         | 495/6434 [1:10:00<13:19:53,  8.08s/it, gpt_loss=0.324, loss_mean=0.369][A
+Train step of epoch 0:   8%|▊         | 496/6434 [1:10:00<13:25:10,  8.14s/it, gpt_loss=0.324, loss_mean=0.369][A
+Train step of epoch 0:   8%|▊         | 496/6434 [1:10:09<13:25:10,  8.14s/it, gpt_loss=0.395, loss_mean=0.372][A
+Train step of epoch 0:   8%|▊         | 497/6434 [1:10:09<13:53:47,  8.43s/it, gpt_loss=0.395, loss_mean=0.372][A
+Train step of epoch 0:   8%|▊         | 497/6434 [1:10:16<13:53:47,  8.43s/it, gpt_loss=0.402, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 498/6434 [1:10:16<13:30:30,  8.19s/it, gpt_loss=0.402, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 498/6434 [1:10:24<13:30:30,  8.19s/it, gpt_loss=0.432, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 499/6434 [1:10:24<13:25:48,  8.15s/it, gpt_loss=0.432, loss_mean=0.381][A
+[LID Router Debug] Step: 500
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [9, 5, 3, 3, 2, 4, 3, 5, 9, 5]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:   8%|▊         | 499/6434 [1:10:33<13:25:48,  8.15s/it, gpt_loss=0.4, loss_mean=0.382]  [A
+Train step of epoch 0:   8%|▊         | 500/6434 [1:10:33<13:34:51,  8.24s/it, gpt_loss=0.4, loss_mean=0.382][A
+Train step of epoch 0:   8%|▊         | 500/6434 [1:10:42<13:34:51,  8.24s/it, gpt_loss=0.354, loss_mean=0.38][A
+Train step of epoch 0:   8%|▊         | 501/6434 [1:10:42<14:03:19,  8.53s/it, gpt_loss=0.354, loss_mean=0.38][A
+Train step of epoch 0:   8%|▊         | 501/6434 [1:10:50<14:03:19,  8.53s/it, gpt_loss=0.337, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 502/6434 [1:10:50<13:48:36,  8.38s/it, gpt_loss=0.337, loss_mean=0.375][A
+Train step of epoch 0:   8%|▊         | 502/6434 [1:10:59<13:48:36,  8.38s/it, gpt_loss=0.429, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 503/6434 [1:10:59<13:53:45,  8.43s/it, gpt_loss=0.429, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 503/6434 [1:11:06<13:53:45,  8.43s/it, gpt_loss=0.438, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 504/6434 [1:11:06<13:21:36,  8.11s/it, gpt_loss=0.438, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 504/6434 [1:11:15<13:21:36,  8.11s/it, gpt_loss=0.332, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 505/6434 [1:11:15<13:36:23,  8.26s/it, gpt_loss=0.332, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 505/6434 [1:11:23<13:36:23,  8.26s/it, gpt_loss=0.346, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 506/6434 [1:11:23<13:45:41,  8.36s/it, gpt_loss=0.346, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 506/6434 [1:11:31<13:45:41,  8.36s/it, gpt_loss=0.296, loss_mean=0.369][A
+Train step of epoch 0:   8%|▊         | 507/6434 [1:11:31<13:39:53,  8.30s/it, gpt_loss=0.296, loss_mean=0.369][A
+Train step of epoch 0:   8%|▊         | 507/6434 [1:11:40<13:39:53,  8.30s/it, gpt_loss=0.311, loss_mean=0.364][A
+Train step of epoch 0:   8%|▊         | 508/6434 [1:11:40<13:37:48,  8.28s/it, gpt_loss=0.311, loss_mean=0.364][A
+Train step of epoch 0:   8%|▊         | 508/6434 [1:11:49<13:37:48,  8.28s/it, gpt_loss=0.462, loss_mean=0.373][A
+Train step of epoch 0:   8%|▊         | 509/6434 [1:11:49<13:59:22,  8.50s/it, gpt_loss=0.462, loss_mean=0.373][A
+[LID Router Debug] Step: 510
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [3, 4, 1, 4, 1, 4, 5, 5, 9, 5]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:   8%|▊         | 509/6434 [1:11:57<13:59:22,  8.50s/it, gpt_loss=0.5, loss_mean=0.386]  [A
+Train step of epoch 0:   8%|▊         | 510/6434 [1:11:57<13:43:50,  8.34s/it, gpt_loss=0.5, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 510/6434 [1:12:04<13:43:50,  8.34s/it, gpt_loss=0.377, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 511/6434 [1:12:04<13:23:58,  8.14s/it, gpt_loss=0.377, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 511/6434 [1:12:12<13:23:58,  8.14s/it, gpt_loss=0.379, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 512/6434 [1:12:12<13:07:55,  7.98s/it, gpt_loss=0.379, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 512/6434 [1:12:19<13:07:55,  7.98s/it, gpt_loss=0.395, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 513/6434 [1:12:19<12:33:19,  7.63s/it, gpt_loss=0.395, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 513/6434 [1:12:26<12:33:19,  7.63s/it, gpt_loss=0.388, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 514/6434 [1:12:26<12:37:01,  7.67s/it, gpt_loss=0.388, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 514/6434 [1:12:35<12:37:01,  7.67s/it, gpt_loss=0.411, loss_mean=0.388][A
+Train step of epoch 0:   8%|▊         | 515/6434 [1:12:35<13:03:25,  7.94s/it, gpt_loss=0.411, loss_mean=0.388][A
+Train step of epoch 0:   8%|▊         | 515/6434 [1:12:42<13:03:25,  7.94s/it, gpt_loss=0.351, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 516/6434 [1:12:42<12:48:22,  7.79s/it, gpt_loss=0.351, loss_mean=0.385][A
+Train step of epoch 0:   8%|▊         | 516/6434 [1:12:50<12:48:22,  7.79s/it, gpt_loss=0.408, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 517/6434 [1:12:50<12:40:44,  7.71s/it, gpt_loss=0.408, loss_mean=0.387][A
+Train step of epoch 0:   8%|▊         | 517/6434 [1:12:58<12:40:44,  7.71s/it, gpt_loss=0.355, loss_mean=0.384][A
+Train step of epoch 0:   8%|▊         | 518/6434 [1:12:58<12:53:39,  7.85s/it, gpt_loss=0.355, loss_mean=0.384][A
+Train step of epoch 0:   8%|▊         | 518/6434 [1:13:08<12:53:39,  7.85s/it, gpt_loss=0.332, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 519/6434 [1:13:08<13:51:45,  8.44s/it, gpt_loss=0.332, loss_mean=0.379][A
+[LID Router Debug] Step: 520
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [9, 4, 1, 4, 2, 2, 0, 3, 6, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:   8%|▊         | 519/6434 [1:13:17<13:51:45,  8.44s/it, gpt_loss=0.323, loss_mean=0.373][A
+Train step of epoch 0:   8%|▊         | 520/6434 [1:13:17<14:14:57,  8.67s/it, gpt_loss=0.323, loss_mean=0.373][A
+Train step of epoch 0:   8%|▊         | 520/6434 [1:13:25<14:14:57,  8.67s/it, gpt_loss=0.417, loss_mean=0.377][A
+Train step of epoch 0:   8%|▊         | 521/6434 [1:13:25<14:01:22,  8.54s/it, gpt_loss=0.417, loss_mean=0.377][A
+Train step of epoch 0:   8%|▊         | 521/6434 [1:13:33<14:01:22,  8.54s/it, gpt_loss=0.344, loss_mean=0.374][A
+Train step of epoch 0:   8%|▊         | 522/6434 [1:13:33<13:40:30,  8.33s/it, gpt_loss=0.344, loss_mean=0.374][A
+Train step of epoch 0:   8%|▊         | 522/6434 [1:13:42<13:40:30,  8.33s/it, gpt_loss=0.419, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 523/6434 [1:13:42<13:47:37,  8.40s/it, gpt_loss=0.419, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 523/6434 [1:13:51<13:47:37,  8.40s/it, gpt_loss=0.373, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 524/6434 [1:13:51<14:02:47,  8.56s/it, gpt_loss=0.373, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 524/6434 [1:14:00<14:02:47,  8.56s/it, gpt_loss=0.501, loss_mean=0.39] [A
+Train step of epoch 0:   8%|▊         | 525/6434 [1:14:00<14:26:46,  8.80s/it, gpt_loss=0.501, loss_mean=0.39][A
+Train step of epoch 0:   8%|▊         | 525/6434 [1:14:08<14:26:46,  8.80s/it, gpt_loss=0.292, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 526/6434 [1:14:08<14:05:35,  8.59s/it, gpt_loss=0.292, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 526/6434 [1:14:17<14:05:35,  8.59s/it, gpt_loss=0.36, loss_mean=0.379] [A
+Train step of epoch 0:   8%|▊         | 527/6434 [1:14:17<13:59:01,  8.52s/it, gpt_loss=0.36, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 527/6434 [1:14:24<13:59:01,  8.52s/it, gpt_loss=0.453, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 528/6434 [1:14:24<13:34:37,  8.28s/it, gpt_loss=0.453, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 528/6434 [1:14:32<13:34:37,  8.28s/it, gpt_loss=0.359, loss_mean=0.383][A
+Train step of epoch 0:   8%|▊         | 529/6434 [1:14:32<13:07:30,  8.00s/it, gpt_loss=0.359, loss_mean=0.383][A
+[LID Router Debug] Step: 530
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [6, 9, 5, 1, 1, 4, 2, 5, 1, 2]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:   8%|▊         | 529/6434 [1:14:40<13:07:30,  8.00s/it, gpt_loss=0.466, loss_mean=0.392][A
+Train step of epoch 0:   8%|▊         | 530/6434 [1:14:40<13:08:20,  8.01s/it, gpt_loss=0.466, loss_mean=0.392][A
+Train step of epoch 0:   8%|▊         | 530/6434 [1:14:48<13:08:20,  8.01s/it, gpt_loss=0.334, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 531/6434 [1:14:48<13:17:36,  8.11s/it, gpt_loss=0.334, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 531/6434 [1:14:55<13:17:36,  8.11s/it, gpt_loss=0.322, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 532/6434 [1:14:55<12:37:42,  7.70s/it, gpt_loss=0.322, loss_mean=0.379][A
+Train step of epoch 0:   8%|▊         | 532/6434 [1:15:04<12:37:42,  7.70s/it, gpt_loss=0.369, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 533/6434 [1:15:04<13:30:49,  8.24s/it, gpt_loss=0.369, loss_mean=0.378][A
+Train step of epoch 0:   8%|▊         | 533/6434 [1:15:12<13:30:49,  8.24s/it, gpt_loss=0.279, loss_mean=0.368][A
+Train step of epoch 0:   8%|▊         | 534/6434 [1:15:12<13:14:46,  8.08s/it, gpt_loss=0.279, loss_mean=0.368][A
+Train step of epoch 0:   8%|▊         | 534/6434 [1:15:20<13:14:46,  8.08s/it, gpt_loss=0.39, loss_mean=0.37]  [A
+Train step of epoch 0:   8%|▊         | 535/6434 [1:15:20<13:20:03,  8.14s/it, gpt_loss=0.39, loss_mean=0.37][A
+Train step of epoch 0:   8%|▊         | 535/6434 [1:15:28<13:20:03,  8.14s/it, gpt_loss=0.473, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 536/6434 [1:15:28<13:17:18,  8.11s/it, gpt_loss=0.473, loss_mean=0.381][A
+Train step of epoch 0:   8%|▊         | 536/6434 [1:15:36<13:17:18,  8.11s/it, gpt_loss=0.392, loss_mean=0.382][A
+Train step of epoch 0:   8%|▊         | 537/6434 [1:15:36<13:01:57,  7.96s/it, gpt_loss=0.392, loss_mean=0.382][A
+Train step of epoch 0:   8%|▊         | 537/6434 [1:15:44<13:01:57,  7.96s/it, gpt_loss=0.419, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 538/6434 [1:15:44<13:17:48,  8.12s/it, gpt_loss=0.419, loss_mean=0.386][A
+Train step of epoch 0:   8%|▊         | 538/6434 [1:15:52<13:17:48,  8.12s/it, gpt_loss=0.471, loss_mean=0.394][A
+Train step of epoch 0:   8%|▊         | 539/6434 [1:15:52<13:13:37,  8.08s/it, gpt_loss=0.471, loss_mean=0.394][A
+[LID Router Debug] Step: 540
+Batch Size: 10
+Audio Batch Size: 85
+LID Assignments: [4, 4, 3, 5, 4, 5, 1, 9, 2, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   8%|▊         | 539/6434 [1:16:00<13:13:37,  8.08s/it, gpt_loss=0.5, loss_mean=0.405]  [A
+Train step of epoch 0:   8%|▊         | 540/6434 [1:16:00<13:03:51,  7.98s/it, gpt_loss=0.5, loss_mean=0.405][A
+Train step of epoch 0:   8%|▊         | 540/6434 [1:16:08<13:03:51,  7.98s/it, gpt_loss=0.363, loss_mean=0.401][A
+Train step of epoch 0:   8%|▊         | 541/6434 [1:16:08<12:59:44,  7.94s/it, gpt_loss=0.363, loss_mean=0.401][A
+Train step of epoch 0:   8%|▊         | 541/6434 [1:16:17<12:59:44,  7.94s/it, gpt_loss=0.375, loss_mean=0.398][A
+Train step of epoch 0:   8%|▊         | 542/6434 [1:16:17<13:41:38,  8.37s/it, gpt_loss=0.375, loss_mean=0.398][A
+Train step of epoch 0:   8%|▊         | 542/6434 [1:16:25<13:41:38,  8.37s/it, gpt_loss=0.31, loss_mean=0.389] [A
+Train step of epoch 0:   8%|▊         | 543/6434 [1:16:25<13:17:58,  8.13s/it, gpt_loss=0.31, loss_mean=0.389][A
+Train step of epoch 0:   8%|▊         | 543/6434 [1:16:33<13:17:58,  8.13s/it, gpt_loss=0.377, loss_mean=0.388][A
+Train step of epoch 0:   8%|▊         | 544/6434 [1:16:33<13:20:12,  8.15s/it, gpt_loss=0.377, loss_mean=0.388][A
+Train step of epoch 0:   8%|▊         | 544/6434 [1:16:41<13:20:12,  8.15s/it, gpt_loss=0.307, loss_mean=0.38] [A
+Train step of epoch 0:   8%|▊         | 545/6434 [1:16:41<13:24:52,  8.20s/it, gpt_loss=0.307, loss_mean=0.38][A
+Train step of epoch 0:   8%|▊         | 545/6434 [1:16:51<13:24:52,  8.20s/it, gpt_loss=0.414, loss_mean=0.383][A
+Train step of epoch 0:   8%|▊         | 546/6434 [1:16:51<13:57:16,  8.53s/it, gpt_loss=0.414, loss_mean=0.383][A
+Train step of epoch 0:   8%|▊         | 546/6434 [1:16:58<13:57:16,  8.53s/it, gpt_loss=0.38, loss_mean=0.383] [A
+Train step of epoch 0:   9%|▊         | 547/6434 [1:16:58<13:34:12,  8.30s/it, gpt_loss=0.38, loss_mean=0.383][A
+Train step of epoch 0:   9%|▊         | 547/6434 [1:17:06<13:34:12,  8.30s/it, gpt_loss=0.355, loss_mean=0.38][A
+Train step of epoch 0:   9%|▊         | 548/6434 [1:17:06<13:25:11,  8.21s/it, gpt_loss=0.355, loss_mean=0.38][A
+Train step of epoch 0:   9%|▊         | 548/6434 [1:17:15<13:25:11,  8.21s/it, gpt_loss=0.494, loss_mean=0.392][A
+Train step of epoch 0:   9%|▊         | 549/6434 [1:17:15<13:21:52,  8.18s/it, gpt_loss=0.494, loss_mean=0.392][A
+[LID Router Debug] Step: 550
+Batch Size: 10
+Audio Batch Size: 133
+LID Assignments: [3, 4, 5, 9, 5, 2, 11, 3, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9, 11}
+
+Train step of epoch 0:   9%|▊         | 549/6434 [1:17:23<13:21:52,  8.18s/it, gpt_loss=0.369, loss_mean=0.389][A
+Train step of epoch 0:   9%|▊         | 550/6434 [1:17:23<13:30:09,  8.26s/it, gpt_loss=0.369, loss_mean=0.389][A
+Train step of epoch 0:   9%|▊         | 550/6434 [1:17:31<13:30:09,  8.26s/it, gpt_loss=0.494, loss_mean=0.4]  [A
+Train step of epoch 0:   9%|▊         | 551/6434 [1:17:31<13:23:08,  8.19s/it, gpt_loss=0.494, loss_mean=0.4][A
+Train step of epoch 0:   9%|▊         | 551/6434 [1:17:39<13:23:08,  8.19s/it, gpt_loss=0.368, loss_mean=0.396][A
+Train step of epoch 0:   9%|▊         | 552/6434 [1:17:39<13:12:13,  8.08s/it, gpt_loss=0.368, loss_mean=0.396][A
+Train step of epoch 0:   9%|▊         | 552/6434 [1:17:48<13:12:13,  8.08s/it, gpt_loss=0.386, loss_mean=0.395][A
+Train step of epoch 0:   9%|▊         | 553/6434 [1:17:48<13:42:16,  8.39s/it, gpt_loss=0.386, loss_mean=0.395][A
+Train step of epoch 0:   9%|▊         | 553/6434 [1:17:58<13:42:16,  8.39s/it, gpt_loss=0.435, loss_mean=0.399][A
+Train step of epoch 0:   9%|▊         | 554/6434 [1:17:58<14:24:20,  8.82s/it, gpt_loss=0.435, loss_mean=0.399][A
+Train step of epoch 0:   9%|▊         | 554/6434 [1:18:06<14:24:20,  8.82s/it, gpt_loss=0.399, loss_mean=0.399][A
+Train step of epoch 0:   9%|▊         | 555/6434 [1:18:06<14:11:48,  8.69s/it, gpt_loss=0.399, loss_mean=0.399][A
+Train step of epoch 0:   9%|▊         | 555/6434 [1:18:15<14:11:48,  8.69s/it, gpt_loss=0.44, loss_mean=0.403] [A
+Train step of epoch 0:   9%|▊         | 556/6434 [1:18:15<14:24:34,  8.83s/it, gpt_loss=0.44, loss_mean=0.403][A
+Train step of epoch 0:   9%|▊         | 556/6434 [1:18:24<14:24:34,  8.83s/it, gpt_loss=0.384, loss_mean=0.402][A
+Train step of epoch 0:   9%|▊         | 557/6434 [1:18:24<14:22:20,  8.80s/it, gpt_loss=0.384, loss_mean=0.402][A
+Train step of epoch 0:   9%|▊         | 557/6434 [1:18:32<14:22:20,  8.80s/it, gpt_loss=0.337, loss_mean=0.395][A
+Train step of epoch 0:   9%|▊         | 558/6434 [1:18:32<14:08:14,  8.66s/it, gpt_loss=0.337, loss_mean=0.395][A
+Train step of epoch 0:   9%|▊         | 558/6434 [1:18:41<14:08:14,  8.66s/it, gpt_loss=0.339, loss_mean=0.39] [A
+Train step of epoch 0:   9%|▊         | 559/6434 [1:18:41<14:19:03,  8.77s/it, gpt_loss=0.339, loss_mean=0.39][A
+[LID Router Debug] Step: 560
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [0, 0, 1, 4, 9, 1, 5, 4, 0, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:   9%|▊         | 559/6434 [1:18:50<14:19:03,  8.77s/it, gpt_loss=0.433, loss_mean=0.394][A
+Train step of epoch 0:   9%|▊         | 560/6434 [1:18:50<14:14:50,  8.73s/it, gpt_loss=0.433, loss_mean=0.394][A
+Train step of epoch 0:   9%|▊         | 560/6434 [1:18:57<14:14:50,  8.73s/it, gpt_loss=0.395, loss_mean=0.394][A
+Train step of epoch 0:   9%|▊         | 561/6434 [1:18:57<13:33:24,  8.31s/it, gpt_loss=0.395, loss_mean=0.394][A
+Train step of epoch 0:   9%|▊         | 561/6434 [1:19:05<13:33:24,  8.31s/it, gpt_loss=0.42, loss_mean=0.397] [A
+Train step of epoch 0:   9%|▊         | 562/6434 [1:19:05<13:00:12,  7.97s/it, gpt_loss=0.42, loss_mean=0.397][A
+Train step of epoch 0:   9%|▊         | 562/6434 [1:19:12<13:00:12,  7.97s/it, gpt_loss=0.443, loss_mean=0.401][A
+Train step of epoch 0:   9%|▉         | 563/6434 [1:19:12<12:49:35,  7.87s/it, gpt_loss=0.443, loss_mean=0.401][A
+Train step of epoch 0:   9%|▉         | 563/6434 [1:19:21<12:49:35,  7.87s/it, gpt_loss=0.38, loss_mean=0.399] [A
+Train step of epoch 0:   9%|▉         | 564/6434 [1:19:21<13:12:18,  8.10s/it, gpt_loss=0.38, loss_mean=0.399][A
+Train step of epoch 0:   9%|▉         | 564/6434 [1:19:29<13:12:18,  8.10s/it, gpt_loss=0.389, loss_mean=0.398][A
+Train step of epoch 0:   9%|▉         | 565/6434 [1:19:29<13:28:10,  8.26s/it, gpt_loss=0.389, loss_mean=0.398][A
+Train step of epoch 0:   9%|▉         | 565/6434 [1:19:39<13:28:10,  8.26s/it, gpt_loss=0.407, loss_mean=0.399][A
+Train step of epoch 0:   9%|▉         | 566/6434 [1:19:39<13:53:16,  8.52s/it, gpt_loss=0.407, loss_mean=0.399][A
+Train step of epoch 0:   9%|▉         | 566/6434 [1:19:47<13:53:16,  8.52s/it, gpt_loss=0.281, loss_mean=0.387][A
+Train step of epoch 0:   9%|▉         | 567/6434 [1:19:47<13:46:51,  8.46s/it, gpt_loss=0.281, loss_mean=0.387][A
+Train step of epoch 0:   9%|▉         | 567/6434 [1:19:55<13:46:51,  8.46s/it, gpt_loss=0.383, loss_mean=0.387][A
+Train step of epoch 0:   9%|▉         | 568/6434 [1:19:55<13:43:59,  8.43s/it, gpt_loss=0.383, loss_mean=0.387][A
+Train step of epoch 0:   9%|▉         | 568/6434 [1:20:03<13:43:59,  8.43s/it, gpt_loss=0.368, loss_mean=0.385][A
+Train step of epoch 0:   9%|▉         | 569/6434 [1:20:03<13:35:29,  8.34s/it, gpt_loss=0.368, loss_mean=0.385][A
+[LID Router Debug] Step: 570
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [0, 9, 3, 4, 0, 2, 3, 1, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:   9%|▉         | 569/6434 [1:20:12<13:35:29,  8.34s/it, gpt_loss=0.39, loss_mean=0.385] [A
+Train step of epoch 0:   9%|▉         | 570/6434 [1:20:12<13:35:19,  8.34s/it, gpt_loss=0.39, loss_mean=0.385][A
+Train step of epoch 0:   9%|▉         | 570/6434 [1:20:21<13:35:19,  8.34s/it, gpt_loss=0.372, loss_mean=0.384][A
+Train step of epoch 0:   9%|▉         | 571/6434 [1:20:21<14:02:05,  8.62s/it, gpt_loss=0.372, loss_mean=0.384][A
+Train step of epoch 0:   9%|▉         | 571/6434 [1:20:29<14:02:05,  8.62s/it, gpt_loss=0.375, loss_mean=0.383][A
+Train step of epoch 0:   9%|▉         | 572/6434 [1:20:29<13:49:18,  8.49s/it, gpt_loss=0.375, loss_mean=0.383][A
+Train step of epoch 0:   9%|▉         | 572/6434 [1:20:37<13:49:18,  8.49s/it, gpt_loss=0.329, loss_mean=0.378][A
+Train step of epoch 0:   9%|▉         | 573/6434 [1:20:37<13:34:57,  8.34s/it, gpt_loss=0.329, loss_mean=0.378][A
+Train step of epoch 0:   9%|▉         | 573/6434 [1:20:46<13:34:57,  8.34s/it, gpt_loss=0.396, loss_mean=0.38] [A
+Train step of epoch 0:   9%|▉         | 574/6434 [1:20:46<13:39:15,  8.39s/it, gpt_loss=0.396, loss_mean=0.38][A
+Train step of epoch 0:   9%|▉         | 574/6434 [1:20:54<13:39:15,  8.39s/it, gpt_loss=0.348, loss_mean=0.376][A
+Train step of epoch 0:   9%|▉         | 575/6434 [1:20:54<13:24:56,  8.24s/it, gpt_loss=0.348, loss_mean=0.376][A
+Train step of epoch 0:   9%|▉         | 575/6434 [1:21:02<13:24:56,  8.24s/it, gpt_loss=0.406, loss_mean=0.379][A
+Train step of epoch 0:   9%|▉         | 576/6434 [1:21:02<13:15:14,  8.15s/it, gpt_loss=0.406, loss_mean=0.379][A
+Train step of epoch 0:   9%|▉         | 576/6434 [1:21:10<13:15:14,  8.15s/it, gpt_loss=0.388, loss_mean=0.38] [A
+Train step of epoch 0:   9%|▉         | 577/6434 [1:21:10<13:35:47,  8.36s/it, gpt_loss=0.388, loss_mean=0.38][A
+Train step of epoch 0:   9%|▉         | 577/6434 [1:21:18<13:35:47,  8.36s/it, gpt_loss=0.369, loss_mean=0.379][A
+Train step of epoch 0:   9%|▉         | 578/6434 [1:21:18<13:26:10,  8.26s/it, gpt_loss=0.369, loss_mean=0.379][A
+Train step of epoch 0:   9%|▉         | 578/6434 [1:21:27<13:26:10,  8.26s/it, gpt_loss=0.365, loss_mean=0.378][A
+Train step of epoch 0:   9%|▉         | 579/6434 [1:21:27<13:27:58,  8.28s/it, gpt_loss=0.365, loss_mean=0.378][A
+[LID Router Debug] Step: 580
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [3, 4, 0, 4, 1, 9, 3, 3, 4, 9]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:   9%|▉         | 579/6434 [1:21:35<13:27:58,  8.28s/it, gpt_loss=0.336, loss_mean=0.374][A
+Train step of epoch 0:   9%|▉         | 580/6434 [1:21:35<13:27:48,  8.28s/it, gpt_loss=0.336, loss_mean=0.374][A
+Train step of epoch 0:   9%|▉         | 580/6434 [1:21:44<13:27:48,  8.28s/it, gpt_loss=0.346, loss_mean=0.371][A
+Train step of epoch 0:   9%|▉         | 581/6434 [1:21:44<13:44:36,  8.45s/it, gpt_loss=0.346, loss_mean=0.371][A
+Train step of epoch 0:   9%|▉         | 581/6434 [1:21:54<13:44:36,  8.45s/it, gpt_loss=0.37, loss_mean=0.371] [A
+Train step of epoch 0:   9%|▉         | 582/6434 [1:21:54<14:22:43,  8.85s/it, gpt_loss=0.37, loss_mean=0.371][A
+Train step of epoch 0:   9%|▉         | 582/6434 [1:22:02<14:22:43,  8.85s/it, gpt_loss=0.397, loss_mean=0.373][A
+Train step of epoch 0:   9%|▉         | 583/6434 [1:22:02<14:20:25,  8.82s/it, gpt_loss=0.397, loss_mean=0.373][A
+Train step of epoch 0:   9%|▉         | 583/6434 [1:22:10<14:20:25,  8.82s/it, gpt_loss=0.458, loss_mean=0.382][A
+Train step of epoch 0:   9%|▉         | 584/6434 [1:22:10<13:43:20,  8.44s/it, gpt_loss=0.458, loss_mean=0.382][A
+Train step of epoch 0:   9%|▉         | 584/6434 [1:22:18<13:43:20,  8.44s/it, gpt_loss=0.273, loss_mean=0.371][A
+Train step of epoch 0:   9%|▉         | 585/6434 [1:22:18<13:25:44,  8.27s/it, gpt_loss=0.273, loss_mean=0.371][A
+Train step of epoch 0:   9%|▉         | 585/6434 [1:22:27<13:25:44,  8.27s/it, gpt_loss=0.335, loss_mean=0.367][A
+Train step of epoch 0:   9%|▉         | 586/6434 [1:22:27<13:46:56,  8.48s/it, gpt_loss=0.335, loss_mean=0.367][A
+Train step of epoch 0:   9%|▉         | 586/6434 [1:22:35<13:46:56,  8.48s/it, gpt_loss=0.417, loss_mean=0.372][A
+Train step of epoch 0:   9%|▉         | 587/6434 [1:22:35<13:33:06,  8.34s/it, gpt_loss=0.417, loss_mean=0.372][A
+Train step of epoch 0:   9%|▉         | 587/6434 [1:22:43<13:33:06,  8.34s/it, gpt_loss=0.352, loss_mean=0.37] [A
+Train step of epoch 0:   9%|▉         | 588/6434 [1:22:43<13:21:21,  8.22s/it, gpt_loss=0.352, loss_mean=0.37][A
+Train step of epoch 0:   9%|▉         | 588/6434 [1:22:50<13:21:21,  8.22s/it, gpt_loss=0.364, loss_mean=0.37][A
+Train step of epoch 0:   9%|▉         | 589/6434 [1:22:50<13:06:32,  8.07s/it, gpt_loss=0.364, loss_mean=0.37][A
+[LID Router Debug] Step: 590
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [3, 9, 5, 6, 5, 6, 3, 5, 3, 5]
+Active Experts in Batch: {9, 3, 5, 6}
+
+Train step of epoch 0:   9%|▉         | 589/6434 [1:22:59<13:06:32,  8.07s/it, gpt_loss=0.481, loss_mean=0.381][A
+Train step of epoch 0:   9%|▉         | 590/6434 [1:22:59<13:12:53,  8.14s/it, gpt_loss=0.481, loss_mean=0.381][A
+Train step of epoch 0:   9%|▉         | 590/6434 [1:23:07<13:12:53,  8.14s/it, gpt_loss=0.396, loss_mean=0.382][A
+Train step of epoch 0:   9%|▉         | 591/6434 [1:23:07<13:24:16,  8.26s/it, gpt_loss=0.396, loss_mean=0.382][A
+Train step of epoch 0:   9%|▉         | 591/6434 [1:23:15<13:24:16,  8.26s/it, gpt_loss=0.386, loss_mean=0.383][A
+Train step of epoch 0:   9%|▉         | 592/6434 [1:23:15<13:00:18,  8.01s/it, gpt_loss=0.386, loss_mean=0.383][A
+Train step of epoch 0:   9%|▉         | 592/6434 [1:23:23<13:00:18,  8.01s/it, gpt_loss=0.419, loss_mean=0.386][A
+Train step of epoch 0:   9%|▉         | 593/6434 [1:23:23<13:01:36,  8.03s/it, gpt_loss=0.419, loss_mean=0.386][A
+Train step of epoch 0:   9%|▉         | 593/6434 [1:23:32<13:01:36,  8.03s/it, gpt_loss=0.472, loss_mean=0.395][A
+Train step of epoch 0:   9%|▉         | 594/6434 [1:23:32<13:47:06,  8.50s/it, gpt_loss=0.472, loss_mean=0.395][A
+Train step of epoch 0:   9%|▉         | 594/6434 [1:23:41<13:47:06,  8.50s/it, gpt_loss=0.296, loss_mean=0.385][A
+Train step of epoch 0:   9%|▉         | 595/6434 [1:23:41<13:48:32,  8.51s/it, gpt_loss=0.296, loss_mean=0.385][A
+Train step of epoch 0:   9%|▉         | 595/6434 [1:23:49<13:48:32,  8.51s/it, gpt_loss=0.293, loss_mean=0.376][A
+Train step of epoch 0:   9%|▉         | 596/6434 [1:23:49<13:21:12,  8.23s/it, gpt_loss=0.293, loss_mean=0.376][A
+Train step of epoch 0:   9%|▉         | 596/6434 [1:23:57<13:21:12,  8.23s/it, gpt_loss=0.348, loss_mean=0.373][A
+Train step of epoch 0:   9%|▉         | 597/6434 [1:23:57<13:34:07,  8.37s/it, gpt_loss=0.348, loss_mean=0.373][A
+Train step of epoch 0:   9%|▉         | 597/6434 [1:24:05<13:34:07,  8.37s/it, gpt_loss=0.281, loss_mean=0.364][A
+Train step of epoch 0:   9%|▉         | 598/6434 [1:24:05<13:30:29,  8.33s/it, gpt_loss=0.281, loss_mean=0.364][A
+Train step of epoch 0:   9%|▉         | 598/6434 [1:24:13<13:30:29,  8.33s/it, gpt_loss=0.342, loss_mean=0.362][A
+Train step of epoch 0:   9%|▉         | 599/6434 [1:24:13<13:06:31,  8.09s/it, gpt_loss=0.342, loss_mean=0.362][A
+[LID Router Debug] Step: 600
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [0, 9, 2, 5, 3, 4, 4, 10, 3, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9, 10}
+[2026-02-06 17:20:26,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[1.9977729493902112e-05, 1.9977729493902112e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 17:20:26,463] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=300, RunningAvgSamplesPerSec=4.761299026587927, CurrSamplesPerSec=4.856945585945292, MemAllocated=12.61GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:   9%|▉         | 599/6434 [1:24:22<13:06:31,  8.09s/it, gpt_loss=0.34, loss_mean=0.36]  [A
+Train step of epoch 0:   9%|▉         | 600/6434 [1:24:22<13:32:35,  8.36s/it, gpt_loss=0.34, loss_mean=0.36][A
+Train step of epoch 0:   9%|▉         | 600/6434 [1:24:31<13:32:35,  8.36s/it, gpt_loss=0.374, loss_mean=0.361][A
+Train step of epoch 0:   9%|▉         | 601/6434 [1:24:31<13:43:02,  8.47s/it, gpt_loss=0.374, loss_mean=0.361][A
+Train step of epoch 0:   9%|▉         | 601/6434 [1:24:39<13:43:02,  8.47s/it, gpt_loss=0.296, loss_mean=0.354][A
+Train step of epoch 0:   9%|▉         | 602/6434 [1:24:39<13:31:34,  8.35s/it, gpt_loss=0.296, loss_mean=0.354][A
+Train step of epoch 0:   9%|▉         | 602/6434 [1:24:48<13:31:34,  8.35s/it, gpt_loss=0.42, loss_mean=0.361] [A
+Train step of epoch 0:   9%|▉         | 603/6434 [1:24:48<14:01:02,  8.65s/it, gpt_loss=0.42, loss_mean=0.361][A
+Train step of epoch 0:   9%|▉         | 603/6434 [1:24:56<14:01:02,  8.65s/it, gpt_loss=0.417, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 604/6434 [1:24:56<13:34:59,  8.39s/it, gpt_loss=0.417, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 604/6434 [1:25:04<13:34:59,  8.39s/it, gpt_loss=0.364, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 605/6434 [1:25:04<13:17:54,  8.21s/it, gpt_loss=0.364, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 605/6434 [1:25:12<13:17:54,  8.21s/it, gpt_loss=0.344, loss_mean=0.364][A
+Train step of epoch 0:   9%|▉         | 606/6434 [1:25:12<13:28:55,  8.33s/it, gpt_loss=0.344, loss_mean=0.364][A
+Train step of epoch 0:   9%|▉         | 606/6434 [1:25:21<13:28:55,  8.33s/it, gpt_loss=0.388, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 607/6434 [1:25:21<13:34:25,  8.39s/it, gpt_loss=0.388, loss_mean=0.366][A
+Train step of epoch 0:   9%|▉         | 607/6434 [1:25:30<13:34:25,  8.39s/it, gpt_loss=0.314, loss_mean=0.361][A
+Train step of epoch 0:   9%|▉         | 608/6434 [1:25:30<13:48:38,  8.53s/it, gpt_loss=0.314, loss_mean=0.361][A
+Train step of epoch 0:   9%|▉         | 608/6434 [1:25:39<13:48:38,  8.53s/it, gpt_loss=0.386, loss_mean=0.364][A
+Train step of epoch 0:   9%|▉         | 609/6434 [1:25:39<14:00:06,  8.65s/it, gpt_loss=0.386, loss_mean=0.364][A
+[LID Router Debug] Step: 610
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [4, 2, 9, 5, 1, 3, 5, 0, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:   9%|▉         | 609/6434 [1:25:47<14:00:06,  8.65s/it, gpt_loss=0.328, loss_mean=0.36] [A
+Train step of epoch 0:   9%|▉         | 610/6434 [1:25:47<13:40:10,  8.45s/it, gpt_loss=0.328, loss_mean=0.36][A
+Train step of epoch 0:   9%|▉         | 610/6434 [1:25:56<13:40:10,  8.45s/it, gpt_loss=0.425, loss_mean=0.367][A
+Train step of epoch 0:   9%|▉         | 611/6434 [1:25:56<13:53:19,  8.59s/it, gpt_loss=0.425, loss_mean=0.367][A
+Train step of epoch 0:   9%|▉         | 611/6434 [1:26:04<13:53:19,  8.59s/it, gpt_loss=0.329, loss_mean=0.363][A
+Train step of epoch 0:  10%|▉         | 612/6434 [1:26:04<13:40:57,  8.46s/it, gpt_loss=0.329, loss_mean=0.363][A
+Train step of epoch 0:  10%|▉         | 612/6434 [1:26:12<13:40:57,  8.46s/it, gpt_loss=0.359, loss_mean=0.362][A
+Train step of epoch 0:  10%|▉         | 613/6434 [1:26:12<13:45:14,  8.51s/it, gpt_loss=0.359, loss_mean=0.362][A
+Train step of epoch 0:  10%|▉         | 613/6434 [1:26:21<13:45:14,  8.51s/it, gpt_loss=0.437, loss_mean=0.37] [A
+Train step of epoch 0:  10%|▉         | 614/6434 [1:26:21<14:03:48,  8.70s/it, gpt_loss=0.437, loss_mean=0.37][A
+Train step of epoch 0:  10%|▉         | 614/6434 [1:26:29<14:03:48,  8.70s/it, gpt_loss=0.394, loss_mean=0.372][A
+Train step of epoch 0:  10%|▉         | 615/6434 [1:26:29<13:36:47,  8.42s/it, gpt_loss=0.394, loss_mean=0.372][A
+Train step of epoch 0:  10%|▉         | 615/6434 [1:26:38<13:36:47,  8.42s/it, gpt_loss=0.256, loss_mean=0.361][A
+Train step of epoch 0:  10%|▉         | 616/6434 [1:26:38<13:48:56,  8.55s/it, gpt_loss=0.256, loss_mean=0.361][A
+Train step of epoch 0:  10%|▉         | 616/6434 [1:26:47<13:48:56,  8.55s/it, gpt_loss=0.422, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 617/6434 [1:26:47<13:50:16,  8.56s/it, gpt_loss=0.422, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 617/6434 [1:26:54<13:50:16,  8.56s/it, gpt_loss=0.456, loss_mean=0.376][A
+Train step of epoch 0:  10%|▉         | 618/6434 [1:26:54<13:22:12,  8.28s/it, gpt_loss=0.456, loss_mean=0.376][A
+Train step of epoch 0:  10%|▉         | 618/6434 [1:27:03<13:22:12,  8.28s/it, gpt_loss=0.353, loss_mean=0.373][A
+Train step of epoch 0:  10%|▉         | 619/6434 [1:27:03<13:45:12,  8.51s/it, gpt_loss=0.353, loss_mean=0.373][A
+[LID Router Debug] Step: 620
+Batch Size: 10
+Audio Batch Size: 73
+LID Assignments: [4, 0, 4, 4, 0, 6, 2, 0, 1, 4]
+Active Experts in Batch: {0, 1, 2, 4, 6}
+
+Train step of epoch 0:  10%|▉         | 619/6434 [1:27:12<13:45:12,  8.51s/it, gpt_loss=0.325, loss_mean=0.369][A
+Train step of epoch 0:  10%|▉         | 620/6434 [1:27:12<13:57:10,  8.64s/it, gpt_loss=0.325, loss_mean=0.369][A
+Train step of epoch 0:  10%|▉         | 620/6434 [1:27:21<13:57:10,  8.64s/it, gpt_loss=0.384, loss_mean=0.37] [A
+Train step of epoch 0:  10%|▉         | 621/6434 [1:27:21<14:10:43,  8.78s/it, gpt_loss=0.384, loss_mean=0.37][A
+Train step of epoch 0:  10%|▉         | 621/6434 [1:27:29<14:10:43,  8.78s/it, gpt_loss=0.413, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 622/6434 [1:27:29<13:48:51,  8.56s/it, gpt_loss=0.413, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 622/6434 [1:27:38<13:48:51,  8.56s/it, gpt_loss=0.367, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 623/6434 [1:27:38<13:35:26,  8.42s/it, gpt_loss=0.367, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 623/6434 [1:27:45<13:35:26,  8.42s/it, gpt_loss=0.365, loss_mean=0.373][A
+Train step of epoch 0:  10%|▉         | 624/6434 [1:27:45<13:21:57,  8.28s/it, gpt_loss=0.365, loss_mean=0.373][A
+Train step of epoch 0:  10%|▉         | 624/6434 [1:27:55<13:21:57,  8.28s/it, gpt_loss=0.39, loss_mean=0.375] [A
+Train step of epoch 0:  10%|▉         | 625/6434 [1:27:55<14:01:58,  8.70s/it, gpt_loss=0.39, loss_mean=0.375][A
+Train step of epoch 0:  10%|▉         | 625/6434 [1:28:03<14:01:58,  8.70s/it, gpt_loss=0.314, loss_mean=0.368][A
+Train step of epoch 0:  10%|▉         | 626/6434 [1:28:03<13:50:36,  8.58s/it, gpt_loss=0.314, loss_mean=0.368][A
+Train step of epoch 0:  10%|▉         | 626/6434 [1:28:12<13:50:36,  8.58s/it, gpt_loss=0.344, loss_mean=0.366][A
+Train step of epoch 0:  10%|▉         | 627/6434 [1:28:12<13:51:52,  8.60s/it, gpt_loss=0.344, loss_mean=0.366][A
+Train step of epoch 0:  10%|▉         | 627/6434 [1:28:20<13:51:52,  8.60s/it, gpt_loss=0.377, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 628/6434 [1:28:20<13:38:16,  8.46s/it, gpt_loss=0.377, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 628/6434 [1:28:28<13:38:16,  8.46s/it, gpt_loss=0.397, loss_mean=0.37] [A
+Train step of epoch 0:  10%|▉         | 629/6434 [1:28:28<13:32:26,  8.40s/it, gpt_loss=0.397, loss_mean=0.37][A
+[LID Router Debug] Step: 630
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [4, 9, 9, 5, 6, 9, 0, 4, 2, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  10%|▉         | 629/6434 [1:28:36<13:32:26,  8.40s/it, gpt_loss=0.411, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 630/6434 [1:28:36<13:04:26,  8.11s/it, gpt_loss=0.411, loss_mean=0.374][A
+Train step of epoch 0:  10%|▉         | 630/6434 [1:28:44<13:04:26,  8.11s/it, gpt_loss=0.266, loss_mean=0.363][A
+Train step of epoch 0:  10%|▉         | 631/6434 [1:28:44<13:08:03,  8.15s/it, gpt_loss=0.266, loss_mean=0.363][A
+Train step of epoch 0:  10%|▉         | 631/6434 [1:28:51<13:08:03,  8.15s/it, gpt_loss=0.42, loss_mean=0.369] [A
+Train step of epoch 0:  10%|▉         | 632/6434 [1:28:51<12:33:01,  7.79s/it, gpt_loss=0.42, loss_mean=0.369][A
+Train step of epoch 0:  10%|▉         | 632/6434 [1:28:59<12:33:01,  7.79s/it, gpt_loss=0.461, loss_mean=0.378][A
+Train step of epoch 0:  10%|▉         | 633/6434 [1:28:59<12:45:20,  7.92s/it, gpt_loss=0.461, loss_mean=0.378][A
+Train step of epoch 0:  10%|▉         | 633/6434 [1:29:07<12:45:20,  7.92s/it, gpt_loss=0.363, loss_mean=0.377][A
+Train step of epoch 0:  10%|▉         | 634/6434 [1:29:07<12:46:37,  7.93s/it, gpt_loss=0.363, loss_mean=0.377][A
+Train step of epoch 0:  10%|▉         | 634/6434 [1:29:15<12:46:37,  7.93s/it, gpt_loss=0.283, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 635/6434 [1:29:15<12:26:01,  7.72s/it, gpt_loss=0.283, loss_mean=0.367][A
+Train step of epoch 0:  10%|▉         | 635/6434 [1:29:24<12:26:01,  7.72s/it, gpt_loss=0.312, loss_mean=0.362][A
+Train step of epoch 0:  10%|▉         | 636/6434 [1:29:24<13:09:17,  8.17s/it, gpt_loss=0.312, loss_mean=0.362][A
+Train step of epoch 0:  10%|▉         | 636/6434 [1:29:31<13:09:17,  8.17s/it, gpt_loss=0.313, loss_mean=0.357][A
+Train step of epoch 0:  10%|▉         | 637/6434 [1:29:31<12:53:39,  8.01s/it, gpt_loss=0.313, loss_mean=0.357][A
+Train step of epoch 0:  10%|▉         | 637/6434 [1:29:41<12:53:39,  8.01s/it, gpt_loss=0.353, loss_mean=0.357][A
+Train step of epoch 0:  10%|▉         | 638/6434 [1:29:41<13:44:11,  8.53s/it, gpt_loss=0.353, loss_mean=0.357][A
+Train step of epoch 0:  10%|▉         | 638/6434 [1:29:49<13:44:11,  8.53s/it, gpt_loss=0.398, loss_mean=0.361][A
+Train step of epoch 0:  10%|▉         | 639/6434 [1:29:49<13:22:38,  8.31s/it, gpt_loss=0.398, loss_mean=0.361][A
+[LID Router Debug] Step: 640
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 1, 5, 9, 0, 3, 4, 3, 4, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  10%|▉         | 639/6434 [1:29:58<13:22:38,  8.31s/it, gpt_loss=0.351, loss_mean=0.36] [A
+Train step of epoch 0:  10%|▉         | 640/6434 [1:29:58<13:39:27,  8.49s/it, gpt_loss=0.351, loss_mean=0.36][A
+Train step of epoch 0:  10%|▉         | 640/6434 [1:30:06<13:39:27,  8.49s/it, gpt_loss=0.341, loss_mean=0.358][A
+Train step of epoch 0:  10%|▉         | 641/6434 [1:30:06<13:42:02,  8.51s/it, gpt_loss=0.341, loss_mean=0.358][A
+Train step of epoch 0:  10%|▉         | 641/6434 [1:30:15<13:42:02,  8.51s/it, gpt_loss=0.33, loss_mean=0.355] [A
+Train step of epoch 0:  10%|▉         | 642/6434 [1:30:15<13:58:40,  8.69s/it, gpt_loss=0.33, loss_mean=0.355][A
+Train step of epoch 0:  10%|▉         | 642/6434 [1:30:23<13:58:40,  8.69s/it, gpt_loss=0.336, loss_mean=0.353][A
+Train step of epoch 0:  10%|▉         | 643/6434 [1:30:23<13:23:24,  8.32s/it, gpt_loss=0.336, loss_mean=0.353][A
+Train step of epoch 0:  10%|▉         | 643/6434 [1:30:31<13:23:24,  8.32s/it, gpt_loss=0.367, loss_mean=0.355][A
+Train step of epoch 0:  10%|█         | 644/6434 [1:30:31<13:05:41,  8.14s/it, gpt_loss=0.367, loss_mean=0.355][A
+Train step of epoch 0:  10%|█         | 644/6434 [1:30:39<13:05:41,  8.14s/it, gpt_loss=0.312, loss_mean=0.35] [A
+Train step of epoch 0:  10%|█         | 645/6434 [1:30:39<13:23:29,  8.33s/it, gpt_loss=0.312, loss_mean=0.35][A
+Train step of epoch 0:  10%|█         | 645/6434 [1:30:48<13:23:29,  8.33s/it, gpt_loss=0.408, loss_mean=0.356][A
+Train step of epoch 0:  10%|█         | 646/6434 [1:30:48<13:22:30,  8.32s/it, gpt_loss=0.408, loss_mean=0.356][A
+Train step of epoch 0:  10%|█         | 646/6434 [1:30:55<13:22:30,  8.32s/it, gpt_loss=0.341, loss_mean=0.354][A
+Train step of epoch 0:  10%|█         | 647/6434 [1:30:55<12:57:12,  8.06s/it, gpt_loss=0.341, loss_mean=0.354][A
+Train step of epoch 0:  10%|█         | 647/6434 [1:31:04<12:57:12,  8.06s/it, gpt_loss=0.407, loss_mean=0.36] [A
+Train step of epoch 0:  10%|█         | 648/6434 [1:31:04<13:13:53,  8.23s/it, gpt_loss=0.407, loss_mean=0.36][A
+Train step of epoch 0:  10%|█         | 648/6434 [1:31:12<13:13:53,  8.23s/it, gpt_loss=0.435, loss_mean=0.367][A
+Train step of epoch 0:  10%|█         | 649/6434 [1:31:12<13:02:14,  8.11s/it, gpt_loss=0.435, loss_mean=0.367][A
+[LID Router Debug] Step: 650
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [1, 0, 4, 1, 2, 1, 0, 3, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4}
+
+Train step of epoch 0:  10%|█         | 649/6434 [1:31:19<13:02:14,  8.11s/it, gpt_loss=0.28, loss_mean=0.359] [A
+Train step of epoch 0:  10%|█         | 650/6434 [1:31:19<12:43:52,  7.92s/it, gpt_loss=0.28, loss_mean=0.359][A
+Train step of epoch 0:  10%|█         | 650/6434 [1:31:27<12:43:52,  7.92s/it, gpt_loss=0.294, loss_mean=0.352][A
+Train step of epoch 0:  10%|█         | 651/6434 [1:31:27<12:27:35,  7.76s/it, gpt_loss=0.294, loss_mean=0.352][A
+Train step of epoch 0:  10%|█         | 651/6434 [1:31:35<12:27:35,  7.76s/it, gpt_loss=0.378, loss_mean=0.355][A
+Train step of epoch 0:  10%|█         | 652/6434 [1:31:35<12:36:51,  7.85s/it, gpt_loss=0.378, loss_mean=0.355][A
+Train step of epoch 0:  10%|█         | 652/6434 [1:31:43<12:36:51,  7.85s/it, gpt_loss=0.419, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 653/6434 [1:31:43<12:46:09,  7.95s/it, gpt_loss=0.419, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 653/6434 [1:31:51<12:46:09,  7.95s/it, gpt_loss=0.399, loss_mean=0.365][A
+Train step of epoch 0:  10%|█         | 654/6434 [1:31:51<12:43:14,  7.92s/it, gpt_loss=0.399, loss_mean=0.365][A
+Train step of epoch 0:  10%|█         | 654/6434 [1:31:58<12:43:14,  7.92s/it, gpt_loss=0.308, loss_mean=0.359][A
+Train step of epoch 0:  10%|█         | 655/6434 [1:31:58<12:27:39,  7.76s/it, gpt_loss=0.308, loss_mean=0.359][A
+Train step of epoch 0:  10%|█         | 655/6434 [1:32:06<12:27:39,  7.76s/it, gpt_loss=0.382, loss_mean=0.362][A
+Train step of epoch 0:  10%|█         | 656/6434 [1:32:06<12:24:39,  7.73s/it, gpt_loss=0.382, loss_mean=0.362][A
+Train step of epoch 0:  10%|█         | 656/6434 [1:32:13<12:24:39,  7.73s/it, gpt_loss=0.354, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 657/6434 [1:32:13<12:23:17,  7.72s/it, gpt_loss=0.354, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 657/6434 [1:32:21<12:23:17,  7.72s/it, gpt_loss=0.418, loss_mean=0.367][A
+Train step of epoch 0:  10%|█         | 658/6434 [1:32:21<12:16:45,  7.65s/it, gpt_loss=0.418, loss_mean=0.367][A
+Train step of epoch 0:  10%|█         | 658/6434 [1:32:29<12:16:45,  7.65s/it, gpt_loss=0.296, loss_mean=0.36] [A
+Train step of epoch 0:  10%|█         | 659/6434 [1:32:29<12:36:35,  7.86s/it, gpt_loss=0.296, loss_mean=0.36][A
+[LID Router Debug] Step: 660
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [3, 6, 2, 0, 2, 5, 3, 9, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  10%|█         | 659/6434 [1:32:37<12:36:35,  7.86s/it, gpt_loss=0.351, loss_mean=0.359][A
+Train step of epoch 0:  10%|█         | 660/6434 [1:32:37<12:43:25,  7.93s/it, gpt_loss=0.351, loss_mean=0.359][A
+Train step of epoch 0:  10%|█         | 660/6434 [1:32:47<12:43:25,  7.93s/it, gpt_loss=0.306, loss_mean=0.353][A
+Train step of epoch 0:  10%|█         | 661/6434 [1:32:47<13:22:13,  8.34s/it, gpt_loss=0.306, loss_mean=0.353][A
+Train step of epoch 0:  10%|█         | 661/6434 [1:32:55<13:22:13,  8.34s/it, gpt_loss=0.283, loss_mean=0.346][A
+Train step of epoch 0:  10%|█         | 662/6434 [1:32:55<13:14:45,  8.26s/it, gpt_loss=0.283, loss_mean=0.346][A
+Train step of epoch 0:  10%|█         | 662/6434 [1:33:03<13:14:45,  8.26s/it, gpt_loss=0.322, loss_mean=0.344][A
+Train step of epoch 0:  10%|█         | 663/6434 [1:33:03<13:09:29,  8.21s/it, gpt_loss=0.322, loss_mean=0.344][A
+Train step of epoch 0:  10%|█         | 663/6434 [1:33:11<13:09:29,  8.21s/it, gpt_loss=0.354, loss_mean=0.345][A
+Train step of epoch 0:  10%|█         | 664/6434 [1:33:11<13:20:48,  8.33s/it, gpt_loss=0.354, loss_mean=0.345][A
+Train step of epoch 0:  10%|█         | 664/6434 [1:33:20<13:20:48,  8.33s/it, gpt_loss=0.294, loss_mean=0.34] [A
+Train step of epoch 0:  10%|█         | 665/6434 [1:33:20<13:30:15,  8.43s/it, gpt_loss=0.294, loss_mean=0.34][A
+Train step of epoch 0:  10%|█         | 665/6434 [1:33:27<13:30:15,  8.43s/it, gpt_loss=0.332, loss_mean=0.339][A
+Train step of epoch 0:  10%|█         | 666/6434 [1:33:27<12:56:04,  8.07s/it, gpt_loss=0.332, loss_mean=0.339][A
+Train step of epoch 0:  10%|█         | 666/6434 [1:33:35<12:56:04,  8.07s/it, gpt_loss=0.384, loss_mean=0.344][A
+Train step of epoch 0:  10%|█         | 667/6434 [1:33:35<12:37:59,  7.89s/it, gpt_loss=0.384, loss_mean=0.344][A
+Train step of epoch 0:  10%|█         | 667/6434 [1:33:43<12:37:59,  7.89s/it, gpt_loss=0.423, loss_mean=0.352][A
+Train step of epoch 0:  10%|█         | 668/6434 [1:33:43<12:50:50,  8.02s/it, gpt_loss=0.423, loss_mean=0.352][A
+Train step of epoch 0:  10%|█         | 668/6434 [1:33:52<12:50:50,  8.02s/it, gpt_loss=0.405, loss_mean=0.357][A
+Train step of epoch 0:  10%|█         | 669/6434 [1:33:52<13:21:15,  8.34s/it, gpt_loss=0.405, loss_mean=0.357][A
+[LID Router Debug] Step: 670
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [2, 1, 4, 1, 2, 0, 5, 0, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  10%|█         | 669/6434 [1:34:00<13:21:15,  8.34s/it, gpt_loss=0.315, loss_mean=0.353][A
+Train step of epoch 0:  10%|█         | 670/6434 [1:34:00<13:16:54,  8.30s/it, gpt_loss=0.315, loss_mean=0.353][A
+Train step of epoch 0:  10%|█         | 670/6434 [1:34:09<13:16:54,  8.30s/it, gpt_loss=0.294, loss_mean=0.347][A
+Train step of epoch 0:  10%|█         | 671/6434 [1:34:09<13:22:31,  8.36s/it, gpt_loss=0.294, loss_mean=0.347][A
+Train step of epoch 0:  10%|█         | 671/6434 [1:34:18<13:22:31,  8.36s/it, gpt_loss=0.483, loss_mean=0.36] [A
+Train step of epoch 0:  10%|█         | 672/6434 [1:34:18<13:44:43,  8.59s/it, gpt_loss=0.483, loss_mean=0.36][A
+Train step of epoch 0:  10%|█         | 672/6434 [1:34:26<13:44:43,  8.59s/it, gpt_loss=0.429, loss_mean=0.367][A
+Train step of epoch 0:  10%|█         | 673/6434 [1:34:26<13:39:02,  8.53s/it, gpt_loss=0.429, loss_mean=0.367][A
+Train step of epoch 0:  10%|█         | 673/6434 [1:34:34<13:39:02,  8.53s/it, gpt_loss=0.295, loss_mean=0.36] [A
+Train step of epoch 0:  10%|█         | 674/6434 [1:34:34<13:10:00,  8.23s/it, gpt_loss=0.295, loss_mean=0.36][A
+Train step of epoch 0:  10%|█         | 674/6434 [1:34:43<13:10:00,  8.23s/it, gpt_loss=0.369, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 675/6434 [1:34:43<13:40:42,  8.55s/it, gpt_loss=0.369, loss_mean=0.361][A
+Train step of epoch 0:  10%|█         | 675/6434 [1:34:52<13:40:42,  8.55s/it, gpt_loss=0.354, loss_mean=0.36] [A
+Train step of epoch 0:  11%|█         | 676/6434 [1:34:52<13:53:56,  8.69s/it, gpt_loss=0.354, loss_mean=0.36][A
+Train step of epoch 0:  11%|█         | 676/6434 [1:35:02<13:53:56,  8.69s/it, gpt_loss=0.384, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 677/6434 [1:35:02<14:26:20,  9.03s/it, gpt_loss=0.384, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 677/6434 [1:35:11<14:26:20,  9.03s/it, gpt_loss=0.381, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 678/6434 [1:35:11<14:23:43,  9.00s/it, gpt_loss=0.381, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 678/6434 [1:35:19<14:23:43,  9.00s/it, gpt_loss=0.334, loss_mean=0.361][A
+Train step of epoch 0:  11%|█         | 679/6434 [1:35:19<14:04:45,  8.81s/it, gpt_loss=0.334, loss_mean=0.361][A
+[LID Router Debug] Step: 680
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [9, 2, 4, 3, 7, 3, 0, 0, 0, 0]
+Active Experts in Batch: {0, 2, 3, 4, 7, 9}
+
+Train step of epoch 0:  11%|█         | 679/6434 [1:35:28<14:04:45,  8.81s/it, gpt_loss=0.433, loss_mean=0.369][A
+Train step of epoch 0:  11%|█         | 680/6434 [1:35:28<13:58:18,  8.74s/it, gpt_loss=0.433, loss_mean=0.369][A
+Train step of epoch 0:  11%|█         | 680/6434 [1:35:36<13:58:18,  8.74s/it, gpt_loss=0.389, loss_mean=0.371][A
+Train step of epoch 0:  11%|█         | 681/6434 [1:35:36<13:40:32,  8.56s/it, gpt_loss=0.389, loss_mean=0.371][A
+Train step of epoch 0:  11%|█         | 681/6434 [1:35:44<13:40:32,  8.56s/it, gpt_loss=0.31, loss_mean=0.365] [A
+Train step of epoch 0:  11%|█         | 682/6434 [1:35:44<13:09:22,  8.23s/it, gpt_loss=0.31, loss_mean=0.365][A
+Train step of epoch 0:  11%|█         | 682/6434 [1:35:52<13:09:22,  8.23s/it, gpt_loss=0.295, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 683/6434 [1:35:52<13:15:43,  8.30s/it, gpt_loss=0.295, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 683/6434 [1:36:00<13:15:43,  8.30s/it, gpt_loss=0.307, loss_mean=0.353][A
+Train step of epoch 0:  11%|█         | 684/6434 [1:36:00<13:05:20,  8.19s/it, gpt_loss=0.307, loss_mean=0.353][A
+Train step of epoch 0:  11%|█         | 684/6434 [1:36:07<13:05:20,  8.19s/it, gpt_loss=0.404, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 685/6434 [1:36:07<12:36:27,  7.89s/it, gpt_loss=0.404, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 685/6434 [1:36:16<12:36:27,  7.89s/it, gpt_loss=0.288, loss_mean=0.351][A
+Train step of epoch 0:  11%|█         | 686/6434 [1:36:16<12:55:49,  8.10s/it, gpt_loss=0.288, loss_mean=0.351][A
+Train step of epoch 0:  11%|█         | 686/6434 [1:36:25<12:55:49,  8.10s/it, gpt_loss=0.33, loss_mean=0.349] [A
+Train step of epoch 0:  11%|█         | 687/6434 [1:36:25<13:26:16,  8.42s/it, gpt_loss=0.33, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 687/6434 [1:36:32<13:26:16,  8.42s/it, gpt_loss=0.352, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 688/6434 [1:36:32<12:56:27,  8.11s/it, gpt_loss=0.352, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 688/6434 [1:36:42<12:56:27,  8.11s/it, gpt_loss=0.347, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 689/6434 [1:36:42<13:38:01,  8.54s/it, gpt_loss=0.347, loss_mean=0.349][A
+[LID Router Debug] Step: 690
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [9, 9, 1, 4, 4, 6, 6, 3, 1, 5]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  11%|█         | 689/6434 [1:36:50<13:38:01,  8.54s/it, gpt_loss=0.355, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 690/6434 [1:36:50<13:23:20,  8.39s/it, gpt_loss=0.355, loss_mean=0.349][A
+Train step of epoch 0:  11%|█         | 690/6434 [1:36:59<13:23:20,  8.39s/it, gpt_loss=0.391, loss_mean=0.354][A
+Train step of epoch 0:  11%|█         | 691/6434 [1:36:59<13:37:45,  8.54s/it, gpt_loss=0.391, loss_mean=0.354][A
+Train step of epoch 0:  11%|█         | 691/6434 [1:37:08<13:37:45,  8.54s/it, gpt_loss=0.318, loss_mean=0.35] [A
+Train step of epoch 0:  11%|█         | 692/6434 [1:37:08<14:01:16,  8.79s/it, gpt_loss=0.318, loss_mean=0.35][A
+Train step of epoch 0:  11%|█         | 692/6434 [1:37:16<14:01:16,  8.79s/it, gpt_loss=0.364, loss_mean=0.351][A
+Train step of epoch 0:  11%|█         | 693/6434 [1:37:16<13:47:40,  8.65s/it, gpt_loss=0.364, loss_mean=0.351][A
+Train step of epoch 0:  11%|█         | 693/6434 [1:37:25<13:47:40,  8.65s/it, gpt_loss=0.463, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 694/6434 [1:37:25<13:33:57,  8.51s/it, gpt_loss=0.463, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 694/6434 [1:37:34<13:33:57,  8.51s/it, gpt_loss=0.354, loss_mean=0.362][A
+Train step of epoch 0:  11%|█         | 695/6434 [1:37:34<13:50:24,  8.68s/it, gpt_loss=0.354, loss_mean=0.362][A
+Train step of epoch 0:  11%|█         | 695/6434 [1:37:42<13:50:24,  8.68s/it, gpt_loss=0.384, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 696/6434 [1:37:42<13:40:40,  8.58s/it, gpt_loss=0.384, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 696/6434 [1:37:50<13:40:40,  8.58s/it, gpt_loss=0.284, loss_mean=0.356][A
+Train step of epoch 0:  11%|█         | 697/6434 [1:37:50<13:22:25,  8.39s/it, gpt_loss=0.284, loss_mean=0.356][A
+Train step of epoch 0:  11%|█         | 697/6434 [1:37:59<13:22:25,  8.39s/it, gpt_loss=0.354, loss_mean=0.356][A
+Train step of epoch 0:  11%|█         | 698/6434 [1:37:59<13:53:40,  8.72s/it, gpt_loss=0.354, loss_mean=0.356][A
+Train step of epoch 0:  11%|█         | 698/6434 [1:38:07<13:53:40,  8.72s/it, gpt_loss=0.322, loss_mean=0.352][A
+Train step of epoch 0:  11%|█         | 699/6434 [1:38:07<13:28:42,  8.46s/it, gpt_loss=0.322, loss_mean=0.352][A
+[LID Router Debug] Step: 700
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [3, 9, 0, 2, 4, 3, 4, 2, 11, 0]
+Active Experts in Batch: {0, 2, 3, 4, 9, 11}
+
+Train step of epoch 0:  11%|█         | 699/6434 [1:38:16<13:28:42,  8.46s/it, gpt_loss=0.404, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 700/6434 [1:38:16<13:29:26,  8.47s/it, gpt_loss=0.404, loss_mean=0.358][A
+Train step of epoch 0:  11%|█         | 700/6434 [1:38:24<13:29:26,  8.47s/it, gpt_loss=0.397, loss_mean=0.362][A
+Train step of epoch 0:  11%|█         | 701/6434 [1:38:24<13:28:25,  8.46s/it, gpt_loss=0.397, loss_mean=0.362][A
+Train step of epoch 0:  11%|█         | 701/6434 [1:38:32<13:28:25,  8.46s/it, gpt_loss=0.465, loss_mean=0.372][A
+Train step of epoch 0:  11%|█         | 702/6434 [1:38:32<12:57:43,  8.14s/it, gpt_loss=0.465, loss_mean=0.372][A
+Train step of epoch 0:  11%|█         | 702/6434 [1:38:40<12:57:43,  8.14s/it, gpt_loss=0.406, loss_mean=0.375][A
+Train step of epoch 0:  11%|█         | 703/6434 [1:38:40<13:03:15,  8.20s/it, gpt_loss=0.406, loss_mean=0.375][A
+Train step of epoch 0:  11%|█         | 703/6434 [1:38:47<13:03:15,  8.20s/it, gpt_loss=0.34, loss_mean=0.372] [A
+Train step of epoch 0:  11%|█         | 704/6434 [1:38:47<12:21:34,  7.77s/it, gpt_loss=0.34, loss_mean=0.372][A
+Train step of epoch 0:  11%|█         | 704/6434 [1:38:55<12:21:34,  7.77s/it, gpt_loss=0.439, loss_mean=0.379][A
+Train step of epoch 0:  11%|█         | 705/6434 [1:38:55<12:37:58,  7.94s/it, gpt_loss=0.439, loss_mean=0.379][A
+Train step of epoch 0:  11%|█         | 705/6434 [1:39:03<12:37:58,  7.94s/it, gpt_loss=0.41, loss_mean=0.382] [A
+Train step of epoch 0:  11%|█         | 706/6434 [1:39:03<12:24:29,  7.80s/it, gpt_loss=0.41, loss_mean=0.382][A
+Train step of epoch 0:  11%|█         | 706/6434 [1:39:11<12:24:29,  7.80s/it, gpt_loss=0.263, loss_mean=0.37][A
+Train step of epoch 0:  11%|█         | 707/6434 [1:39:11<12:36:26,  7.92s/it, gpt_loss=0.263, loss_mean=0.37][A
+Train step of epoch 0:  11%|█         | 707/6434 [1:39:19<12:36:26,  7.92s/it, gpt_loss=0.346, loss_mean=0.368][A
+Train step of epoch 0:  11%|█         | 708/6434 [1:39:19<12:54:38,  8.12s/it, gpt_loss=0.346, loss_mean=0.368][A
+Train step of epoch 0:  11%|█         | 708/6434 [1:39:28<12:54:38,  8.12s/it, gpt_loss=0.335, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 709/6434 [1:39:28<12:58:39,  8.16s/it, gpt_loss=0.335, loss_mean=0.364][A
+[LID Router Debug] Step: 710
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 4, 6, 3, 3, 6, 4, 1, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  11%|█         | 709/6434 [1:39:36<12:58:39,  8.16s/it, gpt_loss=0.356, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 710/6434 [1:39:36<12:51:32,  8.09s/it, gpt_loss=0.356, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 710/6434 [1:39:45<12:51:32,  8.09s/it, gpt_loss=0.366, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 711/6434 [1:39:45<13:18:23,  8.37s/it, gpt_loss=0.366, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 711/6434 [1:39:54<13:18:23,  8.37s/it, gpt_loss=0.352, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 712/6434 [1:39:54<13:43:19,  8.63s/it, gpt_loss=0.352, loss_mean=0.363][A
+Train step of epoch 0:  11%|█         | 712/6434 [1:40:02<13:43:19,  8.63s/it, gpt_loss=0.33, loss_mean=0.359] [A
+Train step of epoch 0:  11%|█         | 713/6434 [1:40:02<13:33:59,  8.54s/it, gpt_loss=0.33, loss_mean=0.359][A
+Train step of epoch 0:  11%|█         | 713/6434 [1:40:11<13:33:59,  8.54s/it, gpt_loss=0.363, loss_mean=0.36][A
+Train step of epoch 0:  11%|█         | 714/6434 [1:40:11<13:45:48,  8.66s/it, gpt_loss=0.363, loss_mean=0.36][A
+Train step of epoch 0:  11%|█         | 714/6434 [1:40:20<13:45:48,  8.66s/it, gpt_loss=0.295, loss_mean=0.353][A
+Train step of epoch 0:  11%|█         | 715/6434 [1:40:20<13:49:12,  8.70s/it, gpt_loss=0.295, loss_mean=0.353][A
+Train step of epoch 0:  11%|█         | 715/6434 [1:40:29<13:49:12,  8.70s/it, gpt_loss=0.489, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 716/6434 [1:40:29<13:49:52,  8.71s/it, gpt_loss=0.489, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 716/6434 [1:40:37<13:49:52,  8.71s/it, gpt_loss=0.372, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 717/6434 [1:40:37<13:42:26,  8.63s/it, gpt_loss=0.372, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 717/6434 [1:40:45<13:42:26,  8.63s/it, gpt_loss=0.375, loss_mean=0.368][A
+Train step of epoch 0:  11%|█         | 718/6434 [1:40:45<13:19:17,  8.39s/it, gpt_loss=0.375, loss_mean=0.368][A
+Train step of epoch 0:  11%|█         | 718/6434 [1:40:54<13:19:17,  8.39s/it, gpt_loss=0.381, loss_mean=0.369][A
+Train step of epoch 0:  11%|█         | 719/6434 [1:40:54<13:31:23,  8.52s/it, gpt_loss=0.381, loss_mean=0.369][A
+[LID Router Debug] Step: 720
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [2, 2, 9, 6, 10, 2, 6, 4, 4, 5]
+Active Experts in Batch: {2, 4, 5, 6, 9, 10}
+
+Train step of epoch 0:  11%|█         | 719/6434 [1:41:02<13:31:23,  8.52s/it, gpt_loss=0.372, loss_mean=0.37] [A
+Train step of epoch 0:  11%|█         | 720/6434 [1:41:02<13:16:16,  8.36s/it, gpt_loss=0.372, loss_mean=0.37][A
+Train step of epoch 0:  11%|█         | 720/6434 [1:41:10<13:16:16,  8.36s/it, gpt_loss=0.339, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 721/6434 [1:41:10<13:06:07,  8.26s/it, gpt_loss=0.339, loss_mean=0.367][A
+Train step of epoch 0:  11%|█         | 721/6434 [1:41:18<13:06:07,  8.26s/it, gpt_loss=0.34, loss_mean=0.364] [A
+Train step of epoch 0:  11%|█         | 722/6434 [1:41:18<13:00:00,  8.19s/it, gpt_loss=0.34, loss_mean=0.364][A
+Train step of epoch 0:  11%|█         | 722/6434 [1:41:26<13:00:00,  8.19s/it, gpt_loss=0.315, loss_mean=0.359][A
+Train step of epoch 0:  11%|█         | 723/6434 [1:41:26<13:00:31,  8.20s/it, gpt_loss=0.315, loss_mean=0.359][A
+Train step of epoch 0:  11%|█         | 723/6434 [1:41:34<13:00:31,  8.20s/it, gpt_loss=0.445, loss_mean=0.368][A
+Train step of epoch 0:  11%|█▏        | 724/6434 [1:41:34<12:57:44,  8.17s/it, gpt_loss=0.445, loss_mean=0.368][A
+Train step of epoch 0:  11%|█▏        | 724/6434 [1:41:43<12:57:44,  8.17s/it, gpt_loss=0.483, loss_mean=0.379][A
+Train step of epoch 0:  11%|█▏        | 725/6434 [1:41:43<13:15:04,  8.36s/it, gpt_loss=0.483, loss_mean=0.379][A
+Train step of epoch 0:  11%|█▏        | 725/6434 [1:41:51<13:15:04,  8.36s/it, gpt_loss=0.359, loss_mean=0.377][A
+Train step of epoch 0:  11%|█▏        | 726/6434 [1:41:51<13:20:31,  8.41s/it, gpt_loss=0.359, loss_mean=0.377][A
+Train step of epoch 0:  11%|█▏        | 726/6434 [1:42:00<13:20:31,  8.41s/it, gpt_loss=0.331, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 727/6434 [1:42:00<13:24:41,  8.46s/it, gpt_loss=0.331, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 727/6434 [1:42:08<13:24:41,  8.46s/it, gpt_loss=0.407, loss_mean=0.376][A
+Train step of epoch 0:  11%|█▏        | 728/6434 [1:42:08<13:09:18,  8.30s/it, gpt_loss=0.407, loss_mean=0.376][A
+Train step of epoch 0:  11%|█▏        | 728/6434 [1:42:16<13:09:18,  8.30s/it, gpt_loss=0.367, loss_mean=0.375][A
+Train step of epoch 0:  11%|█▏        | 729/6434 [1:42:16<13:14:02,  8.35s/it, gpt_loss=0.367, loss_mean=0.375][A
+[LID Router Debug] Step: 730
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [6, 2, 3, 9, 9, 4, 1, 1, 6, 3]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  11%|█▏        | 729/6434 [1:42:24<13:14:02,  8.35s/it, gpt_loss=0.429, loss_mean=0.381][A
+Train step of epoch 0:  11%|█▏        | 730/6434 [1:42:24<12:49:06,  8.09s/it, gpt_loss=0.429, loss_mean=0.381][A
+Train step of epoch 0:  11%|█▏        | 730/6434 [1:42:32<12:49:06,  8.09s/it, gpt_loss=0.308, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 731/6434 [1:42:32<13:00:35,  8.21s/it, gpt_loss=0.308, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 731/6434 [1:42:41<13:00:35,  8.21s/it, gpt_loss=0.337, loss_mean=0.37] [A
+Train step of epoch 0:  11%|█▏        | 732/6434 [1:42:41<13:26:19,  8.48s/it, gpt_loss=0.337, loss_mean=0.37][A
+Train step of epoch 0:  11%|█▏        | 732/6434 [1:42:51<13:26:19,  8.48s/it, gpt_loss=0.396, loss_mean=0.372][A
+Train step of epoch 0:  11%|█▏        | 733/6434 [1:42:51<13:50:31,  8.74s/it, gpt_loss=0.396, loss_mean=0.372][A
+Train step of epoch 0:  11%|█▏        | 733/6434 [1:42:59<13:50:31,  8.74s/it, gpt_loss=0.325, loss_mean=0.368][A
+Train step of epoch 0:  11%|█▏        | 734/6434 [1:42:59<13:49:39,  8.73s/it, gpt_loss=0.325, loss_mean=0.368][A
+Train step of epoch 0:  11%|█▏        | 734/6434 [1:43:08<13:49:39,  8.73s/it, gpt_loss=0.458, loss_mean=0.377][A
+Train step of epoch 0:  11%|█▏        | 735/6434 [1:43:08<13:31:45,  8.55s/it, gpt_loss=0.458, loss_mean=0.377][A
+Train step of epoch 0:  11%|█▏        | 735/6434 [1:43:16<13:31:45,  8.55s/it, gpt_loss=0.345, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 736/6434 [1:43:16<13:21:03,  8.44s/it, gpt_loss=0.345, loss_mean=0.373][A
+Train step of epoch 0:  11%|█▏        | 736/6434 [1:43:24<13:21:03,  8.44s/it, gpt_loss=0.354, loss_mean=0.371][A
+Train step of epoch 0:  11%|█▏        | 737/6434 [1:43:24<13:04:41,  8.26s/it, gpt_loss=0.354, loss_mean=0.371][A
+Train step of epoch 0:  11%|█▏        | 737/6434 [1:43:32<13:04:41,  8.26s/it, gpt_loss=0.343, loss_mean=0.369][A
+Train step of epoch 0:  11%|█▏        | 738/6434 [1:43:32<13:16:48,  8.39s/it, gpt_loss=0.343, loss_mean=0.369][A
+Train step of epoch 0:  11%|█▏        | 738/6434 [1:43:41<13:16:48,  8.39s/it, gpt_loss=0.357, loss_mean=0.367][A
+Train step of epoch 0:  11%|█▏        | 739/6434 [1:43:41<13:13:34,  8.36s/it, gpt_loss=0.357, loss_mean=0.367][A
+[LID Router Debug] Step: 740
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [5, 4, 4, 5, 3, 2, 2, 2, 1, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+Train step of epoch 0:  11%|█▏        | 739/6434 [1:43:48<13:13:34,  8.36s/it, gpt_loss=0.367, loss_mean=0.367][A
+Train step of epoch 0:  12%|█▏        | 740/6434 [1:43:48<12:41:01,  8.02s/it, gpt_loss=0.367, loss_mean=0.367][A
+Train step of epoch 0:  12%|█▏        | 740/6434 [1:43:55<12:41:01,  8.02s/it, gpt_loss=0.354, loss_mean=0.366][A
+Train step of epoch 0:  12%|█▏        | 741/6434 [1:43:55<12:26:35,  7.87s/it, gpt_loss=0.354, loss_mean=0.366][A
+Train step of epoch 0:  12%|█▏        | 741/6434 [1:44:03<12:26:35,  7.87s/it, gpt_loss=0.425, loss_mean=0.372][A
+Train step of epoch 0:  12%|█▏        | 742/6434 [1:44:03<12:17:50,  7.78s/it, gpt_loss=0.425, loss_mean=0.372][A
+Train step of epoch 0:  12%|█▏        | 742/6434 [1:44:10<12:17:50,  7.78s/it, gpt_loss=0.344, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 743/6434 [1:44:10<12:01:41,  7.61s/it, gpt_loss=0.344, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 743/6434 [1:44:19<12:01:41,  7.61s/it, gpt_loss=0.365, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 744/6434 [1:44:19<12:50:53,  8.13s/it, gpt_loss=0.365, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 744/6434 [1:44:29<12:50:53,  8.13s/it, gpt_loss=0.38, loss_mean=0.37]  [A
+Train step of epoch 0:  12%|█▏        | 745/6434 [1:44:29<13:24:21,  8.48s/it, gpt_loss=0.38, loss_mean=0.37][A
+Train step of epoch 0:  12%|█▏        | 745/6434 [1:44:37<13:24:21,  8.48s/it, gpt_loss=0.291, loss_mean=0.362][A
+Train step of epoch 0:  12%|█▏        | 746/6434 [1:44:37<13:11:31,  8.35s/it, gpt_loss=0.291, loss_mean=0.362][A
+Train step of epoch 0:  12%|█▏        | 746/6434 [1:44:45<13:11:31,  8.35s/it, gpt_loss=0.271, loss_mean=0.353][A
+Train step of epoch 0:  12%|█▏        | 747/6434 [1:44:45<13:00:09,  8.23s/it, gpt_loss=0.271, loss_mean=0.353][A
+Train step of epoch 0:  12%|█▏        | 747/6434 [1:44:54<13:00:09,  8.23s/it, gpt_loss=0.337, loss_mean=0.351][A
+Train step of epoch 0:  12%|█▏        | 748/6434 [1:44:54<13:20:25,  8.45s/it, gpt_loss=0.337, loss_mean=0.351][A
+Train step of epoch 0:  12%|█▏        | 748/6434 [1:45:02<13:20:25,  8.45s/it, gpt_loss=0.333, loss_mean=0.35] [A
+Train step of epoch 0:  12%|█▏        | 749/6434 [1:45:02<13:17:04,  8.41s/it, gpt_loss=0.333, loss_mean=0.35][A
+[LID Router Debug] Step: 750
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [4, 1, 9, 3, 4, 2, 3, 1, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  12%|█▏        | 749/6434 [1:45:11<13:17:04,  8.41s/it, gpt_loss=0.29, loss_mean=0.344][A
+Train step of epoch 0:  12%|█▏        | 750/6434 [1:45:11<13:36:09,  8.62s/it, gpt_loss=0.29, loss_mean=0.344][A
+Train step of epoch 0:  12%|█▏        | 750/6434 [1:45:20<13:36:09,  8.62s/it, gpt_loss=0.409, loss_mean=0.35][A
+Train step of epoch 0:  12%|█▏        | 751/6434 [1:45:20<13:37:34,  8.63s/it, gpt_loss=0.409, loss_mean=0.35][A
+Train step of epoch 0:  12%|█▏        | 751/6434 [1:45:28<13:37:34,  8.63s/it, gpt_loss=0.361, loss_mean=0.351][A
+Train step of epoch 0:  12%|█▏        | 752/6434 [1:45:28<13:35:48,  8.61s/it, gpt_loss=0.361, loss_mean=0.351][A
+Train step of epoch 0:  12%|█▏        | 752/6434 [1:45:37<13:35:48,  8.61s/it, gpt_loss=0.414, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 753/6434 [1:45:37<13:48:02,  8.75s/it, gpt_loss=0.414, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 753/6434 [1:45:47<13:48:02,  8.75s/it, gpt_loss=0.348, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 754/6434 [1:45:47<14:01:56,  8.89s/it, gpt_loss=0.348, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 754/6434 [1:45:55<14:01:56,  8.89s/it, gpt_loss=0.334, loss_mean=0.354][A
+Train step of epoch 0:  12%|█▏        | 755/6434 [1:45:55<13:48:59,  8.76s/it, gpt_loss=0.334, loss_mean=0.354][A
+Train step of epoch 0:  12%|█▏        | 755/6434 [1:46:03<13:48:59,  8.76s/it, gpt_loss=0.306, loss_mean=0.349][A
+Train step of epoch 0:  12%|█▏        | 756/6434 [1:46:03<13:35:51,  8.62s/it, gpt_loss=0.306, loss_mean=0.349][A
+Train step of epoch 0:  12%|█▏        | 756/6434 [1:46:12<13:35:51,  8.62s/it, gpt_loss=0.426, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 757/6434 [1:46:12<13:41:56,  8.69s/it, gpt_loss=0.426, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 757/6434 [1:46:21<13:41:56,  8.69s/it, gpt_loss=0.361, loss_mean=0.358][A
+Train step of epoch 0:  12%|█▏        | 758/6434 [1:46:21<13:30:18,  8.57s/it, gpt_loss=0.361, loss_mean=0.358][A
+Train step of epoch 0:  12%|█▏        | 758/6434 [1:46:29<13:30:18,  8.57s/it, gpt_loss=0.371, loss_mean=0.359][A
+Train step of epoch 0:  12%|█▏        | 759/6434 [1:46:29<13:21:20,  8.47s/it, gpt_loss=0.371, loss_mean=0.359][A
+[LID Router Debug] Step: 760
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [3, 6, 3, 4, 5, 3, 5, 5, 6, 3]
+Active Experts in Batch: {3, 4, 5, 6}
+
+Train step of epoch 0:  12%|█▏        | 759/6434 [1:46:37<13:21:20,  8.47s/it, gpt_loss=0.352, loss_mean=0.358][A
+Train step of epoch 0:  12%|█▏        | 760/6434 [1:46:37<13:26:43,  8.53s/it, gpt_loss=0.352, loss_mean=0.358][A
+Train step of epoch 0:  12%|█▏        | 760/6434 [1:46:46<13:26:43,  8.53s/it, gpt_loss=0.374, loss_mean=0.36] [A
+Train step of epoch 0:  12%|█▏        | 761/6434 [1:46:46<13:34:46,  8.62s/it, gpt_loss=0.374, loss_mean=0.36][A
+Train step of epoch 0:  12%|█▏        | 761/6434 [1:46:55<13:34:46,  8.62s/it, gpt_loss=0.378, loss_mean=0.362][A
+Train step of epoch 0:  12%|█▏        | 762/6434 [1:46:55<13:38:22,  8.66s/it, gpt_loss=0.378, loss_mean=0.362][A
+Train step of epoch 0:  12%|█▏        | 762/6434 [1:47:03<13:38:22,  8.66s/it, gpt_loss=0.399, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 763/6434 [1:47:03<13:10:06,  8.36s/it, gpt_loss=0.399, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 763/6434 [1:47:11<13:10:06,  8.36s/it, gpt_loss=0.361, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 764/6434 [1:47:11<13:10:32,  8.37s/it, gpt_loss=0.361, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 764/6434 [1:47:19<13:10:32,  8.37s/it, gpt_loss=0.354, loss_mean=0.364][A
+Train step of epoch 0:  12%|█▏        | 765/6434 [1:47:19<13:03:19,  8.29s/it, gpt_loss=0.354, loss_mean=0.364][A
+Train step of epoch 0:  12%|█▏        | 765/6434 [1:47:29<13:03:19,  8.29s/it, gpt_loss=0.272, loss_mean=0.355][A
+Train step of epoch 0:  12%|█▏        | 766/6434 [1:47:29<13:51:50,  8.81s/it, gpt_loss=0.272, loss_mean=0.355][A
+Train step of epoch 0:  12%|█▏        | 766/6434 [1:47:38<13:51:50,  8.81s/it, gpt_loss=0.415, loss_mean=0.361][A
+Train step of epoch 0:  12%|█▏        | 767/6434 [1:47:38<13:39:46,  8.68s/it, gpt_loss=0.415, loss_mean=0.361][A
+Train step of epoch 0:  12%|█▏        | 767/6434 [1:47:46<13:39:46,  8.68s/it, gpt_loss=0.293, loss_mean=0.354][A
+Train step of epoch 0:  12%|█▏        | 768/6434 [1:47:46<13:33:57,  8.62s/it, gpt_loss=0.293, loss_mean=0.354][A
+Train step of epoch 0:  12%|█▏        | 768/6434 [1:47:54<13:33:57,  8.62s/it, gpt_loss=0.367, loss_mean=0.355][A
+Train step of epoch 0:  12%|█▏        | 769/6434 [1:47:54<13:07:33,  8.34s/it, gpt_loss=0.367, loss_mean=0.355][A
+[LID Router Debug] Step: 770
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [9, 7, 4, 4, 1, 3, 5, 9, 1, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 7, 9}
+
+Train step of epoch 0:  12%|█▏        | 769/6434 [1:48:02<13:07:33,  8.34s/it, gpt_loss=0.497, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 770/6434 [1:48:02<13:11:06,  8.38s/it, gpt_loss=0.497, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 770/6434 [1:48:11<13:11:06,  8.38s/it, gpt_loss=0.376, loss_mean=0.37] [A
+Train step of epoch 0:  12%|█▏        | 771/6434 [1:48:11<13:19:03,  8.47s/it, gpt_loss=0.376, loss_mean=0.37][A
+Train step of epoch 0:  12%|█▏        | 771/6434 [1:48:19<13:19:03,  8.47s/it, gpt_loss=0.511, loss_mean=0.384][A
+Train step of epoch 0:  12%|█▏        | 772/6434 [1:48:19<12:56:24,  8.23s/it, gpt_loss=0.511, loss_mean=0.384][A
+Train step of epoch 0:  12%|█▏        | 772/6434 [1:48:28<12:56:24,  8.23s/it, gpt_loss=0.353, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 773/6434 [1:48:28<13:21:43,  8.50s/it, gpt_loss=0.353, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 773/6434 [1:48:35<13:21:43,  8.50s/it, gpt_loss=0.343, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 774/6434 [1:48:35<12:49:22,  8.16s/it, gpt_loss=0.343, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 774/6434 [1:48:43<12:49:22,  8.16s/it, gpt_loss=0.379, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 775/6434 [1:48:43<12:52:52,  8.19s/it, gpt_loss=0.379, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 775/6434 [1:48:52<12:52:52,  8.19s/it, gpt_loss=0.51, loss_mean=0.391] [A
+Train step of epoch 0:  12%|█▏        | 776/6434 [1:48:52<13:18:43,  8.47s/it, gpt_loss=0.51, loss_mean=0.391][A
+Train step of epoch 0:  12%|█▏        | 776/6434 [1:49:01<13:18:43,  8.47s/it, gpt_loss=0.359, loss_mean=0.388][A
+Train step of epoch 0:  12%|█▏        | 777/6434 [1:49:01<13:23:16,  8.52s/it, gpt_loss=0.359, loss_mean=0.388][A
+Train step of epoch 0:  12%|█▏        | 777/6434 [1:49:10<13:23:16,  8.52s/it, gpt_loss=0.303, loss_mean=0.379][A
+Train step of epoch 0:  12%|█▏        | 778/6434 [1:49:10<13:33:52,  8.63s/it, gpt_loss=0.303, loss_mean=0.379][A
+Train step of epoch 0:  12%|█▏        | 778/6434 [1:49:18<13:33:52,  8.63s/it, gpt_loss=0.468, loss_mean=0.388][A
+Train step of epoch 0:  12%|█▏        | 779/6434 [1:49:18<13:29:06,  8.58s/it, gpt_loss=0.468, loss_mean=0.388][A
+[LID Router Debug] Step: 780
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [0, 3, 1, 5, 9, 4, 1, 0, 3, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  12%|█▏        | 779/6434 [1:49:26<13:29:06,  8.58s/it, gpt_loss=0.391, loss_mean=0.388][A
+Train step of epoch 0:  12%|█▏        | 780/6434 [1:49:26<13:09:21,  8.38s/it, gpt_loss=0.391, loss_mean=0.388][A
+Train step of epoch 0:  12%|█▏        | 780/6434 [1:49:34<13:09:21,  8.38s/it, gpt_loss=0.333, loss_mean=0.383][A
+Train step of epoch 0:  12%|█▏        | 781/6434 [1:49:34<12:59:49,  8.28s/it, gpt_loss=0.333, loss_mean=0.383][A
+Train step of epoch 0:  12%|█▏        | 781/6434 [1:49:42<12:59:49,  8.28s/it, gpt_loss=0.361, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 782/6434 [1:49:42<12:42:54,  8.10s/it, gpt_loss=0.361, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 782/6434 [1:49:50<12:42:54,  8.10s/it, gpt_loss=0.381, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 783/6434 [1:49:50<12:31:08,  7.98s/it, gpt_loss=0.381, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 783/6434 [1:49:58<12:31:08,  7.98s/it, gpt_loss=0.346, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 784/6434 [1:49:58<12:37:33,  8.04s/it, gpt_loss=0.346, loss_mean=0.377][A
+Train step of epoch 0:  12%|█▏        | 784/6434 [1:50:06<12:37:33,  8.04s/it, gpt_loss=0.413, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 785/6434 [1:50:06<12:31:54,  7.99s/it, gpt_loss=0.413, loss_mean=0.381][A
+Train step of epoch 0:  12%|█▏        | 785/6434 [1:50:16<12:31:54,  7.99s/it, gpt_loss=0.377, loss_mean=0.38] [A
+Train step of epoch 0:  12%|█▏        | 786/6434 [1:50:16<13:36:51,  8.68s/it, gpt_loss=0.377, loss_mean=0.38][A
+Train step of epoch 0:  12%|█▏        | 786/6434 [1:50:24<13:36:51,  8.68s/it, gpt_loss=0.284, loss_mean=0.371][A
+Train step of epoch 0:  12%|█▏        | 787/6434 [1:50:24<13:14:41,  8.44s/it, gpt_loss=0.284, loss_mean=0.371][A
+Train step of epoch 0:  12%|█▏        | 787/6434 [1:50:33<13:14:41,  8.44s/it, gpt_loss=0.351, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 788/6434 [1:50:33<13:20:43,  8.51s/it, gpt_loss=0.351, loss_mean=0.369][A
+Train step of epoch 0:  12%|█▏        | 788/6434 [1:50:41<13:20:43,  8.51s/it, gpt_loss=0.4, loss_mean=0.372]  [A
+Train step of epoch 0:  12%|█▏        | 789/6434 [1:50:41<13:09:21,  8.39s/it, gpt_loss=0.4, loss_mean=0.372][A
+[LID Router Debug] Step: 790
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 3, 0, 9, 3, 5, 1, 1, 0, 9]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+Train step of epoch 0:  12%|█▏        | 789/6434 [1:50:49<13:09:21,  8.39s/it, gpt_loss=0.335, loss_mean=0.368][A
+Train step of epoch 0:  12%|█▏        | 790/6434 [1:50:49<13:03:46,  8.33s/it, gpt_loss=0.335, loss_mean=0.368][A
+Train step of epoch 0:  12%|█▏        | 790/6434 [1:50:57<13:03:46,  8.33s/it, gpt_loss=0.337, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 791/6434 [1:50:57<12:49:53,  8.19s/it, gpt_loss=0.337, loss_mean=0.365][A
+Train step of epoch 0:  12%|█▏        | 791/6434 [1:51:06<12:49:53,  8.19s/it, gpt_loss=0.278, loss_mean=0.356][A
+Train step of epoch 0:  12%|█▏        | 792/6434 [1:51:06<13:10:27,  8.41s/it, gpt_loss=0.278, loss_mean=0.356][A
+Train step of epoch 0:  12%|█▏        | 792/6434 [1:51:15<13:10:27,  8.41s/it, gpt_loss=0.428, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 793/6434 [1:51:15<13:31:25,  8.63s/it, gpt_loss=0.428, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 793/6434 [1:51:23<13:31:25,  8.63s/it, gpt_loss=0.355, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 794/6434 [1:51:23<13:09:35,  8.40s/it, gpt_loss=0.355, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 794/6434 [1:51:31<13:09:35,  8.40s/it, gpt_loss=0.348, loss_mean=0.361][A
+Train step of epoch 0:  12%|█▏        | 795/6434 [1:51:31<13:02:15,  8.32s/it, gpt_loss=0.348, loss_mean=0.361][A
+Train step of epoch 0:  12%|█▏        | 795/6434 [1:51:39<13:02:15,  8.32s/it, gpt_loss=0.351, loss_mean=0.36] [A
+Train step of epoch 0:  12%|█▏        | 796/6434 [1:51:39<12:52:59,  8.23s/it, gpt_loss=0.351, loss_mean=0.36][A
+Train step of epoch 0:  12%|█▏        | 796/6434 [1:51:48<12:52:59,  8.23s/it, gpt_loss=0.391, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 797/6434 [1:51:48<13:03:52,  8.34s/it, gpt_loss=0.391, loss_mean=0.363][A
+Train step of epoch 0:  12%|█▏        | 797/6434 [1:51:56<13:03:52,  8.34s/it, gpt_loss=0.369, loss_mean=0.364][A
+Train step of epoch 0:  12%|█▏        | 798/6434 [1:51:56<13:00:39,  8.31s/it, gpt_loss=0.369, loss_mean=0.364][A
+Train step of epoch 0:  12%|█▏        | 798/6434 [1:52:04<13:00:39,  8.31s/it, gpt_loss=0.322, loss_mean=0.36] [A
+Train step of epoch 0:  12%|█▏        | 799/6434 [1:52:04<13:01:08,  8.32s/it, gpt_loss=0.322, loss_mean=0.36][A
+[LID Router Debug] Step: 800
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [0, 5, 4, 3, 1, 1, 0, 1, 3, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+[2026-02-06 17:48:16,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[1.9950406445561778e-05, 1.9950406445561778e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 17:48:16,223] [INFO] [timer.py:260:stop] epoch=0/micro_step=800/global_step=400, RunningAvgSamplesPerSec=4.771069923740978, CurrSamplesPerSec=5.01482608249891, MemAllocated=12.5GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  12%|█▏        | 799/6434 [1:52:12<13:01:08,  8.32s/it, gpt_loss=0.332, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 800/6434 [1:52:12<12:42:10,  8.12s/it, gpt_loss=0.332, loss_mean=0.357][A
+Train step of epoch 0:  12%|█▏        | 800/6434 [1:52:19<12:42:10,  8.12s/it, gpt_loss=0.386, loss_mean=0.36] [A
+Train step of epoch 0:  12%|█▏        | 801/6434 [1:52:19<12:25:01,  7.94s/it, gpt_loss=0.386, loss_mean=0.36][A
+Train step of epoch 0:  12%|█▏        | 801/6434 [1:52:27<12:25:01,  7.94s/it, gpt_loss=0.4, loss_mean=0.364] [A
+Train step of epoch 0:  12%|█▏        | 802/6434 [1:52:27<12:31:28,  8.01s/it, gpt_loss=0.4, loss_mean=0.364][A
+Train step of epoch 0:  12%|█▏        | 802/6434 [1:52:36<12:31:28,  8.01s/it, gpt_loss=0.433, loss_mean=0.371][A
+Train step of epoch 0:  12%|█▏        | 803/6434 [1:52:36<12:46:51,  8.17s/it, gpt_loss=0.433, loss_mean=0.371][A
+Train step of epoch 0:  12%|█▏        | 803/6434 [1:52:44<12:46:51,  8.17s/it, gpt_loss=0.413, loss_mean=0.375][A
+Train step of epoch 0:  12%|█▏        | 804/6434 [1:52:44<12:29:47,  7.99s/it, gpt_loss=0.413, loss_mean=0.375][A
+Train step of epoch 0:  12%|█▏        | 804/6434 [1:52:52<12:29:47,  7.99s/it, gpt_loss=0.436, loss_mean=0.381][A
+Train step of epoch 0:  13%|█▎        | 805/6434 [1:52:52<12:53:01,  8.24s/it, gpt_loss=0.436, loss_mean=0.381][A
+Train step of epoch 0:  13%|█▎        | 805/6434 [1:53:00<12:53:01,  8.24s/it, gpt_loss=0.338, loss_mean=0.377][A
+Train step of epoch 0:  13%|█▎        | 806/6434 [1:53:00<12:49:19,  8.20s/it, gpt_loss=0.338, loss_mean=0.377][A
+Train step of epoch 0:  13%|█▎        | 806/6434 [1:53:09<12:49:19,  8.20s/it, gpt_loss=0.372, loss_mean=0.376][A
+Train step of epoch 0:  13%|█▎        | 807/6434 [1:53:09<13:02:53,  8.35s/it, gpt_loss=0.372, loss_mean=0.376][A
+Train step of epoch 0:  13%|█▎        | 807/6434 [1:53:17<13:02:53,  8.35s/it, gpt_loss=0.41, loss_mean=0.38]  [A
+Train step of epoch 0:  13%|█▎        | 808/6434 [1:53:17<12:54:03,  8.26s/it, gpt_loss=0.41, loss_mean=0.38][A
+Train step of epoch 0:  13%|█▎        | 808/6434 [1:53:25<12:54:03,  8.26s/it, gpt_loss=0.336, loss_mean=0.375][A
+Train step of epoch 0:  13%|█▎        | 809/6434 [1:53:25<12:34:40,  8.05s/it, gpt_loss=0.336, loss_mean=0.375][A
+[LID Router Debug] Step: 810
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [2, 9, 5, 0, 9, 5, 4, 1, 2, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  13%|█▎        | 809/6434 [1:53:34<12:34:40,  8.05s/it, gpt_loss=0.47, loss_mean=0.385] [A
+Train step of epoch 0:  13%|█▎        | 810/6434 [1:53:34<12:59:54,  8.32s/it, gpt_loss=0.47, loss_mean=0.385][A
+Train step of epoch 0:  13%|█▎        | 810/6434 [1:53:42<12:59:54,  8.32s/it, gpt_loss=0.327, loss_mean=0.379][A
+Train step of epoch 0:  13%|█▎        | 811/6434 [1:53:42<13:02:46,  8.35s/it, gpt_loss=0.327, loss_mean=0.379][A
+Train step of epoch 0:  13%|█▎        | 811/6434 [1:53:52<13:02:46,  8.35s/it, gpt_loss=0.431, loss_mean=0.384][A
+Train step of epoch 0:  13%|█▎        | 812/6434 [1:53:52<13:52:44,  8.89s/it, gpt_loss=0.431, loss_mean=0.384][A
+Train step of epoch 0:  13%|█▎        | 812/6434 [1:54:01<13:52:44,  8.89s/it, gpt_loss=0.284, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 813/6434 [1:54:01<13:40:01,  8.75s/it, gpt_loss=0.284, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 813/6434 [1:54:09<13:40:01,  8.75s/it, gpt_loss=0.327, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 814/6434 [1:54:09<13:19:40,  8.54s/it, gpt_loss=0.327, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 814/6434 [1:54:17<13:19:40,  8.54s/it, gpt_loss=0.41, loss_mean=0.374] [A
+Train step of epoch 0:  13%|█▎        | 815/6434 [1:54:17<13:21:18,  8.56s/it, gpt_loss=0.41, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 815/6434 [1:54:26<13:21:18,  8.56s/it, gpt_loss=0.374, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 816/6434 [1:54:26<13:17:04,  8.51s/it, gpt_loss=0.374, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 816/6434 [1:54:34<13:17:04,  8.51s/it, gpt_loss=0.307, loss_mean=0.367][A
+Train step of epoch 0:  13%|█▎        | 817/6434 [1:54:34<13:09:50,  8.44s/it, gpt_loss=0.307, loss_mean=0.367][A
+Train step of epoch 0:  13%|█▎        | 817/6434 [1:54:44<13:09:50,  8.44s/it, gpt_loss=0.438, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 818/6434 [1:54:44<13:42:11,  8.78s/it, gpt_loss=0.438, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 818/6434 [1:54:51<13:42:11,  8.78s/it, gpt_loss=0.298, loss_mean=0.366][A
+Train step of epoch 0:  13%|█▎        | 819/6434 [1:54:51<12:57:04,  8.30s/it, gpt_loss=0.298, loss_mean=0.366][A
+[LID Router Debug] Step: 820
+Batch Size: 10
+Audio Batch Size: 148
+LID Assignments: [6, 3, 9, 4, 1, 3, 3, 3, 3, 5]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  13%|█▎        | 819/6434 [1:55:00<12:57:04,  8.30s/it, gpt_loss=0.333, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 820/6434 [1:55:00<13:30:54,  8.67s/it, gpt_loss=0.333, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 820/6434 [1:55:08<13:30:54,  8.67s/it, gpt_loss=0.336, loss_mean=0.36] [A
+Train step of epoch 0:  13%|█▎        | 821/6434 [1:55:08<13:12:09,  8.47s/it, gpt_loss=0.336, loss_mean=0.36][A
+Train step of epoch 0:  13%|█▎        | 821/6434 [1:55:17<13:12:09,  8.47s/it, gpt_loss=0.284, loss_mean=0.353][A
+Train step of epoch 0:  13%|█▎        | 822/6434 [1:55:17<13:18:11,  8.53s/it, gpt_loss=0.284, loss_mean=0.353][A
+Train step of epoch 0:  13%|█▎        | 822/6434 [1:55:25<13:18:11,  8.53s/it, gpt_loss=0.354, loss_mean=0.353][A
+Train step of epoch 0:  13%|█▎        | 823/6434 [1:55:25<12:55:37,  8.29s/it, gpt_loss=0.354, loss_mean=0.353][A
+Train step of epoch 0:  13%|█▎        | 823/6434 [1:55:33<12:55:37,  8.29s/it, gpt_loss=0.367, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 824/6434 [1:55:33<12:45:52,  8.19s/it, gpt_loss=0.367, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 824/6434 [1:55:41<12:45:52,  8.19s/it, gpt_loss=0.414, loss_mean=0.36] [A
+Train step of epoch 0:  13%|█▎        | 825/6434 [1:55:41<12:53:15,  8.27s/it, gpt_loss=0.414, loss_mean=0.36][A
+Train step of epoch 0:  13%|█▎        | 825/6434 [1:55:48<12:53:15,  8.27s/it, gpt_loss=0.37, loss_mean=0.361][A
+Train step of epoch 0:  13%|█▎        | 826/6434 [1:55:48<12:24:36,  7.97s/it, gpt_loss=0.37, loss_mean=0.361][A
+Train step of epoch 0:  13%|█▎        | 826/6434 [1:55:57<12:24:36,  7.97s/it, gpt_loss=0.298, loss_mean=0.355][A
+Train step of epoch 0:  13%|█▎        | 827/6434 [1:55:57<12:42:27,  8.16s/it, gpt_loss=0.298, loss_mean=0.355][A
+Train step of epoch 0:  13%|█▎        | 827/6434 [1:56:05<12:42:27,  8.16s/it, gpt_loss=0.432, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 828/6434 [1:56:05<12:38:00,  8.11s/it, gpt_loss=0.432, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 828/6434 [1:56:13<12:38:00,  8.11s/it, gpt_loss=0.425, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 829/6434 [1:56:13<12:42:56,  8.17s/it, gpt_loss=0.425, loss_mean=0.369][A
+[LID Router Debug] Step: 830
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [6, 0, 9, 0, 5, 4, 5, 4, 6, 5]
+Active Experts in Batch: {0, 4, 5, 6, 9}
+
+Train step of epoch 0:  13%|█▎        | 829/6434 [1:56:21<12:42:56,  8.17s/it, gpt_loss=0.4, loss_mean=0.372]  [A
+Train step of epoch 0:  13%|█▎        | 830/6434 [1:56:21<12:37:31,  8.11s/it, gpt_loss=0.4, loss_mean=0.372][A
+Train step of epoch 0:  13%|█▎        | 830/6434 [1:56:29<12:37:31,  8.11s/it, gpt_loss=0.348, loss_mean=0.37][A
+Train step of epoch 0:  13%|█▎        | 831/6434 [1:56:29<12:36:10,  8.10s/it, gpt_loss=0.348, loss_mean=0.37][A
+Train step of epoch 0:  13%|█▎        | 831/6434 [1:56:38<12:36:10,  8.10s/it, gpt_loss=0.284, loss_mean=0.361][A
+Train step of epoch 0:  13%|█▎        | 832/6434 [1:56:38<13:00:13,  8.36s/it, gpt_loss=0.284, loss_mean=0.361][A
+Train step of epoch 0:  13%|█▎        | 832/6434 [1:56:46<13:00:13,  8.36s/it, gpt_loss=0.395, loss_mean=0.364][A
+Train step of epoch 0:  13%|█▎        | 833/6434 [1:56:46<12:50:09,  8.25s/it, gpt_loss=0.395, loss_mean=0.364][A
+Train step of epoch 0:  13%|█▎        | 833/6434 [1:56:58<12:50:09,  8.25s/it, gpt_loss=0.39, loss_mean=0.367] [A
+Train step of epoch 0:  13%|█▎        | 834/6434 [1:56:58<14:35:59,  9.39s/it, gpt_loss=0.39, loss_mean=0.367][A
+Train step of epoch 0:  13%|█▎        | 834/6434 [1:57:08<14:35:59,  9.39s/it, gpt_loss=0.274, loss_mean=0.358][A
+Train step of epoch 0:  13%|█▎        | 835/6434 [1:57:08<14:43:49,  9.47s/it, gpt_loss=0.274, loss_mean=0.358][A
+Train step of epoch 0:  13%|█▎        | 835/6434 [1:57:17<14:43:49,  9.47s/it, gpt_loss=0.427, loss_mean=0.365][A
+Train step of epoch 0:  13%|█▎        | 836/6434 [1:57:17<14:17:57,  9.20s/it, gpt_loss=0.427, loss_mean=0.365][A
+Train step of epoch 0:  13%|█▎        | 836/6434 [1:57:25<14:17:57,  9.20s/it, gpt_loss=0.411, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 837/6434 [1:57:25<13:41:57,  8.81s/it, gpt_loss=0.411, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 837/6434 [1:57:32<13:41:57,  8.81s/it, gpt_loss=0.305, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 838/6434 [1:57:32<12:53:33,  8.29s/it, gpt_loss=0.305, loss_mean=0.363][A
+Train step of epoch 0:  13%|█▎        | 838/6434 [1:57:39<12:53:33,  8.29s/it, gpt_loss=0.353, loss_mean=0.362][A
+Train step of epoch 0:  13%|█▎        | 839/6434 [1:57:39<12:39:23,  8.14s/it, gpt_loss=0.353, loss_mean=0.362][A
+[LID Router Debug] Step: 840
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [3, 6, 4, 3, 0, 3, 4, 4, 4, 6]
+Active Experts in Batch: {0, 3, 4, 6}
+
+Train step of epoch 0:  13%|█▎        | 839/6434 [1:57:47<12:39:23,  8.14s/it, gpt_loss=0.285, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 840/6434 [1:57:47<12:37:00,  8.12s/it, gpt_loss=0.285, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 840/6434 [1:57:56<12:37:00,  8.12s/it, gpt_loss=0.458, loss_mean=0.364][A
+Train step of epoch 0:  13%|█▎        | 841/6434 [1:57:56<12:35:48,  8.11s/it, gpt_loss=0.458, loss_mean=0.364][A
+Train step of epoch 0:  13%|█▎        | 841/6434 [1:58:04<12:35:48,  8.11s/it, gpt_loss=0.449, loss_mean=0.373][A
+Train step of epoch 0:  13%|█▎        | 842/6434 [1:58:04<12:54:18,  8.31s/it, gpt_loss=0.449, loss_mean=0.373][A
+Train step of epoch 0:  13%|█▎        | 842/6434 [1:58:13<12:54:18,  8.31s/it, gpt_loss=0.323, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 843/6434 [1:58:13<12:53:11,  8.30s/it, gpt_loss=0.323, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 843/6434 [1:58:21<12:53:11,  8.30s/it, gpt_loss=0.387, loss_mean=0.37] [A
+Train step of epoch 0:  13%|█▎        | 844/6434 [1:58:21<12:55:43,  8.33s/it, gpt_loss=0.387, loss_mean=0.37][A
+Train step of epoch 0:  13%|█▎        | 844/6434 [1:58:28<12:55:43,  8.33s/it, gpt_loss=0.364, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 845/6434 [1:58:28<12:30:52,  8.06s/it, gpt_loss=0.364, loss_mean=0.369][A
+Train step of epoch 0:  13%|█▎        | 845/6434 [1:58:38<12:30:52,  8.06s/it, gpt_loss=0.263, loss_mean=0.359][A
+Train step of epoch 0:  13%|█▎        | 846/6434 [1:58:38<13:12:52,  8.51s/it, gpt_loss=0.263, loss_mean=0.359][A
+Train step of epoch 0:  13%|█▎        | 846/6434 [1:58:49<13:12:52,  8.51s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 847/6434 [1:58:49<14:08:30,  9.11s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 847/6434 [1:58:57<14:08:30,  9.11s/it, gpt_loss=0.403, loss_mean=0.36] [A
+Train step of epoch 0:  13%|█▎        | 848/6434 [1:58:57<13:53:01,  8.95s/it, gpt_loss=0.403, loss_mean=0.36][A
+Train step of epoch 0:  13%|█▎        | 848/6434 [1:59:05<13:53:01,  8.95s/it, gpt_loss=0.285, loss_mean=0.353][A
+Train step of epoch 0:  13%|█▎        | 849/6434 [1:59:05<13:28:52,  8.69s/it, gpt_loss=0.285, loss_mean=0.353][A
+[LID Router Debug] Step: 850
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [4, 4, 5, 9, 3, 1, 4, 2, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  13%|█▎        | 849/6434 [1:59:13<13:28:52,  8.69s/it, gpt_loss=0.372, loss_mean=0.355][A
+Train step of epoch 0:  13%|█▎        | 850/6434 [1:59:13<13:01:33,  8.40s/it, gpt_loss=0.372, loss_mean=0.355][A
+Train step of epoch 0:  13%|█▎        | 850/6434 [1:59:22<13:01:33,  8.40s/it, gpt_loss=0.349, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 851/6434 [1:59:22<13:24:48,  8.65s/it, gpt_loss=0.349, loss_mean=0.354][A
+Train step of epoch 0:  13%|█▎        | 851/6434 [1:59:31<13:24:48,  8.65s/it, gpt_loss=0.337, loss_mean=0.352][A
+Train step of epoch 0:  13%|█▎        | 852/6434 [1:59:31<13:21:47,  8.62s/it, gpt_loss=0.337, loss_mean=0.352][A
+Train step of epoch 0:  13%|█▎        | 852/6434 [1:59:39<13:21:47,  8.62s/it, gpt_loss=0.389, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 853/6434 [1:59:39<13:08:41,  8.48s/it, gpt_loss=0.389, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 853/6434 [1:59:47<13:08:41,  8.48s/it, gpt_loss=0.353, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 854/6434 [1:59:47<12:48:15,  8.26s/it, gpt_loss=0.353, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 854/6434 [1:59:55<12:48:15,  8.26s/it, gpt_loss=0.362, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 855/6434 [1:59:55<12:42:44,  8.20s/it, gpt_loss=0.362, loss_mean=0.356][A
+Train step of epoch 0:  13%|█▎        | 855/6434 [2:00:01<12:42:44,  8.20s/it, gpt_loss=0.499, loss_mean=0.371][A
+Train step of epoch 0:  13%|█▎        | 856/6434 [2:00:01<11:58:05,  7.72s/it, gpt_loss=0.499, loss_mean=0.371][A
+Train step of epoch 0:  13%|█▎        | 856/6434 [2:00:09<11:58:05,  7.72s/it, gpt_loss=0.367, loss_mean=0.37] [A
+Train step of epoch 0:  13%|█▎        | 857/6434 [2:00:09<12:09:12,  7.85s/it, gpt_loss=0.367, loss_mean=0.37][A
+Train step of epoch 0:  13%|█▎        | 857/6434 [2:00:18<12:09:12,  7.85s/it, gpt_loss=0.41, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 858/6434 [2:00:18<12:29:09,  8.06s/it, gpt_loss=0.41, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 858/6434 [2:00:26<12:29:09,  8.06s/it, gpt_loss=0.375, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 859/6434 [2:00:26<12:24:56,  8.02s/it, gpt_loss=0.375, loss_mean=0.374][A
+[LID Router Debug] Step: 860
+Batch Size: 10
+Audio Batch Size: 155
+LID Assignments: [3, 0, 1, 9, 0, 3, 2, 3, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  13%|█▎        | 859/6434 [2:00:35<12:24:56,  8.02s/it, gpt_loss=0.373, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 860/6434 [2:00:35<12:59:50,  8.39s/it, gpt_loss=0.373, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 860/6434 [2:00:43<12:59:50,  8.39s/it, gpt_loss=0.376, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 861/6434 [2:00:43<12:31:20,  8.09s/it, gpt_loss=0.376, loss_mean=0.374][A
+Train step of epoch 0:  13%|█▎        | 861/6434 [2:00:50<12:31:20,  8.09s/it, gpt_loss=0.315, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 862/6434 [2:00:50<12:19:39,  7.96s/it, gpt_loss=0.315, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 862/6434 [2:00:58<12:19:39,  7.96s/it, gpt_loss=0.344, loss_mean=0.366][A
+Train step of epoch 0:  13%|█▎        | 863/6434 [2:00:58<12:02:41,  7.78s/it, gpt_loss=0.344, loss_mean=0.366][A
+Train step of epoch 0:  13%|█▎        | 863/6434 [2:01:06<12:02:41,  7.78s/it, gpt_loss=0.455, loss_mean=0.375][A
+Train step of epoch 0:  13%|█▎        | 864/6434 [2:01:06<12:16:03,  7.93s/it, gpt_loss=0.455, loss_mean=0.375][A
+Train step of epoch 0:  13%|█▎        | 864/6434 [2:01:16<12:16:03,  7.93s/it, gpt_loss=0.373, loss_mean=0.375][A
+Train step of epoch 0:  13%|█▎        | 865/6434 [2:01:16<13:12:12,  8.54s/it, gpt_loss=0.373, loss_mean=0.375][A
+Train step of epoch 0:  13%|█▎        | 865/6434 [2:01:23<13:12:12,  8.54s/it, gpt_loss=0.346, loss_mean=0.372][A
+Train step of epoch 0:  13%|█▎        | 866/6434 [2:01:23<12:39:39,  8.19s/it, gpt_loss=0.346, loss_mean=0.372][A
+Train step of epoch 0:  13%|█▎        | 866/6434 [2:01:31<12:39:39,  8.19s/it, gpt_loss=0.335, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 867/6434 [2:01:31<12:39:09,  8.18s/it, gpt_loss=0.335, loss_mean=0.368][A
+Train step of epoch 0:  13%|█▎        | 867/6434 [2:01:41<12:39:09,  8.18s/it, gpt_loss=0.387, loss_mean=0.37] [A
+Train step of epoch 0:  13%|█▎        | 868/6434 [2:01:41<13:21:49,  8.64s/it, gpt_loss=0.387, loss_mean=0.37][A
+Train step of epoch 0:  13%|█▎        | 868/6434 [2:01:49<13:21:49,  8.64s/it, gpt_loss=0.363, loss_mean=0.369][A
+Train step of epoch 0:  14%|█▎        | 869/6434 [2:01:49<13:03:42,  8.45s/it, gpt_loss=0.363, loss_mean=0.369][A
+[LID Router Debug] Step: 870
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 4, 9, 3, 4, 6, 5, 2, 4, 3]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  14%|█▎        | 869/6434 [2:01:57<13:03:42,  8.45s/it, gpt_loss=0.327, loss_mean=0.365][A
+Train step of epoch 0:  14%|█▎        | 870/6434 [2:01:57<12:50:17,  8.31s/it, gpt_loss=0.327, loss_mean=0.365][A
+Train step of epoch 0:  14%|█▎        | 870/6434 [2:02:05<12:50:17,  8.31s/it, gpt_loss=0.4, loss_mean=0.369]  [A
+Train step of epoch 0:  14%|█▎        | 871/6434 [2:02:05<12:50:26,  8.31s/it, gpt_loss=0.4, loss_mean=0.369][A
+Train step of epoch 0:  14%|█▎        | 871/6434 [2:02:15<12:50:26,  8.31s/it, gpt_loss=0.471, loss_mean=0.379][A
+Train step of epoch 0:  14%|█▎        | 872/6434 [2:02:15<13:29:06,  8.73s/it, gpt_loss=0.471, loss_mean=0.379][A
+Train step of epoch 0:  14%|█▎        | 872/6434 [2:02:24<13:29:06,  8.73s/it, gpt_loss=0.497, loss_mean=0.391][A
+Train step of epoch 0:  14%|█▎        | 873/6434 [2:02:24<13:39:17,  8.84s/it, gpt_loss=0.497, loss_mean=0.391][A
+Train step of epoch 0:  14%|█▎        | 873/6434 [2:02:32<13:39:17,  8.84s/it, gpt_loss=0.308, loss_mean=0.382][A
+Train step of epoch 0:  14%|█▎        | 874/6434 [2:02:32<13:09:08,  8.52s/it, gpt_loss=0.308, loss_mean=0.382][A
+Train step of epoch 0:  14%|█▎        | 874/6434 [2:02:40<13:09:08,  8.52s/it, gpt_loss=0.282, loss_mean=0.372][A
+Train step of epoch 0:  14%|█▎        | 875/6434 [2:02:40<13:06:56,  8.49s/it, gpt_loss=0.282, loss_mean=0.372][A
+Train step of epoch 0:  14%|█▎        | 875/6434 [2:02:49<13:06:56,  8.49s/it, gpt_loss=0.327, loss_mean=0.368][A
+Train step of epoch 0:  14%|█▎        | 876/6434 [2:02:49<13:25:20,  8.69s/it, gpt_loss=0.327, loss_mean=0.368][A
+Train step of epoch 0:  14%|█▎        | 876/6434 [2:02:59<13:25:20,  8.69s/it, gpt_loss=0.322, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▎        | 877/6434 [2:02:59<13:50:31,  8.97s/it, gpt_loss=0.322, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▎        | 877/6434 [2:03:09<13:50:31,  8.97s/it, gpt_loss=0.281, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▎        | 878/6434 [2:03:09<14:20:27,  9.29s/it, gpt_loss=0.281, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▎        | 878/6434 [2:03:18<14:20:27,  9.29s/it, gpt_loss=0.468, loss_mean=0.366][A
+Train step of epoch 0:  14%|█▎        | 879/6434 [2:03:18<14:14:42,  9.23s/it, gpt_loss=0.468, loss_mean=0.366][A
+[LID Router Debug] Step: 880
+Batch Size: 10
+Audio Batch Size: 175
+LID Assignments: [9, 3, 4, 3, 3, 4, 9, 0, 9, 1]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:  14%|█▎        | 879/6434 [2:03:29<14:14:42,  9.23s/it, gpt_loss=0.347, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▎        | 880/6434 [2:03:29<14:50:57,  9.63s/it, gpt_loss=0.347, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▎        | 880/6434 [2:03:36<14:50:57,  9.63s/it, gpt_loss=0.372, loss_mean=0.365][A
+Train step of epoch 0:  14%|█▎        | 881/6434 [2:03:36<13:57:44,  9.05s/it, gpt_loss=0.372, loss_mean=0.365][A
+Train step of epoch 0:  14%|█▎        | 881/6434 [2:03:45<13:57:44,  9.05s/it, gpt_loss=0.332, loss_mean=0.362][A
+Train step of epoch 0:  14%|█▎        | 882/6434 [2:03:45<13:49:59,  8.97s/it, gpt_loss=0.332, loss_mean=0.362][A
+Train step of epoch 0:  14%|█▎        | 882/6434 [2:03:53<13:49:59,  8.97s/it, gpt_loss=0.324, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▎        | 883/6434 [2:03:54<13:29:43,  8.75s/it, gpt_loss=0.324, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▎        | 883/6434 [2:04:03<13:29:43,  8.75s/it, gpt_loss=0.334, loss_mean=0.356][A
+Train step of epoch 0:  14%|█▎        | 884/6434 [2:04:03<13:47:06,  8.94s/it, gpt_loss=0.334, loss_mean=0.356][A
+Train step of epoch 0:  14%|█▎        | 884/6434 [2:04:11<13:47:06,  8.94s/it, gpt_loss=0.357, loss_mean=0.356][A
+Train step of epoch 0:  14%|█▍        | 885/6434 [2:04:11<13:36:02,  8.82s/it, gpt_loss=0.357, loss_mean=0.356][A
+Train step of epoch 0:  14%|█▍        | 885/6434 [2:04:19<13:36:02,  8.82s/it, gpt_loss=0.406, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 886/6434 [2:04:19<13:11:21,  8.56s/it, gpt_loss=0.406, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 886/6434 [2:04:29<13:11:21,  8.56s/it, gpt_loss=0.393, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▍        | 887/6434 [2:04:29<13:41:17,  8.88s/it, gpt_loss=0.393, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▍        | 887/6434 [2:04:38<13:41:17,  8.88s/it, gpt_loss=0.485, loss_mean=0.376][A
+Train step of epoch 0:  14%|█▍        | 888/6434 [2:04:38<13:32:19,  8.79s/it, gpt_loss=0.485, loss_mean=0.376][A
+Train step of epoch 0:  14%|█▍        | 888/6434 [2:04:46<13:32:19,  8.79s/it, gpt_loss=0.291, loss_mean=0.368][A
+Train step of epoch 0:  14%|█▍        | 889/6434 [2:04:46<13:29:26,  8.76s/it, gpt_loss=0.291, loss_mean=0.368][A
+[LID Router Debug] Step: 890
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [2, 4, 6, 5, 4, 4, 2, 1, 0, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  14%|█▍        | 889/6434 [2:04:55<13:29:26,  8.76s/it, gpt_loss=0.417, loss_mean=0.373][A
+Train step of epoch 0:  14%|█▍        | 890/6434 [2:04:55<13:17:15,  8.63s/it, gpt_loss=0.417, loss_mean=0.373][A
+Train step of epoch 0:  14%|█▍        | 890/6434 [2:05:03<13:17:15,  8.63s/it, gpt_loss=0.348, loss_mean=0.37] [A
+Train step of epoch 0:  14%|█▍        | 891/6434 [2:05:03<13:00:51,  8.45s/it, gpt_loss=0.348, loss_mean=0.37][A
+Train step of epoch 0:  14%|█▍        | 891/6434 [2:05:11<13:00:51,  8.45s/it, gpt_loss=0.243, loss_mean=0.357][A
+Train step of epoch 0:  14%|█▍        | 892/6434 [2:05:11<12:56:31,  8.41s/it, gpt_loss=0.243, loss_mean=0.357][A
+Train step of epoch 0:  14%|█▍        | 892/6434 [2:05:19<12:56:31,  8.41s/it, gpt_loss=0.316, loss_mean=0.353][A
+Train step of epoch 0:  14%|█▍        | 893/6434 [2:05:19<12:55:55,  8.40s/it, gpt_loss=0.316, loss_mean=0.353][A
+Train step of epoch 0:  14%|█▍        | 893/6434 [2:05:28<12:55:55,  8.40s/it, gpt_loss=0.435, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 894/6434 [2:05:28<12:50:46,  8.35s/it, gpt_loss=0.435, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 894/6434 [2:05:37<12:50:46,  8.35s/it, gpt_loss=0.362, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 895/6434 [2:05:37<13:32:37,  8.80s/it, gpt_loss=0.362, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 895/6434 [2:05:46<13:32:37,  8.80s/it, gpt_loss=0.276, loss_mean=0.353][A
+Train step of epoch 0:  14%|█▍        | 896/6434 [2:05:46<13:13:38,  8.60s/it, gpt_loss=0.276, loss_mean=0.353][A
+Train step of epoch 0:  14%|█▍        | 896/6434 [2:05:55<13:13:38,  8.60s/it, gpt_loss=0.412, loss_mean=0.359][A
+Train step of epoch 0:  14%|█▍        | 897/6434 [2:05:55<13:48:48,  8.98s/it, gpt_loss=0.412, loss_mean=0.359][A
+Train step of epoch 0:  14%|█▍        | 897/6434 [2:06:04<13:48:48,  8.98s/it, gpt_loss=0.383, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 898/6434 [2:06:04<13:33:01,  8.81s/it, gpt_loss=0.383, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 898/6434 [2:06:13<13:33:01,  8.81s/it, gpt_loss=0.323, loss_mean=0.357][A
+Train step of epoch 0:  14%|█▍        | 899/6434 [2:06:13<13:41:43,  8.91s/it, gpt_loss=0.323, loss_mean=0.357][A
+[LID Router Debug] Step: 900
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 4, 9, 4, 5, 3, 4, 4, 2, 9]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  14%|█▍        | 899/6434 [2:06:22<13:41:43,  8.91s/it, gpt_loss=0.363, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 900/6434 [2:06:22<13:41:46,  8.91s/it, gpt_loss=0.363, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 900/6434 [2:06:30<13:41:46,  8.91s/it, gpt_loss=0.245, loss_mean=0.347][A
+Train step of epoch 0:  14%|█▍        | 901/6434 [2:06:30<13:12:03,  8.59s/it, gpt_loss=0.245, loss_mean=0.347][A
+Train step of epoch 0:  14%|█▍        | 901/6434 [2:06:37<13:12:03,  8.59s/it, gpt_loss=0.326, loss_mean=0.345][A
+Train step of epoch 0:  14%|█▍        | 902/6434 [2:06:37<12:45:49,  8.31s/it, gpt_loss=0.326, loss_mean=0.345][A
+Train step of epoch 0:  14%|█▍        | 902/6434 [2:06:45<12:45:49,  8.31s/it, gpt_loss=0.387, loss_mean=0.349][A
+Train step of epoch 0:  14%|█▍        | 903/6434 [2:06:45<12:39:26,  8.24s/it, gpt_loss=0.387, loss_mean=0.349][A
+Train step of epoch 0:  14%|█▍        | 903/6434 [2:06:53<12:39:26,  8.24s/it, gpt_loss=0.357, loss_mean=0.35] [A
+Train step of epoch 0:  14%|█▍        | 904/6434 [2:06:53<12:08:59,  7.91s/it, gpt_loss=0.357, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 904/6434 [2:07:01<12:08:59,  7.91s/it, gpt_loss=0.432, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 905/6434 [2:07:01<12:28:08,  8.12s/it, gpt_loss=0.432, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 905/6434 [2:07:10<12:28:08,  8.12s/it, gpt_loss=0.39, loss_mean=0.361] [A
+Train step of epoch 0:  14%|█▍        | 906/6434 [2:07:10<12:54:24,  8.41s/it, gpt_loss=0.39, loss_mean=0.361][A
+Train step of epoch 0:  14%|█▍        | 906/6434 [2:07:20<12:54:24,  8.41s/it, gpt_loss=0.416, loss_mean=0.367][A
+Train step of epoch 0:  14%|█▍        | 907/6434 [2:07:20<13:28:18,  8.77s/it, gpt_loss=0.416, loss_mean=0.367][A
+Train step of epoch 0:  14%|█▍        | 907/6434 [2:07:28<13:28:18,  8.77s/it, gpt_loss=0.366, loss_mean=0.367][A
+Train step of epoch 0:  14%|█▍        | 908/6434 [2:07:28<13:10:39,  8.58s/it, gpt_loss=0.366, loss_mean=0.367][A
+Train step of epoch 0:  14%|█▍        | 908/6434 [2:07:37<13:10:39,  8.58s/it, gpt_loss=0.369, loss_mean=0.367][A
+Train step of epoch 0:  14%|█▍        | 909/6434 [2:07:37<13:10:06,  8.58s/it, gpt_loss=0.369, loss_mean=0.367][A
+[LID Router Debug] Step: 910
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [3, 1, 0, 3, 1, 1, 1, 2, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3}
+
+Train step of epoch 0:  14%|█▍        | 909/6434 [2:07:45<13:10:06,  8.58s/it, gpt_loss=0.281, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 910/6434 [2:07:45<13:02:16,  8.50s/it, gpt_loss=0.281, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 910/6434 [2:07:54<13:02:16,  8.50s/it, gpt_loss=0.354, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 911/6434 [2:07:54<13:08:52,  8.57s/it, gpt_loss=0.354, loss_mean=0.358][A
+Train step of epoch 0:  14%|█▍        | 911/6434 [2:08:02<13:08:52,  8.57s/it, gpt_loss=0.328, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▍        | 912/6434 [2:08:02<13:07:47,  8.56s/it, gpt_loss=0.328, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▍        | 912/6434 [2:08:11<13:07:47,  8.56s/it, gpt_loss=0.319, loss_mean=0.351][A
+Train step of epoch 0:  14%|█▍        | 913/6434 [2:08:11<13:03:35,  8.52s/it, gpt_loss=0.319, loss_mean=0.351][A
+Train step of epoch 0:  14%|█▍        | 913/6434 [2:08:19<13:03:35,  8.52s/it, gpt_loss=0.31, loss_mean=0.347] [A
+Train step of epoch 0:  14%|█▍        | 914/6434 [2:08:19<12:55:26,  8.43s/it, gpt_loss=0.31, loss_mean=0.347][A
+Train step of epoch 0:  14%|█▍        | 914/6434 [2:08:27<12:55:26,  8.43s/it, gpt_loss=0.371, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 915/6434 [2:08:27<12:40:57,  8.27s/it, gpt_loss=0.371, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 915/6434 [2:08:35<12:40:57,  8.27s/it, gpt_loss=0.352, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 916/6434 [2:08:35<12:53:49,  8.41s/it, gpt_loss=0.352, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 916/6434 [2:08:44<12:53:49,  8.41s/it, gpt_loss=0.364, loss_mean=0.351][A
+Train step of epoch 0:  14%|█▍        | 917/6434 [2:08:44<13:07:40,  8.57s/it, gpt_loss=0.364, loss_mean=0.351][A
+Train step of epoch 0:  14%|█▍        | 917/6434 [2:08:53<13:07:40,  8.57s/it, gpt_loss=0.36, loss_mean=0.352] [A
+Train step of epoch 0:  14%|█▍        | 918/6434 [2:08:53<12:55:35,  8.44s/it, gpt_loss=0.36, loss_mean=0.352][A
+Train step of epoch 0:  14%|█▍        | 918/6434 [2:09:01<12:55:35,  8.44s/it, gpt_loss=0.333, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 919/6434 [2:09:01<12:42:59,  8.30s/it, gpt_loss=0.333, loss_mean=0.35][A
+[LID Router Debug] Step: 920
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [3, 5, 9, 8, 3, 1, 1, 5, 5, 1]
+Active Experts in Batch: {1, 3, 5, 8, 9}
+
+Train step of epoch 0:  14%|█▍        | 919/6434 [2:09:09<12:42:59,  8.30s/it, gpt_loss=0.351, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 920/6434 [2:09:09<12:44:36,  8.32s/it, gpt_loss=0.351, loss_mean=0.35][A
+Train step of epoch 0:  14%|█▍        | 920/6434 [2:09:17<12:44:36,  8.32s/it, gpt_loss=0.403, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▍        | 921/6434 [2:09:17<12:38:11,  8.25s/it, gpt_loss=0.403, loss_mean=0.355][A
+Train step of epoch 0:  14%|█▍        | 921/6434 [2:09:25<12:38:11,  8.25s/it, gpt_loss=0.432, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▍        | 922/6434 [2:09:25<12:45:24,  8.33s/it, gpt_loss=0.432, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▍        | 922/6434 [2:09:34<12:45:24,  8.33s/it, gpt_loss=0.354, loss_mean=0.362][A
+Train step of epoch 0:  14%|█▍        | 923/6434 [2:09:34<12:40:32,  8.28s/it, gpt_loss=0.354, loss_mean=0.362][A
+Train step of epoch 0:  14%|█▍        | 923/6434 [2:09:42<12:40:32,  8.28s/it, gpt_loss=0.377, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▍        | 924/6434 [2:09:42<12:42:01,  8.30s/it, gpt_loss=0.377, loss_mean=0.364][A
+Train step of epoch 0:  14%|█▍        | 924/6434 [2:09:50<12:42:01,  8.30s/it, gpt_loss=0.405, loss_mean=0.368][A
+Train step of epoch 0:  14%|█▍        | 925/6434 [2:09:50<12:30:05,  8.17s/it, gpt_loss=0.405, loss_mean=0.368][A
+Train step of epoch 0:  14%|█▍        | 925/6434 [2:09:59<12:30:05,  8.17s/it, gpt_loss=0.476, loss_mean=0.379][A
+Train step of epoch 0:  14%|█▍        | 926/6434 [2:09:59<12:58:57,  8.49s/it, gpt_loss=0.476, loss_mean=0.379][A
+Train step of epoch 0:  14%|█▍        | 926/6434 [2:10:08<12:58:57,  8.49s/it, gpt_loss=0.329, loss_mean=0.374][A
+Train step of epoch 0:  14%|█▍        | 927/6434 [2:10:08<12:58:31,  8.48s/it, gpt_loss=0.329, loss_mean=0.374][A
+Train step of epoch 0:  14%|█▍        | 927/6434 [2:10:16<12:58:31,  8.48s/it, gpt_loss=0.406, loss_mean=0.377][A
+Train step of epoch 0:  14%|█▍        | 928/6434 [2:10:16<12:55:54,  8.46s/it, gpt_loss=0.406, loss_mean=0.377][A
+Train step of epoch 0:  14%|█▍        | 928/6434 [2:10:25<12:55:54,  8.46s/it, gpt_loss=0.323, loss_mean=0.371][A
+Train step of epoch 0:  14%|█▍        | 929/6434 [2:10:25<13:01:00,  8.51s/it, gpt_loss=0.323, loss_mean=0.371][A
+[LID Router Debug] Step: 930
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [0, 9, 0, 5, 5, 3, 5, 9, 1, 10]
+Active Experts in Batch: {0, 1, 3, 5, 9, 10}
+
+Train step of epoch 0:  14%|█▍        | 929/6434 [2:10:34<13:01:00,  8.51s/it, gpt_loss=0.401, loss_mean=0.374][A
+Train step of epoch 0:  14%|█▍        | 930/6434 [2:10:34<13:31:28,  8.85s/it, gpt_loss=0.401, loss_mean=0.374][A
+Train step of epoch 0:  14%|█▍        | 930/6434 [2:10:42<13:31:28,  8.85s/it, gpt_loss=0.265, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▍        | 931/6434 [2:10:42<13:03:09,  8.54s/it, gpt_loss=0.265, loss_mean=0.363][A
+Train step of epoch 0:  14%|█▍        | 931/6434 [2:10:50<13:03:09,  8.54s/it, gpt_loss=0.422, loss_mean=0.369][A
+Train step of epoch 0:  14%|█▍        | 932/6434 [2:10:50<12:50:21,  8.40s/it, gpt_loss=0.422, loss_mean=0.369][A
+Train step of epoch 0:  14%|█▍        | 932/6434 [2:10:58<12:50:21,  8.40s/it, gpt_loss=0.353, loss_mean=0.368][A
+Train step of epoch 0:  15%|█▍        | 933/6434 [2:10:58<12:36:48,  8.25s/it, gpt_loss=0.353, loss_mean=0.368][A
+Train step of epoch 0:  15%|█▍        | 933/6434 [2:11:06<12:36:48,  8.25s/it, gpt_loss=0.375, loss_mean=0.368][A
+Train step of epoch 0:  15%|█▍        | 934/6434 [2:11:06<12:22:40,  8.10s/it, gpt_loss=0.375, loss_mean=0.368][A
+Train step of epoch 0:  15%|█▍        | 934/6434 [2:11:15<12:22:40,  8.10s/it, gpt_loss=0.3, loss_mean=0.362]  [A
+Train step of epoch 0:  15%|█▍        | 935/6434 [2:11:15<12:50:37,  8.41s/it, gpt_loss=0.3, loss_mean=0.362][A
+Train step of epoch 0:  15%|█▍        | 935/6434 [2:11:24<12:50:37,  8.41s/it, gpt_loss=0.404, loss_mean=0.366][A
+Train step of epoch 0:  15%|█▍        | 936/6434 [2:11:24<13:19:33,  8.73s/it, gpt_loss=0.404, loss_mean=0.366][A
+Train step of epoch 0:  15%|█▍        | 936/6434 [2:11:33<13:19:33,  8.73s/it, gpt_loss=0.322, loss_mean=0.361][A
+Train step of epoch 0:  15%|█▍        | 937/6434 [2:11:33<13:12:28,  8.65s/it, gpt_loss=0.322, loss_mean=0.361][A
+Train step of epoch 0:  15%|█▍        | 937/6434 [2:11:42<13:12:28,  8.65s/it, gpt_loss=0.459, loss_mean=0.371][A
+Train step of epoch 0:  15%|█▍        | 938/6434 [2:11:42<13:22:29,  8.76s/it, gpt_loss=0.459, loss_mean=0.371][A
+Train step of epoch 0:  15%|█▍        | 938/6434 [2:11:50<13:22:29,  8.76s/it, gpt_loss=0.332, loss_mean=0.367][A
+Train step of epoch 0:  15%|█▍        | 939/6434 [2:11:50<13:11:17,  8.64s/it, gpt_loss=0.332, loss_mean=0.367][A
+[LID Router Debug] Step: 940
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [9, 6, 9, 1, 2, 5, 2, 9, 6, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  15%|█▍        | 939/6434 [2:11:59<13:11:17,  8.64s/it, gpt_loss=0.352, loss_mean=0.366][A
+Train step of epoch 0:  15%|█▍        | 940/6434 [2:11:59<13:24:04,  8.78s/it, gpt_loss=0.352, loss_mean=0.366][A
+Train step of epoch 0:  15%|█▍        | 940/6434 [2:12:07<13:24:04,  8.78s/it, gpt_loss=0.345, loss_mean=0.364][A
+Train step of epoch 0:  15%|█▍        | 941/6434 [2:12:07<13:02:04,  8.54s/it, gpt_loss=0.345, loss_mean=0.364][A
+Train step of epoch 0:  15%|█▍        | 941/6434 [2:12:15<13:02:04,  8.54s/it, gpt_loss=0.4, loss_mean=0.367]  [A
+Train step of epoch 0:  15%|█▍        | 942/6434 [2:12:15<12:48:58,  8.40s/it, gpt_loss=0.4, loss_mean=0.367][A
+Train step of epoch 0:  15%|█▍        | 942/6434 [2:12:24<12:48:58,  8.40s/it, gpt_loss=0.411, loss_mean=0.372][A
+Train step of epoch 0:  15%|█▍        | 943/6434 [2:12:24<13:01:02,  8.53s/it, gpt_loss=0.411, loss_mean=0.372][A
+Train step of epoch 0:  15%|█▍        | 943/6434 [2:12:32<13:01:02,  8.53s/it, gpt_loss=0.3, loss_mean=0.364]  [A
+Train step of epoch 0:  15%|█▍        | 944/6434 [2:12:32<12:49:39,  8.41s/it, gpt_loss=0.3, loss_mean=0.364][A
+Train step of epoch 0:  15%|█▍        | 944/6434 [2:12:41<12:49:39,  8.41s/it, gpt_loss=0.319, loss_mean=0.36][A
+Train step of epoch 0:  15%|█▍        | 945/6434 [2:12:41<12:51:24,  8.43s/it, gpt_loss=0.319, loss_mean=0.36][A
+Train step of epoch 0:  15%|█▍        | 945/6434 [2:12:49<12:51:24,  8.43s/it, gpt_loss=0.279, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▍        | 946/6434 [2:12:49<12:53:31,  8.46s/it, gpt_loss=0.279, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▍        | 946/6434 [2:12:59<12:53:31,  8.46s/it, gpt_loss=0.439, loss_mean=0.36] [A
+Train step of epoch 0:  15%|█▍        | 947/6434 [2:12:59<13:18:44,  8.73s/it, gpt_loss=0.439, loss_mean=0.36][A
+Train step of epoch 0:  15%|█▍        | 947/6434 [2:13:06<13:18:44,  8.73s/it, gpt_loss=0.396, loss_mean=0.364][A
+Train step of epoch 0:  15%|█▍        | 948/6434 [2:13:06<12:44:07,  8.36s/it, gpt_loss=0.396, loss_mean=0.364][A
+Train step of epoch 0:  15%|█▍        | 948/6434 [2:13:14<12:44:07,  8.36s/it, gpt_loss=0.342, loss_mean=0.362][A
+Train step of epoch 0:  15%|█▍        | 949/6434 [2:13:14<12:28:29,  8.19s/it, gpt_loss=0.342, loss_mean=0.362][A
+[LID Router Debug] Step: 950
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 1, 6, 3, 9, 2, 2, 0, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  15%|█▍        | 949/6434 [2:13:25<12:28:29,  8.19s/it, gpt_loss=0.333, loss_mean=0.359][A
+Train step of epoch 0:  15%|█▍        | 950/6434 [2:13:25<13:32:51,  8.89s/it, gpt_loss=0.333, loss_mean=0.359][A
+Train step of epoch 0:  15%|█▍        | 950/6434 [2:13:33<13:32:51,  8.89s/it, gpt_loss=0.327, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▍        | 951/6434 [2:13:33<13:07:15,  8.61s/it, gpt_loss=0.327, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▍        | 951/6434 [2:13:42<13:07:15,  8.61s/it, gpt_loss=0.311, loss_mean=0.351][A
+Train step of epoch 0:  15%|█▍        | 952/6434 [2:13:42<13:31:41,  8.88s/it, gpt_loss=0.311, loss_mean=0.351][A
+Train step of epoch 0:  15%|█▍        | 952/6434 [2:13:50<13:31:41,  8.88s/it, gpt_loss=0.299, loss_mean=0.346][A
+Train step of epoch 0:  15%|█▍        | 953/6434 [2:13:50<13:19:08,  8.75s/it, gpt_loss=0.299, loss_mean=0.346][A
+Train step of epoch 0:  15%|█▍        | 953/6434 [2:13:58<13:19:08,  8.75s/it, gpt_loss=0.372, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▍        | 954/6434 [2:13:58<12:56:49,  8.51s/it, gpt_loss=0.372, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▍        | 954/6434 [2:14:06<12:56:49,  8.51s/it, gpt_loss=0.309, loss_mean=0.345][A
+Train step of epoch 0:  15%|█▍        | 955/6434 [2:14:06<12:41:20,  8.34s/it, gpt_loss=0.309, loss_mean=0.345][A
+Train step of epoch 0:  15%|█▍        | 955/6434 [2:14:15<12:41:20,  8.34s/it, gpt_loss=0.43, loss_mean=0.353] [A
+Train step of epoch 0:  15%|█▍        | 956/6434 [2:14:15<12:55:05,  8.49s/it, gpt_loss=0.43, loss_mean=0.353][A
+Train step of epoch 0:  15%|█▍        | 956/6434 [2:14:25<12:55:05,  8.49s/it, gpt_loss=0.354, loss_mean=0.353][A
+Train step of epoch 0:  15%|█▍        | 957/6434 [2:14:25<13:30:01,  8.87s/it, gpt_loss=0.354, loss_mean=0.353][A
+Train step of epoch 0:  15%|█▍        | 957/6434 [2:14:33<13:30:01,  8.87s/it, gpt_loss=0.317, loss_mean=0.35] [A
+Train step of epoch 0:  15%|█▍        | 958/6434 [2:14:33<13:11:05,  8.67s/it, gpt_loss=0.317, loss_mean=0.35][A
+Train step of epoch 0:  15%|█▍        | 958/6434 [2:14:41<13:11:05,  8.67s/it, gpt_loss=0.336, loss_mean=0.348][A
+Train step of epoch 0:  15%|█▍        | 959/6434 [2:14:41<12:54:00,  8.48s/it, gpt_loss=0.336, loss_mean=0.348][A
+[LID Router Debug] Step: 960
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 9, 9, 3, 6, 6, 4, 0, 4, 0]
+Active Experts in Batch: {0, 3, 4, 6, 9}
+
+Train step of epoch 0:  15%|█▍        | 959/6434 [2:14:50<12:54:00,  8.48s/it, gpt_loss=0.43, loss_mean=0.357] [A
+Train step of epoch 0:  15%|█▍        | 960/6434 [2:14:50<13:02:26,  8.58s/it, gpt_loss=0.43, loss_mean=0.357][A
+Train step of epoch 0:  15%|█▍        | 960/6434 [2:14:59<13:02:26,  8.58s/it, gpt_loss=0.378, loss_mean=0.359][A
+Train step of epoch 0:  15%|█▍        | 961/6434 [2:14:59<13:04:24,  8.60s/it, gpt_loss=0.378, loss_mean=0.359][A
+Train step of epoch 0:  15%|█▍        | 961/6434 [2:15:07<13:04:24,  8.60s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▍        | 962/6434 [2:15:07<13:07:02,  8.63s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▍        | 962/6434 [2:15:15<13:07:02,  8.63s/it, gpt_loss=0.402, loss_mean=0.36] [A
+Train step of epoch 0:  15%|█▍        | 963/6434 [2:15:15<12:27:24,  8.20s/it, gpt_loss=0.402, loss_mean=0.36][A
+Train step of epoch 0:  15%|█▍        | 963/6434 [2:15:22<12:27:24,  8.20s/it, gpt_loss=0.364, loss_mean=0.361][A
+Train step of epoch 0:  15%|█▍        | 964/6434 [2:15:22<12:18:47,  8.10s/it, gpt_loss=0.364, loss_mean=0.361][A
+Train step of epoch 0:  15%|█▍        | 964/6434 [2:15:30<12:18:47,  8.10s/it, gpt_loss=0.439, loss_mean=0.369][A
+Train step of epoch 0:  15%|█▍        | 965/6434 [2:15:30<12:15:05,  8.06s/it, gpt_loss=0.439, loss_mean=0.369][A
+Train step of epoch 0:  15%|█▍        | 965/6434 [2:15:40<12:15:05,  8.06s/it, gpt_loss=0.331, loss_mean=0.365][A
+Train step of epoch 0:  15%|█▌        | 966/6434 [2:15:40<12:49:25,  8.44s/it, gpt_loss=0.331, loss_mean=0.365][A
+Train step of epoch 0:  15%|█▌        | 966/6434 [2:15:47<12:49:25,  8.44s/it, gpt_loss=0.255, loss_mean=0.354][A
+Train step of epoch 0:  15%|█▌        | 967/6434 [2:15:47<12:22:39,  8.15s/it, gpt_loss=0.255, loss_mean=0.354][A
+Train step of epoch 0:  15%|█▌        | 967/6434 [2:15:55<12:22:39,  8.15s/it, gpt_loss=0.336, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▌        | 968/6434 [2:15:55<12:19:09,  8.11s/it, gpt_loss=0.336, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▌        | 968/6434 [2:16:05<12:19:09,  8.11s/it, gpt_loss=0.295, loss_mean=0.346][A
+Train step of epoch 0:  15%|█▌        | 969/6434 [2:16:05<13:06:20,  8.63s/it, gpt_loss=0.295, loss_mean=0.346][A
+[LID Router Debug] Step: 970
+Batch Size: 10
+Audio Batch Size: 142
+LID Assignments: [4, 0, 9, 4, 9, 2, 2, 3, 3, 9]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+Train step of epoch 0:  15%|█▌        | 969/6434 [2:16:14<13:06:20,  8.63s/it, gpt_loss=0.465, loss_mean=0.358][A
+Train step of epoch 0:  15%|█▌        | 970/6434 [2:16:14<13:21:40,  8.80s/it, gpt_loss=0.465, loss_mean=0.358][A
+Train step of epoch 0:  15%|█▌        | 970/6434 [2:16:23<13:21:40,  8.80s/it, gpt_loss=0.333, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▌        | 971/6434 [2:16:23<13:13:25,  8.71s/it, gpt_loss=0.333, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▌        | 971/6434 [2:16:30<13:13:25,  8.71s/it, gpt_loss=0.368, loss_mean=0.357][A
+Train step of epoch 0:  15%|█▌        | 972/6434 [2:16:30<12:44:38,  8.40s/it, gpt_loss=0.368, loss_mean=0.357][A
+Train step of epoch 0:  15%|█▌        | 972/6434 [2:16:40<12:44:38,  8.40s/it, gpt_loss=0.326, loss_mean=0.354][A
+Train step of epoch 0:  15%|█▌        | 973/6434 [2:16:40<13:04:15,  8.62s/it, gpt_loss=0.326, loss_mean=0.354][A
+Train step of epoch 0:  15%|█▌        | 973/6434 [2:16:49<13:04:15,  8.62s/it, gpt_loss=0.436, loss_mean=0.362][A
+Train step of epoch 0:  15%|█▌        | 974/6434 [2:16:49<13:34:00,  8.95s/it, gpt_loss=0.436, loss_mean=0.362][A
+Train step of epoch 0:  15%|█▌        | 974/6434 [2:16:58<13:34:00,  8.95s/it, gpt_loss=0.311, loss_mean=0.357][A
+Train step of epoch 0:  15%|█▌        | 975/6434 [2:16:58<13:20:25,  8.80s/it, gpt_loss=0.311, loss_mean=0.357][A
+Train step of epoch 0:  15%|█▌        | 975/6434 [2:17:06<13:20:25,  8.80s/it, gpt_loss=0.281, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▌        | 976/6434 [2:17:06<13:11:27,  8.70s/it, gpt_loss=0.281, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▌        | 976/6434 [2:17:15<13:11:27,  8.70s/it, gpt_loss=0.34, loss_mean=0.348] [A
+Train step of epoch 0:  15%|█▌        | 977/6434 [2:17:15<13:06:32,  8.65s/it, gpt_loss=0.34, loss_mean=0.348][A
+Train step of epoch 0:  15%|█▌        | 977/6434 [2:17:24<13:06:32,  8.65s/it, gpt_loss=0.362, loss_mean=0.35][A
+Train step of epoch 0:  15%|█▌        | 978/6434 [2:17:24<13:31:28,  8.92s/it, gpt_loss=0.362, loss_mean=0.35][A
+Train step of epoch 0:  15%|█▌        | 978/6434 [2:17:33<13:31:28,  8.92s/it, gpt_loss=0.364, loss_mean=0.351][A
+Train step of epoch 0:  15%|█▌        | 979/6434 [2:17:33<13:35:49,  8.97s/it, gpt_loss=0.364, loss_mean=0.351][A
+[LID Router Debug] Step: 980
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [4, 3, 3, 9, 4, 1, 0, 6, 5, 9]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  15%|█▌        | 979/6434 [2:17:43<13:35:49,  8.97s/it, gpt_loss=0.402, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▌        | 980/6434 [2:17:43<13:51:31,  9.15s/it, gpt_loss=0.402, loss_mean=0.356][A
+Train step of epoch 0:  15%|█▌        | 980/6434 [2:17:51<13:51:31,  9.15s/it, gpt_loss=0.302, loss_mean=0.351][A
+Train step of epoch 0:  15%|█▌        | 981/6434 [2:17:51<13:25:55,  8.87s/it, gpt_loss=0.302, loss_mean=0.351][A
+Train step of epoch 0:  15%|█▌        | 981/6434 [2:17:59<13:25:55,  8.87s/it, gpt_loss=0.359, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▌        | 982/6434 [2:17:59<13:00:13,  8.59s/it, gpt_loss=0.359, loss_mean=0.352][A
+Train step of epoch 0:  15%|█▌        | 982/6434 [2:18:07<13:00:13,  8.59s/it, gpt_loss=0.325, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▌        | 983/6434 [2:18:07<12:29:08,  8.25s/it, gpt_loss=0.325, loss_mean=0.349][A
+Train step of epoch 0:  15%|█▌        | 983/6434 [2:18:15<12:29:08,  8.25s/it, gpt_loss=0.325, loss_mean=0.347][A
+Train step of epoch 0:  15%|█▌        | 984/6434 [2:18:15<12:35:14,  8.31s/it, gpt_loss=0.325, loss_mean=0.347][A
+Train step of epoch 0:  15%|█▌        | 984/6434 [2:18:23<12:35:14,  8.31s/it, gpt_loss=0.362, loss_mean=0.348][A
+Train step of epoch 0:  15%|█▌        | 985/6434 [2:18:23<12:25:37,  8.21s/it, gpt_loss=0.362, loss_mean=0.348][A
+Train step of epoch 0:  15%|█▌        | 985/6434 [2:18:31<12:25:37,  8.21s/it, gpt_loss=0.32, loss_mean=0.345] [A
+Train step of epoch 0:  15%|█▌        | 986/6434 [2:18:31<12:09:24,  8.03s/it, gpt_loss=0.32, loss_mean=0.345][A
+Train step of epoch 0:  15%|█▌        | 986/6434 [2:18:40<12:09:24,  8.03s/it, gpt_loss=0.361, loss_mean=0.347][A
+Train step of epoch 0:  15%|█▌        | 987/6434 [2:18:40<12:48:59,  8.47s/it, gpt_loss=0.361, loss_mean=0.347][A
+Train step of epoch 0:  15%|█▌        | 987/6434 [2:18:49<12:48:59,  8.47s/it, gpt_loss=0.314, loss_mean=0.344][A
+Train step of epoch 0:  15%|█▌        | 988/6434 [2:18:49<12:50:20,  8.49s/it, gpt_loss=0.314, loss_mean=0.344][A
+Train step of epoch 0:  15%|█▌        | 988/6434 [2:18:59<12:50:20,  8.49s/it, gpt_loss=0.25, loss_mean=0.334] [A
+Train step of epoch 0:  15%|█▌        | 989/6434 [2:18:59<13:31:45,  8.94s/it, gpt_loss=0.25, loss_mean=0.334][A
+[LID Router Debug] Step: 990
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [2, 4, 0, 3, 7, 6, 5, 7, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 7}
+
+Train step of epoch 0:  15%|█▌        | 989/6434 [2:19:08<13:31:45,  8.94s/it, gpt_loss=0.372, loss_mean=0.338][A
+Train step of epoch 0:  15%|█▌        | 990/6434 [2:19:08<13:39:31,  9.03s/it, gpt_loss=0.372, loss_mean=0.338][A
+Train step of epoch 0:  15%|█▌        | 990/6434 [2:19:15<13:39:31,  9.03s/it, gpt_loss=0.374, loss_mean=0.342][A
+Train step of epoch 0:  15%|█▌        | 991/6434 [2:19:15<13:00:36,  8.60s/it, gpt_loss=0.374, loss_mean=0.342][A
+Train step of epoch 0:  15%|█▌        | 991/6434 [2:19:24<13:00:36,  8.60s/it, gpt_loss=0.323, loss_mean=0.34] [A
+Train step of epoch 0:  15%|█▌        | 992/6434 [2:19:24<12:51:30,  8.51s/it, gpt_loss=0.323, loss_mean=0.34][A
+Train step of epoch 0:  15%|█▌        | 992/6434 [2:19:32<12:51:30,  8.51s/it, gpt_loss=0.275, loss_mean=0.333][A
+Train step of epoch 0:  15%|█▌        | 993/6434 [2:19:32<12:41:15,  8.39s/it, gpt_loss=0.275, loss_mean=0.333][A
+Train step of epoch 0:  15%|█▌        | 993/6434 [2:19:41<12:41:15,  8.39s/it, gpt_loss=0.362, loss_mean=0.336][A
+Train step of epoch 0:  15%|█▌        | 994/6434 [2:19:41<13:07:29,  8.69s/it, gpt_loss=0.362, loss_mean=0.336][A
+Train step of epoch 0:  15%|█▌        | 994/6434 [2:19:49<13:07:29,  8.69s/it, gpt_loss=0.347, loss_mean=0.337][A
+Train step of epoch 0:  15%|█▌        | 995/6434 [2:19:49<12:45:58,  8.45s/it, gpt_loss=0.347, loss_mean=0.337][A
+Train step of epoch 0:  15%|█▌        | 995/6434 [2:19:57<12:45:58,  8.45s/it, gpt_loss=0.317, loss_mean=0.335][A
+Train step of epoch 0:  15%|█▌        | 996/6434 [2:19:57<12:24:47,  8.22s/it, gpt_loss=0.317, loss_mean=0.335][A
+Train step of epoch 0:  15%|█▌        | 996/6434 [2:20:04<12:24:47,  8.22s/it, gpt_loss=0.311, loss_mean=0.333][A
+Train step of epoch 0:  15%|█▌        | 997/6434 [2:20:04<12:08:43,  8.04s/it, gpt_loss=0.311, loss_mean=0.333][A
+Train step of epoch 0:  15%|█▌        | 997/6434 [2:20:13<12:08:43,  8.04s/it, gpt_loss=0.323, loss_mean=0.332][A
+Train step of epoch 0:  16%|█▌        | 998/6434 [2:20:13<12:21:23,  8.18s/it, gpt_loss=0.323, loss_mean=0.332][A
+Train step of epoch 0:  16%|█▌        | 998/6434 [2:20:23<12:21:23,  8.18s/it, gpt_loss=0.264, loss_mean=0.325][A
+Train step of epoch 0:  16%|█▌        | 999/6434 [2:20:23<13:10:08,  8.72s/it, gpt_loss=0.264, loss_mean=0.325][A
+[LID Router Debug] Step: 1000
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 5, 3, 5, 5, 1, 4, 2, 4, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+[2026-02-06 18:16:34,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[1.991232541158971e-05, 1.991232541158971e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 18:16:34,992] [INFO] [timer.py:260:stop] epoch=0/micro_step=1000/global_step=500, RunningAvgSamplesPerSec=4.760262134805079, CurrSamplesPerSec=4.560649888999201, MemAllocated=12.7GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  16%|█▌        | 999/6434 [2:20:31<13:10:08,  8.72s/it, gpt_loss=0.343, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1000/6434 [2:20:31<12:39:14,  8.38s/it, gpt_loss=0.343, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1000/6434 [2:20:39<12:39:14,  8.38s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1001/6434 [2:20:39<12:48:55,  8.49s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1001/6434 [2:20:46<12:48:55,  8.49s/it, gpt_loss=0.301, loss_mean=0.325][A
+Train step of epoch 0:  16%|█▌        | 1002/6434 [2:20:46<12:07:33,  8.04s/it, gpt_loss=0.301, loss_mean=0.325][A
+Train step of epoch 0:  16%|█▌        | 1002/6434 [2:20:54<12:07:33,  8.04s/it, gpt_loss=0.348, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1003/6434 [2:20:54<12:11:58,  8.09s/it, gpt_loss=0.348, loss_mean=0.327][A
+Train step of epoch 0:  16%|█▌        | 1003/6434 [2:21:03<12:11:58,  8.09s/it, gpt_loss=0.457, loss_mean=0.34] [A
+Train step of epoch 0:  16%|█▌        | 1004/6434 [2:21:03<12:30:10,  8.29s/it, gpt_loss=0.457, loss_mean=0.34][A
+Train step of epoch 0:  16%|█▌        | 1004/6434 [2:21:12<12:30:10,  8.29s/it, gpt_loss=0.328, loss_mean=0.339][A
+Train step of epoch 0:  16%|█▌        | 1005/6434 [2:21:12<12:34:37,  8.34s/it, gpt_loss=0.328, loss_mean=0.339][A
+Train step of epoch 0:  16%|█▌        | 1005/6434 [2:21:20<12:34:37,  8.34s/it, gpt_loss=0.415, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1006/6434 [2:21:20<12:39:16,  8.39s/it, gpt_loss=0.415, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1006/6434 [2:21:28<12:39:16,  8.39s/it, gpt_loss=0.387, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1007/6434 [2:21:28<12:36:39,  8.37s/it, gpt_loss=0.387, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1007/6434 [2:21:37<12:36:39,  8.37s/it, gpt_loss=0.491, loss_mean=0.365][A
+Train step of epoch 0:  16%|█▌        | 1008/6434 [2:21:37<12:35:42,  8.36s/it, gpt_loss=0.491, loss_mean=0.365][A
+Train step of epoch 0:  16%|█▌        | 1008/6434 [2:21:45<12:35:42,  8.36s/it, gpt_loss=0.376, loss_mean=0.366][A
+Train step of epoch 0:  16%|█▌        | 1009/6434 [2:21:45<12:35:17,  8.35s/it, gpt_loss=0.376, loss_mean=0.366][A
+[LID Router Debug] Step: 1010
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [1, 4, 4, 1, 9, 4, 1, 9, 0, 0]
+Active Experts in Batch: {0, 1, 4, 9}
+
+Train step of epoch 0:  16%|█▌        | 1009/6434 [2:21:54<12:35:17,  8.35s/it, gpt_loss=0.298, loss_mean=0.359][A
+Train step of epoch 0:  16%|█▌        | 1010/6434 [2:21:54<12:39:25,  8.40s/it, gpt_loss=0.298, loss_mean=0.359][A
+Train step of epoch 0:  16%|█▌        | 1010/6434 [2:22:03<12:39:25,  8.40s/it, gpt_loss=0.274, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1011/6434 [2:22:03<12:53:53,  8.56s/it, gpt_loss=0.274, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1011/6434 [2:22:12<12:53:53,  8.56s/it, gpt_loss=0.296, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1012/6434 [2:22:12<13:10:39,  8.75s/it, gpt_loss=0.296, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1012/6434 [2:22:19<13:10:39,  8.75s/it, gpt_loss=0.379, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▌        | 1013/6434 [2:22:19<12:33:25,  8.34s/it, gpt_loss=0.379, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▌        | 1013/6434 [2:22:27<12:33:25,  8.34s/it, gpt_loss=0.35, loss_mean=0.349] [A
+Train step of epoch 0:  16%|█▌        | 1014/6434 [2:22:27<12:11:32,  8.10s/it, gpt_loss=0.35, loss_mean=0.349][A
+Train step of epoch 0:  16%|█▌        | 1014/6434 [2:22:34<12:11:32,  8.10s/it, gpt_loss=0.317, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1015/6434 [2:22:34<12:01:59,  7.99s/it, gpt_loss=0.317, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1015/6434 [2:22:42<12:01:59,  7.99s/it, gpt_loss=0.384, loss_mean=0.349][A
+Train step of epoch 0:  16%|█▌        | 1016/6434 [2:22:42<11:50:35,  7.87s/it, gpt_loss=0.384, loss_mean=0.349][A
+Train step of epoch 0:  16%|█▌        | 1016/6434 [2:22:50<11:50:35,  7.87s/it, gpt_loss=0.298, loss_mean=0.344][A
+Train step of epoch 0:  16%|█▌        | 1017/6434 [2:22:50<11:57:50,  7.95s/it, gpt_loss=0.298, loss_mean=0.344][A
+Train step of epoch 0:  16%|█▌        | 1017/6434 [2:22:58<11:57:50,  7.95s/it, gpt_loss=0.322, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1018/6434 [2:22:58<11:46:08,  7.82s/it, gpt_loss=0.322, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1018/6434 [2:23:05<11:46:08,  7.82s/it, gpt_loss=0.304, loss_mean=0.338][A
+Train step of epoch 0:  16%|█▌        | 1019/6434 [2:23:05<11:26:54,  7.61s/it, gpt_loss=0.304, loss_mean=0.338][A
+[LID Router Debug] Step: 1020
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [4, 3, 7, 3, 3, 5, 9, 1, 2, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 7, 9}
+
+Train step of epoch 0:  16%|█▌        | 1019/6434 [2:23:13<11:26:54,  7.61s/it, gpt_loss=0.299, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1020/6434 [2:23:13<11:53:21,  7.91s/it, gpt_loss=0.299, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1020/6434 [2:23:21<11:53:21,  7.91s/it, gpt_loss=0.336, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1021/6434 [2:23:21<11:54:19,  7.92s/it, gpt_loss=0.336, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1021/6434 [2:23:30<11:54:19,  7.92s/it, gpt_loss=0.325, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1022/6434 [2:23:30<12:00:10,  7.98s/it, gpt_loss=0.325, loss_mean=0.334][A
+Train step of epoch 0:  16%|█▌        | 1022/6434 [2:23:39<12:00:10,  7.98s/it, gpt_loss=0.324, loss_mean=0.333][A
+Train step of epoch 0:  16%|█▌        | 1023/6434 [2:23:39<12:28:11,  8.30s/it, gpt_loss=0.324, loss_mean=0.333][A
+Train step of epoch 0:  16%|█▌        | 1023/6434 [2:23:47<12:28:11,  8.30s/it, gpt_loss=0.396, loss_mean=0.339][A
+Train step of epoch 0:  16%|█▌        | 1024/6434 [2:23:47<12:41:07,  8.44s/it, gpt_loss=0.396, loss_mean=0.339][A
+Train step of epoch 0:  16%|█▌        | 1024/6434 [2:23:55<12:41:07,  8.44s/it, gpt_loss=0.42, loss_mean=0.347] [A
+Train step of epoch 0:  16%|█▌        | 1025/6434 [2:23:55<12:31:58,  8.34s/it, gpt_loss=0.42, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1025/6434 [2:24:03<12:31:58,  8.34s/it, gpt_loss=0.329, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1026/6434 [2:24:03<12:19:43,  8.21s/it, gpt_loss=0.329, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1026/6434 [2:24:12<12:19:43,  8.21s/it, gpt_loss=0.413, loss_mean=0.352][A
+Train step of epoch 0:  16%|█▌        | 1027/6434 [2:24:12<12:25:57,  8.28s/it, gpt_loss=0.413, loss_mean=0.352][A
+Train step of epoch 0:  16%|█▌        | 1027/6434 [2:24:20<12:25:57,  8.28s/it, gpt_loss=0.332, loss_mean=0.35] [A
+Train step of epoch 0:  16%|█▌        | 1028/6434 [2:24:20<12:23:55,  8.26s/it, gpt_loss=0.332, loss_mean=0.35][A
+Train step of epoch 0:  16%|█▌        | 1028/6434 [2:24:29<12:23:55,  8.26s/it, gpt_loss=0.298, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▌        | 1029/6434 [2:24:29<12:43:10,  8.47s/it, gpt_loss=0.298, loss_mean=0.345][A
+[LID Router Debug] Step: 1030
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [6, 1, 1, 1, 3, 3, 1, 9, 6, 9]
+Active Experts in Batch: {1, 3, 6, 9}
+
+Train step of epoch 0:  16%|█▌        | 1029/6434 [2:24:37<12:43:10,  8.47s/it, gpt_loss=0.371, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1030/6434 [2:24:37<12:32:29,  8.35s/it, gpt_loss=0.371, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1030/6434 [2:24:45<12:32:29,  8.35s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▌        | 1031/6434 [2:24:45<12:14:57,  8.16s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▌        | 1031/6434 [2:24:53<12:14:57,  8.16s/it, gpt_loss=0.229, loss_mean=0.336][A
+Train step of epoch 0:  16%|█▌        | 1032/6434 [2:24:53<12:26:13,  8.29s/it, gpt_loss=0.229, loss_mean=0.336][A
+Train step of epoch 0:  16%|█▌        | 1032/6434 [2:25:02<12:26:13,  8.29s/it, gpt_loss=0.276, loss_mean=0.33] [A
+Train step of epoch 0:  16%|█▌        | 1033/6434 [2:25:02<12:28:38,  8.32s/it, gpt_loss=0.276, loss_mean=0.33][A
+Train step of epoch 0:  16%|█▌        | 1033/6434 [2:25:10<12:28:38,  8.32s/it, gpt_loss=0.445, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1034/6434 [2:25:10<12:28:36,  8.32s/it, gpt_loss=0.445, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1034/6434 [2:25:18<12:28:36,  8.32s/it, gpt_loss=0.328, loss_mean=0.34] [A
+Train step of epoch 0:  16%|█▌        | 1035/6434 [2:25:18<12:28:49,  8.32s/it, gpt_loss=0.328, loss_mean=0.34][A
+Train step of epoch 0:  16%|█▌        | 1035/6434 [2:25:26<12:28:49,  8.32s/it, gpt_loss=0.313, loss_mean=0.338][A
+Train step of epoch 0:  16%|█▌        | 1036/6434 [2:25:26<12:08:37,  8.10s/it, gpt_loss=0.313, loss_mean=0.338][A
+Train step of epoch 0:  16%|█▌        | 1036/6434 [2:25:34<12:08:37,  8.10s/it, gpt_loss=0.334, loss_mean=0.337][A
+Train step of epoch 0:  16%|█▌        | 1037/6434 [2:25:34<12:21:07,  8.24s/it, gpt_loss=0.334, loss_mean=0.337][A
+Train step of epoch 0:  16%|█▌        | 1037/6434 [2:25:44<12:21:07,  8.24s/it, gpt_loss=0.388, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1038/6434 [2:25:44<12:44:41,  8.50s/it, gpt_loss=0.388, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1038/6434 [2:25:53<12:44:41,  8.50s/it, gpt_loss=0.377, loss_mean=0.346][A
+Train step of epoch 0:  16%|█▌        | 1039/6434 [2:25:53<12:55:57,  8.63s/it, gpt_loss=0.377, loss_mean=0.346][A
+[LID Router Debug] Step: 1040
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [5, 2, 3, 0, 6, 0, 4, 5, 9, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  16%|█▌        | 1039/6434 [2:26:01<12:55:57,  8.63s/it, gpt_loss=0.316, loss_mean=0.343][A
+Train step of epoch 0:  16%|█▌        | 1040/6434 [2:26:01<12:42:51,  8.49s/it, gpt_loss=0.316, loss_mean=0.343][A
+Train step of epoch 0:  16%|█▌        | 1040/6434 [2:26:10<12:42:51,  8.49s/it, gpt_loss=0.335, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1041/6434 [2:26:10<13:01:25,  8.69s/it, gpt_loss=0.335, loss_mean=0.342][A
+Train step of epoch 0:  16%|█▌        | 1041/6434 [2:26:18<13:01:25,  8.69s/it, gpt_loss=0.391, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1042/6434 [2:26:18<12:51:51,  8.59s/it, gpt_loss=0.391, loss_mean=0.347][A
+Train step of epoch 0:  16%|█▌        | 1042/6434 [2:26:26<12:51:51,  8.59s/it, gpt_loss=0.444, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▌        | 1043/6434 [2:26:26<12:35:38,  8.41s/it, gpt_loss=0.444, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▌        | 1043/6434 [2:26:35<12:35:38,  8.41s/it, gpt_loss=0.366, loss_mean=0.358][A
+Train step of epoch 0:  16%|█▌        | 1044/6434 [2:26:35<12:51:23,  8.59s/it, gpt_loss=0.366, loss_mean=0.358][A
+Train step of epoch 0:  16%|█▌        | 1044/6434 [2:26:43<12:51:23,  8.59s/it, gpt_loss=0.286, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1045/6434 [2:26:43<12:35:29,  8.41s/it, gpt_loss=0.286, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▌        | 1045/6434 [2:26:52<12:35:29,  8.41s/it, gpt_loss=0.367, loss_mean=0.352][A
+Train step of epoch 0:  16%|█▋        | 1046/6434 [2:26:52<12:51:38,  8.59s/it, gpt_loss=0.367, loss_mean=0.352][A
+Train step of epoch 0:  16%|█▋        | 1046/6434 [2:27:01<12:51:38,  8.59s/it, gpt_loss=0.31, loss_mean=0.348] [A
+Train step of epoch 0:  16%|█▋        | 1047/6434 [2:27:01<12:44:38,  8.52s/it, gpt_loss=0.31, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▋        | 1047/6434 [2:27:08<12:44:38,  8.52s/it, gpt_loss=0.344, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▋        | 1048/6434 [2:27:08<12:27:52,  8.33s/it, gpt_loss=0.344, loss_mean=0.348][A
+Train step of epoch 0:  16%|█▋        | 1048/6434 [2:27:19<12:27:52,  8.33s/it, gpt_loss=0.324, loss_mean=0.345][A
+Train step of epoch 0:  16%|█▋        | 1049/6434 [2:27:19<13:20:20,  8.92s/it, gpt_loss=0.324, loss_mean=0.345][A
+[LID Router Debug] Step: 1050
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [9, 9, 5, 3, 2, 3, 3, 5, 9, 4]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  16%|█▋        | 1049/6434 [2:27:27<13:20:20,  8.92s/it, gpt_loss=0.4, loss_mean=0.351]  [A
+Train step of epoch 0:  16%|█▋        | 1050/6434 [2:27:27<13:01:21,  8.71s/it, gpt_loss=0.4, loss_mean=0.351][A
+Train step of epoch 0:  16%|█▋        | 1050/6434 [2:27:36<13:01:21,  8.71s/it, gpt_loss=0.413, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▋        | 1051/6434 [2:27:36<13:14:25,  8.85s/it, gpt_loss=0.413, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▋        | 1051/6434 [2:27:46<13:14:25,  8.85s/it, gpt_loss=0.376, loss_mean=0.359][A
+Train step of epoch 0:  16%|█▋        | 1052/6434 [2:27:46<13:35:22,  9.09s/it, gpt_loss=0.376, loss_mean=0.359][A
+Train step of epoch 0:  16%|█▋        | 1052/6434 [2:27:55<13:35:22,  9.09s/it, gpt_loss=0.343, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▋        | 1053/6434 [2:27:55<13:26:18,  8.99s/it, gpt_loss=0.343, loss_mean=0.357][A
+Train step of epoch 0:  16%|█▋        | 1053/6434 [2:28:03<13:26:18,  8.99s/it, gpt_loss=0.435, loss_mean=0.365][A
+Train step of epoch 0:  16%|█▋        | 1054/6434 [2:28:03<13:19:00,  8.91s/it, gpt_loss=0.435, loss_mean=0.365][A
+Train step of epoch 0:  16%|█▋        | 1054/6434 [2:28:12<13:19:00,  8.91s/it, gpt_loss=0.291, loss_mean=0.358][A
+Train step of epoch 0:  16%|█▋        | 1055/6434 [2:28:12<13:01:19,  8.72s/it, gpt_loss=0.291, loss_mean=0.358][A
+Train step of epoch 0:  16%|█▋        | 1055/6434 [2:28:19<13:01:19,  8.72s/it, gpt_loss=0.321, loss_mean=0.354][A
+Train step of epoch 0:  16%|█▋        | 1056/6434 [2:28:19<12:31:12,  8.38s/it, gpt_loss=0.321, loss_mean=0.354][A
+Train step of epoch 0:  16%|█▋        | 1056/6434 [2:28:28<12:31:12,  8.38s/it, gpt_loss=0.343, loss_mean=0.353][A
+Train step of epoch 0:  16%|█▋        | 1057/6434 [2:28:28<12:34:04,  8.41s/it, gpt_loss=0.343, loss_mean=0.353][A
+Train step of epoch 0:  16%|█▋        | 1057/6434 [2:28:35<12:34:04,  8.41s/it, gpt_loss=0.378, loss_mean=0.355][A
+Train step of epoch 0:  16%|█▋        | 1058/6434 [2:28:35<12:10:19,  8.15s/it, gpt_loss=0.378, loss_mean=0.355][A
+Train step of epoch 0:  16%|█▋        | 1058/6434 [2:28:43<12:10:19,  8.15s/it, gpt_loss=0.445, loss_mean=0.364][A
+Train step of epoch 0:  16%|█▋        | 1059/6434 [2:28:43<12:02:20,  8.06s/it, gpt_loss=0.445, loss_mean=0.364][A
+[LID Router Debug] Step: 1060
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [1, 3, 1, 7, 1, 1, 6, 0, 4, 9]
+Active Experts in Batch: {0, 1, 3, 4, 6, 7, 9}
+
+Train step of epoch 0:  16%|█▋        | 1059/6434 [2:28:52<12:02:20,  8.06s/it, gpt_loss=0.381, loss_mean=0.366][A
+Train step of epoch 0:  16%|█▋        | 1060/6434 [2:28:52<12:18:08,  8.24s/it, gpt_loss=0.381, loss_mean=0.366][A
+Train step of epoch 0:  16%|█▋        | 1060/6434 [2:29:00<12:18:08,  8.24s/it, gpt_loss=0.268, loss_mean=0.356][A
+Train step of epoch 0:  16%|█▋        | 1061/6434 [2:29:00<12:10:55,  8.16s/it, gpt_loss=0.268, loss_mean=0.356][A
+Train step of epoch 0:  16%|█▋        | 1061/6434 [2:29:08<12:10:55,  8.16s/it, gpt_loss=0.259, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1062/6434 [2:29:08<12:23:07,  8.30s/it, gpt_loss=0.259, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1062/6434 [2:29:16<12:23:07,  8.30s/it, gpt_loss=0.32, loss_mean=0.344] [A
+Train step of epoch 0:  17%|█▋        | 1063/6434 [2:29:16<12:03:44,  8.09s/it, gpt_loss=0.32, loss_mean=0.344][A
+Train step of epoch 0:  17%|█▋        | 1063/6434 [2:29:25<12:03:44,  8.09s/it, gpt_loss=0.346, loss_mean=0.344][A
+Train step of epoch 0:  17%|█▋        | 1064/6434 [2:29:25<12:24:48,  8.32s/it, gpt_loss=0.346, loss_mean=0.344][A
+Train step of epoch 0:  17%|█▋        | 1064/6434 [2:29:33<12:24:48,  8.32s/it, gpt_loss=0.364, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1065/6434 [2:29:33<12:35:40,  8.44s/it, gpt_loss=0.364, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1065/6434 [2:29:42<12:35:40,  8.44s/it, gpt_loss=0.445, loss_mean=0.356][A
+Train step of epoch 0:  17%|█▋        | 1066/6434 [2:29:42<12:28:46,  8.37s/it, gpt_loss=0.445, loss_mean=0.356][A
+Train step of epoch 0:  17%|█▋        | 1066/6434 [2:29:49<12:28:46,  8.37s/it, gpt_loss=0.289, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1067/6434 [2:29:49<12:09:44,  8.16s/it, gpt_loss=0.289, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1067/6434 [2:29:58<12:09:44,  8.16s/it, gpt_loss=0.361, loss_mean=0.35] [A
+Train step of epoch 0:  17%|█▋        | 1068/6434 [2:29:58<12:23:57,  8.32s/it, gpt_loss=0.361, loss_mean=0.35][A
+Train step of epoch 0:  17%|█▋        | 1068/6434 [2:30:07<12:23:57,  8.32s/it, gpt_loss=0.333, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1069/6434 [2:30:07<12:42:10,  8.52s/it, gpt_loss=0.333, loss_mean=0.349][A
+[LID Router Debug] Step: 1070
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [6, 3, 0, 3, 2, 2, 9, 0, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+Train step of epoch 0:  17%|█▋        | 1069/6434 [2:30:15<12:42:10,  8.52s/it, gpt_loss=0.245, loss_mean=0.338][A
+Train step of epoch 0:  17%|█▋        | 1070/6434 [2:30:15<12:37:42,  8.48s/it, gpt_loss=0.245, loss_mean=0.338][A
+Train step of epoch 0:  17%|█▋        | 1070/6434 [2:30:24<12:37:42,  8.48s/it, gpt_loss=0.292, loss_mean=0.334][A
+Train step of epoch 0:  17%|█▋        | 1071/6434 [2:30:24<12:43:16,  8.54s/it, gpt_loss=0.292, loss_mean=0.334][A
+Train step of epoch 0:  17%|█▋        | 1071/6434 [2:30:32<12:43:16,  8.54s/it, gpt_loss=0.313, loss_mean=0.332][A
+Train step of epoch 0:  17%|█▋        | 1072/6434 [2:30:32<12:26:35,  8.35s/it, gpt_loss=0.313, loss_mean=0.332][A
+Train step of epoch 0:  17%|█▋        | 1072/6434 [2:30:41<12:26:35,  8.35s/it, gpt_loss=0.331, loss_mean=0.332][A
+Train step of epoch 0:  17%|█▋        | 1073/6434 [2:30:41<12:48:01,  8.60s/it, gpt_loss=0.331, loss_mean=0.332][A
+Train step of epoch 0:  17%|█▋        | 1073/6434 [2:30:49<12:48:01,  8.60s/it, gpt_loss=0.38, loss_mean=0.336] [A
+Train step of epoch 0:  17%|█▋        | 1074/6434 [2:30:49<12:30:28,  8.40s/it, gpt_loss=0.38, loss_mean=0.336][A
+Train step of epoch 0:  17%|█▋        | 1074/6434 [2:30:58<12:30:28,  8.40s/it, gpt_loss=0.335, loss_mean=0.336][A
+Train step of epoch 0:  17%|█▋        | 1075/6434 [2:30:58<12:40:11,  8.51s/it, gpt_loss=0.335, loss_mean=0.336][A
+Train step of epoch 0:  17%|█▋        | 1075/6434 [2:31:07<12:40:11,  8.51s/it, gpt_loss=0.286, loss_mean=0.331][A
+Train step of epoch 0:  17%|█▋        | 1076/6434 [2:31:07<12:45:09,  8.57s/it, gpt_loss=0.286, loss_mean=0.331][A
+Train step of epoch 0:  17%|█▋        | 1076/6434 [2:31:15<12:45:09,  8.57s/it, gpt_loss=0.249, loss_mean=0.323][A
+Train step of epoch 0:  17%|█▋        | 1077/6434 [2:31:15<12:54:13,  8.67s/it, gpt_loss=0.249, loss_mean=0.323][A
+Train step of epoch 0:  17%|█▋        | 1077/6434 [2:31:24<12:54:13,  8.67s/it, gpt_loss=0.348, loss_mean=0.326][A
+Train step of epoch 0:  17%|█▋        | 1078/6434 [2:31:24<12:44:41,  8.57s/it, gpt_loss=0.348, loss_mean=0.326][A
+Train step of epoch 0:  17%|█▋        | 1078/6434 [2:31:32<12:44:41,  8.57s/it, gpt_loss=0.397, loss_mean=0.333][A
+Train step of epoch 0:  17%|█▋        | 1079/6434 [2:31:32<12:41:20,  8.53s/it, gpt_loss=0.397, loss_mean=0.333][A
+[LID Router Debug] Step: 1080
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [1, 4, 0, 0, 9, 2, 5, 0, 2, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  17%|█▋        | 1079/6434 [2:31:40<12:41:20,  8.53s/it, gpt_loss=0.503, loss_mean=0.35] [A
+Train step of epoch 0:  17%|█▋        | 1080/6434 [2:31:40<12:23:19,  8.33s/it, gpt_loss=0.503, loss_mean=0.35][A
+Train step of epoch 0:  17%|█▋        | 1080/6434 [2:31:48<12:23:19,  8.33s/it, gpt_loss=0.397, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1081/6434 [2:31:48<12:13:49,  8.23s/it, gpt_loss=0.397, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1081/6434 [2:31:58<12:13:49,  8.23s/it, gpt_loss=0.257, loss_mean=0.345][A
+Train step of epoch 0:  17%|█▋        | 1082/6434 [2:31:58<12:48:34,  8.62s/it, gpt_loss=0.257, loss_mean=0.345][A
+Train step of epoch 0:  17%|█▋        | 1082/6434 [2:32:06<12:48:34,  8.62s/it, gpt_loss=0.385, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1083/6434 [2:32:06<12:32:56,  8.44s/it, gpt_loss=0.385, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1083/6434 [2:32:14<12:32:56,  8.44s/it, gpt_loss=0.389, loss_mean=0.353][A
+Train step of epoch 0:  17%|█▋        | 1084/6434 [2:32:14<12:28:21,  8.39s/it, gpt_loss=0.389, loss_mean=0.353][A
+Train step of epoch 0:  17%|█▋        | 1084/6434 [2:32:23<12:28:21,  8.39s/it, gpt_loss=0.286, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1085/6434 [2:32:23<12:42:51,  8.56s/it, gpt_loss=0.286, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1085/6434 [2:32:31<12:42:51,  8.56s/it, gpt_loss=0.313, loss_mean=0.343][A
+Train step of epoch 0:  17%|█▋        | 1086/6434 [2:32:31<12:35:01,  8.47s/it, gpt_loss=0.313, loss_mean=0.343][A
+Train step of epoch 0:  17%|█▋        | 1086/6434 [2:32:40<12:35:01,  8.47s/it, gpt_loss=0.419, loss_mean=0.35] [A
+Train step of epoch 0:  17%|█▋        | 1087/6434 [2:32:40<12:34:11,  8.46s/it, gpt_loss=0.419, loss_mean=0.35][A
+Train step of epoch 0:  17%|█▋        | 1087/6434 [2:32:48<12:34:11,  8.46s/it, gpt_loss=0.327, loss_mean=0.348][A
+Train step of epoch 0:  17%|█▋        | 1088/6434 [2:32:48<12:21:15,  8.32s/it, gpt_loss=0.327, loss_mean=0.348][A
+Train step of epoch 0:  17%|█▋        | 1088/6434 [2:32:55<12:21:15,  8.32s/it, gpt_loss=0.381, loss_mean=0.351][A
+Train step of epoch 0:  17%|█▋        | 1089/6434 [2:32:55<11:50:00,  7.97s/it, gpt_loss=0.381, loss_mean=0.351][A
+[LID Router Debug] Step: 1090
+Batch Size: 10
+Audio Batch Size: 141
+LID Assignments: [3, 0, 0, 2, 9, 5, 3, 4, 4, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  17%|█▋        | 1089/6434 [2:33:04<11:50:00,  7.97s/it, gpt_loss=0.295, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1090/6434 [2:33:04<12:13:50,  8.24s/it, gpt_loss=0.295, loss_mean=0.346][A
+Train step of epoch 0:  17%|█▋        | 1090/6434 [2:33:12<12:13:50,  8.24s/it, gpt_loss=0.283, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1091/6434 [2:33:12<12:25:03,  8.37s/it, gpt_loss=0.283, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1091/6434 [2:33:21<12:25:03,  8.37s/it, gpt_loss=0.346, loss_mean=0.34] [A
+Train step of epoch 0:  17%|█▋        | 1092/6434 [2:33:21<12:21:27,  8.33s/it, gpt_loss=0.346, loss_mean=0.34][A
+Train step of epoch 0:  17%|█▋        | 1092/6434 [2:33:30<12:21:27,  8.33s/it, gpt_loss=0.325, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1093/6434 [2:33:30<12:41:43,  8.56s/it, gpt_loss=0.325, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1093/6434 [2:33:39<12:41:43,  8.56s/it, gpt_loss=0.367, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1094/6434 [2:33:39<12:52:20,  8.68s/it, gpt_loss=0.367, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1094/6434 [2:33:47<12:52:20,  8.68s/it, gpt_loss=0.276, loss_mean=0.335][A
+Train step of epoch 0:  17%|█▋        | 1095/6434 [2:33:47<12:40:00,  8.54s/it, gpt_loss=0.276, loss_mean=0.335][A
+Train step of epoch 0:  17%|█▋        | 1095/6434 [2:33:55<12:40:00,  8.54s/it, gpt_loss=0.387, loss_mean=0.34] [A
+Train step of epoch 0:  17%|█▋        | 1096/6434 [2:33:55<12:37:57,  8.52s/it, gpt_loss=0.387, loss_mean=0.34][A
+Train step of epoch 0:  17%|█▋        | 1096/6434 [2:34:03<12:37:57,  8.52s/it, gpt_loss=0.281, loss_mean=0.334][A
+Train step of epoch 0:  17%|█▋        | 1097/6434 [2:34:03<12:19:30,  8.31s/it, gpt_loss=0.281, loss_mean=0.334][A
+Train step of epoch 0:  17%|█▋        | 1097/6434 [2:34:11<12:19:30,  8.31s/it, gpt_loss=0.342, loss_mean=0.335][A
+Train step of epoch 0:  17%|█▋        | 1098/6434 [2:34:11<12:01:40,  8.11s/it, gpt_loss=0.342, loss_mean=0.335][A
+Train step of epoch 0:  17%|█▋        | 1098/6434 [2:34:20<12:01:40,  8.11s/it, gpt_loss=0.296, loss_mean=0.331][A
+Train step of epoch 0:  17%|█▋        | 1099/6434 [2:34:20<12:19:32,  8.32s/it, gpt_loss=0.296, loss_mean=0.331][A
+[LID Router Debug] Step: 1100
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [1, 3, 3, 6, 9, 0, 0, 9, 1, 0]
+Active Experts in Batch: {0, 1, 3, 6, 9}
+
+Train step of epoch 0:  17%|█▋        | 1099/6434 [2:34:27<12:19:32,  8.32s/it, gpt_loss=0.446, loss_mean=0.343][A
+Train step of epoch 0:  17%|█▋        | 1100/6434 [2:34:27<12:07:38,  8.18s/it, gpt_loss=0.446, loss_mean=0.343][A
+Train step of epoch 0:  17%|█▋        | 1100/6434 [2:34:35<12:07:38,  8.18s/it, gpt_loss=0.307, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1101/6434 [2:34:35<11:59:48,  8.10s/it, gpt_loss=0.307, loss_mean=0.339][A
+Train step of epoch 0:  17%|█▋        | 1101/6434 [2:34:44<11:59:48,  8.10s/it, gpt_loss=0.361, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1102/6434 [2:34:44<12:11:26,  8.23s/it, gpt_loss=0.361, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1102/6434 [2:34:52<12:11:26,  8.23s/it, gpt_loss=0.339, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1103/6434 [2:34:52<12:09:34,  8.21s/it, gpt_loss=0.339, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1103/6434 [2:35:01<12:09:34,  8.21s/it, gpt_loss=0.344, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1104/6434 [2:35:01<12:31:22,  8.46s/it, gpt_loss=0.344, loss_mean=0.341][A
+Train step of epoch 0:  17%|█▋        | 1104/6434 [2:35:08<12:31:22,  8.46s/it, gpt_loss=0.419, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1105/6434 [2:35:08<12:04:32,  8.16s/it, gpt_loss=0.419, loss_mean=0.349][A
+Train step of epoch 0:  17%|█▋        | 1105/6434 [2:35:17<12:04:32,  8.16s/it, gpt_loss=0.382, loss_mean=0.352][A
+Train step of epoch 0:  17%|█▋        | 1106/6434 [2:35:17<12:11:00,  8.23s/it, gpt_loss=0.382, loss_mean=0.352][A
+Train step of epoch 0:  17%|█▋        | 1106/6434 [2:35:26<12:11:00,  8.23s/it, gpt_loss=0.383, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1107/6434 [2:35:26<12:22:56,  8.37s/it, gpt_loss=0.383, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1107/6434 [2:35:34<12:22:56,  8.37s/it, gpt_loss=0.373, loss_mean=0.357][A
+Train step of epoch 0:  17%|█▋        | 1108/6434 [2:35:34<12:21:21,  8.35s/it, gpt_loss=0.373, loss_mean=0.357][A
+Train step of epoch 0:  17%|█▋        | 1108/6434 [2:35:43<12:21:21,  8.35s/it, gpt_loss=0.37, loss_mean=0.358] [A
+Train step of epoch 0:  17%|█▋        | 1109/6434 [2:35:43<12:30:38,  8.46s/it, gpt_loss=0.37, loss_mean=0.358][A
+[LID Router Debug] Step: 1110
+Batch Size: 10
+Audio Batch Size: 133
+LID Assignments: [5, 3, 1, 4, 1, 5, 3, 3, 1, 9]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  17%|█▋        | 1109/6434 [2:35:52<12:30:38,  8.46s/it, gpt_loss=0.322, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1110/6434 [2:35:52<12:57:50,  8.77s/it, gpt_loss=0.322, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1110/6434 [2:36:01<12:57:50,  8.77s/it, gpt_loss=0.309, loss_mean=0.35] [A
+Train step of epoch 0:  17%|█▋        | 1111/6434 [2:36:01<13:12:39,  8.93s/it, gpt_loss=0.309, loss_mean=0.35][A
+Train step of epoch 0:  17%|█▋        | 1111/6434 [2:36:11<13:12:39,  8.93s/it, gpt_loss=0.396, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1112/6434 [2:36:11<13:26:26,  9.09s/it, gpt_loss=0.396, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1112/6434 [2:36:20<13:26:26,  9.09s/it, gpt_loss=0.417, loss_mean=0.361][A
+Train step of epoch 0:  17%|█▋        | 1113/6434 [2:36:20<13:34:08,  9.18s/it, gpt_loss=0.417, loss_mean=0.361][A
+Train step of epoch 0:  17%|█▋        | 1113/6434 [2:36:30<13:34:08,  9.18s/it, gpt_loss=0.418, loss_mean=0.367][A
+Train step of epoch 0:  17%|█▋        | 1114/6434 [2:36:30<13:53:45,  9.40s/it, gpt_loss=0.418, loss_mean=0.367][A
+Train step of epoch 0:  17%|█▋        | 1114/6434 [2:36:39<13:53:45,  9.40s/it, gpt_loss=0.334, loss_mean=0.363][A
+Train step of epoch 0:  17%|█▋        | 1115/6434 [2:36:39<13:26:52,  9.10s/it, gpt_loss=0.334, loss_mean=0.363][A
+Train step of epoch 0:  17%|█▋        | 1115/6434 [2:36:47<13:26:52,  9.10s/it, gpt_loss=0.321, loss_mean=0.359][A
+Train step of epoch 0:  17%|█▋        | 1116/6434 [2:36:47<13:18:17,  9.01s/it, gpt_loss=0.321, loss_mean=0.359][A
+Train step of epoch 0:  17%|█▋        | 1116/6434 [2:36:55<13:18:17,  9.01s/it, gpt_loss=0.351, loss_mean=0.358][A
+Train step of epoch 0:  17%|█▋        | 1117/6434 [2:36:55<12:53:09,  8.72s/it, gpt_loss=0.351, loss_mean=0.358][A
+Train step of epoch 0:  17%|█▋        | 1117/6434 [2:37:03<12:53:09,  8.72s/it, gpt_loss=0.41, loss_mean=0.364] [A
+Train step of epoch 0:  17%|█▋        | 1118/6434 [2:37:03<12:29:12,  8.46s/it, gpt_loss=0.41, loss_mean=0.364][A
+Train step of epoch 0:  17%|█▋        | 1118/6434 [2:37:12<12:29:12,  8.46s/it, gpt_loss=0.349, loss_mean=0.362][A
+Train step of epoch 0:  17%|█▋        | 1119/6434 [2:37:12<12:31:04,  8.48s/it, gpt_loss=0.349, loss_mean=0.362][A
+[LID Router Debug] Step: 1120
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [2, 10, 1, 4, 9, 2, 3, 5, 6, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9, 10}
+
+Train step of epoch 0:  17%|█▋        | 1119/6434 [2:37:20<12:31:04,  8.48s/it, gpt_loss=0.361, loss_mean=0.362][A
+Train step of epoch 0:  17%|█▋        | 1120/6434 [2:37:20<12:22:04,  8.38s/it, gpt_loss=0.361, loss_mean=0.362][A
+Train step of epoch 0:  17%|█▋        | 1120/6434 [2:37:28<12:22:04,  8.38s/it, gpt_loss=0.332, loss_mean=0.359][A
+Train step of epoch 0:  17%|█▋        | 1121/6434 [2:37:28<12:13:28,  8.28s/it, gpt_loss=0.332, loss_mean=0.359][A
+Train step of epoch 0:  17%|█▋        | 1121/6434 [2:37:36<12:13:28,  8.28s/it, gpt_loss=0.316, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1122/6434 [2:37:36<12:16:25,  8.32s/it, gpt_loss=0.316, loss_mean=0.355][A
+Train step of epoch 0:  17%|█▋        | 1122/6434 [2:37:45<12:16:25,  8.32s/it, gpt_loss=0.439, loss_mean=0.363][A
+Train step of epoch 0:  17%|█▋        | 1123/6434 [2:37:45<12:26:53,  8.44s/it, gpt_loss=0.439, loss_mean=0.363][A
+Train step of epoch 0:  17%|█▋        | 1123/6434 [2:37:53<12:26:53,  8.44s/it, gpt_loss=0.353, loss_mean=0.362][A
+Train step of epoch 0:  17%|█▋        | 1124/6434 [2:37:53<12:18:26,  8.34s/it, gpt_loss=0.353, loss_mean=0.362][A
+Train step of epoch 0:  17%|█▋        | 1124/6434 [2:38:02<12:18:26,  8.34s/it, gpt_loss=0.315, loss_mean=0.357][A
+Train step of epoch 0:  17%|█▋        | 1125/6434 [2:38:02<12:15:52,  8.32s/it, gpt_loss=0.315, loss_mean=0.357][A
+Train step of epoch 0:  17%|█▋        | 1125/6434 [2:38:11<12:15:52,  8.32s/it, gpt_loss=0.287, loss_mean=0.35] [A
+Train step of epoch 0:  18%|█▊        | 1126/6434 [2:38:11<12:35:43,  8.54s/it, gpt_loss=0.287, loss_mean=0.35][A
+Train step of epoch 0:  18%|█▊        | 1126/6434 [2:38:19<12:35:43,  8.54s/it, gpt_loss=0.325, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1127/6434 [2:38:19<12:32:53,  8.51s/it, gpt_loss=0.325, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1127/6434 [2:38:27<12:32:53,  8.51s/it, gpt_loss=0.382, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1128/6434 [2:38:27<12:18:00,  8.35s/it, gpt_loss=0.382, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1128/6434 [2:38:36<12:18:00,  8.35s/it, gpt_loss=0.258, loss_mean=0.342][A
+Train step of epoch 0:  18%|█▊        | 1129/6434 [2:38:36<12:30:50,  8.49s/it, gpt_loss=0.258, loss_mean=0.342][A
+[LID Router Debug] Step: 1130
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [5, 2, 6, 6, 2, 1, 2, 3, 3, 2]
+Active Experts in Batch: {1, 2, 3, 5, 6}
+
+Train step of epoch 0:  18%|█▊        | 1129/6434 [2:38:44<12:30:50,  8.49s/it, gpt_loss=0.373, loss_mean=0.345][A
+Train step of epoch 0:  18%|█▊        | 1130/6434 [2:38:44<12:20:51,  8.38s/it, gpt_loss=0.373, loss_mean=0.345][A
+Train step of epoch 0:  18%|█▊        | 1130/6434 [2:38:53<12:20:51,  8.38s/it, gpt_loss=0.298, loss_mean=0.34] [A
+Train step of epoch 0:  18%|█▊        | 1131/6434 [2:38:53<12:45:01,  8.66s/it, gpt_loss=0.298, loss_mean=0.34][A
+Train step of epoch 0:  18%|█▊        | 1131/6434 [2:39:03<12:45:01,  8.66s/it, gpt_loss=0.302, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1132/6434 [2:39:03<13:12:12,  8.96s/it, gpt_loss=0.302, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1132/6434 [2:39:11<13:12:12,  8.96s/it, gpt_loss=0.301, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1133/6434 [2:39:11<12:55:10,  8.77s/it, gpt_loss=0.301, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1133/6434 [2:39:20<12:55:10,  8.77s/it, gpt_loss=0.331, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1134/6434 [2:39:20<12:44:58,  8.66s/it, gpt_loss=0.331, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1134/6434 [2:39:28<12:44:58,  8.66s/it, gpt_loss=0.381, loss_mean=0.338][A
+Train step of epoch 0:  18%|█▊        | 1135/6434 [2:39:28<12:46:09,  8.68s/it, gpt_loss=0.381, loss_mean=0.338][A
+Train step of epoch 0:  18%|█▊        | 1135/6434 [2:39:39<12:46:09,  8.68s/it, gpt_loss=0.312, loss_mean=0.335][A
+Train step of epoch 0:  18%|█▊        | 1136/6434 [2:39:39<13:34:01,  9.22s/it, gpt_loss=0.312, loss_mean=0.335][A
+Train step of epoch 0:  18%|█▊        | 1136/6434 [2:39:46<13:34:01,  9.22s/it, gpt_loss=0.287, loss_mean=0.33] [A
+Train step of epoch 0:  18%|█▊        | 1137/6434 [2:39:46<12:47:21,  8.69s/it, gpt_loss=0.287, loss_mean=0.33][A
+Train step of epoch 0:  18%|█▊        | 1137/6434 [2:39:54<12:47:21,  8.69s/it, gpt_loss=0.306, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1138/6434 [2:39:54<12:26:21,  8.46s/it, gpt_loss=0.306, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1138/6434 [2:40:03<12:26:21,  8.46s/it, gpt_loss=0.387, loss_mean=0.334][A
+Train step of epoch 0:  18%|█▊        | 1139/6434 [2:40:03<12:48:29,  8.71s/it, gpt_loss=0.387, loss_mean=0.334][A
+[LID Router Debug] Step: 1140
+Batch Size: 10
+Audio Batch Size: 135
+LID Assignments: [9, 5, 3, 8, 7, 1, 3, 5, 3, 0]
+Active Experts in Batch: {0, 1, 3, 5, 7, 8, 9}
+
+Train step of epoch 0:  18%|█▊        | 1139/6434 [2:40:12<12:48:29,  8.71s/it, gpt_loss=0.331, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1140/6434 [2:40:12<12:49:07,  8.72s/it, gpt_loss=0.331, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1140/6434 [2:40:21<12:49:07,  8.72s/it, gpt_loss=0.356, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1141/6434 [2:40:21<13:02:41,  8.87s/it, gpt_loss=0.356, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1141/6434 [2:40:29<13:02:41,  8.87s/it, gpt_loss=0.366, loss_mean=0.339][A
+Train step of epoch 0:  18%|█▊        | 1142/6434 [2:40:29<12:27:21,  8.47s/it, gpt_loss=0.366, loss_mean=0.339][A
+Train step of epoch 0:  18%|█▊        | 1142/6434 [2:40:37<12:27:21,  8.47s/it, gpt_loss=0.345, loss_mean=0.339][A
+Train step of epoch 0:  18%|█▊        | 1143/6434 [2:40:37<12:01:43,  8.18s/it, gpt_loss=0.345, loss_mean=0.339][A
+Train step of epoch 0:  18%|█▊        | 1143/6434 [2:40:45<12:01:43,  8.18s/it, gpt_loss=0.373, loss_mean=0.343][A
+Train step of epoch 0:  18%|█▊        | 1144/6434 [2:40:45<12:06:03,  8.23s/it, gpt_loss=0.373, loss_mean=0.343][A
+Train step of epoch 0:  18%|█▊        | 1144/6434 [2:40:53<12:06:03,  8.23s/it, gpt_loss=0.362, loss_mean=0.345][A
+Train step of epoch 0:  18%|█▊        | 1145/6434 [2:40:53<12:14:52,  8.34s/it, gpt_loss=0.362, loss_mean=0.345][A
+Train step of epoch 0:  18%|█▊        | 1145/6434 [2:41:03<12:14:52,  8.34s/it, gpt_loss=0.563, loss_mean=0.366][A
+Train step of epoch 0:  18%|█▊        | 1146/6434 [2:41:03<12:38:20,  8.60s/it, gpt_loss=0.563, loss_mean=0.366][A
+Train step of epoch 0:  18%|█▊        | 1146/6434 [2:41:11<12:38:20,  8.60s/it, gpt_loss=0.342, loss_mean=0.364][A
+Train step of epoch 0:  18%|█▊        | 1147/6434 [2:41:11<12:21:44,  8.42s/it, gpt_loss=0.342, loss_mean=0.364][A
+Train step of epoch 0:  18%|█▊        | 1147/6434 [2:41:19<12:21:44,  8.42s/it, gpt_loss=0.297, loss_mean=0.357][A
+Train step of epoch 0:  18%|█▊        | 1148/6434 [2:41:19<12:28:51,  8.50s/it, gpt_loss=0.297, loss_mean=0.357][A
+Train step of epoch 0:  18%|█▊        | 1148/6434 [2:41:27<12:28:51,  8.50s/it, gpt_loss=0.425, loss_mean=0.364][A
+Train step of epoch 0:  18%|█▊        | 1149/6434 [2:41:27<12:18:25,  8.38s/it, gpt_loss=0.425, loss_mean=0.364][A
+[LID Router Debug] Step: 1150
+Batch Size: 10
+Audio Batch Size: 75
+LID Assignments: [0, 1, 5, 4, 4, 9, 4, 2, 2, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  18%|█▊        | 1149/6434 [2:41:35<12:18:25,  8.38s/it, gpt_loss=0.399, loss_mean=0.368][A
+Train step of epoch 0:  18%|█▊        | 1150/6434 [2:41:35<12:04:25,  8.23s/it, gpt_loss=0.399, loss_mean=0.368][A
+Train step of epoch 0:  18%|█▊        | 1150/6434 [2:41:45<12:04:25,  8.23s/it, gpt_loss=0.381, loss_mean=0.369][A
+Train step of epoch 0:  18%|█▊        | 1151/6434 [2:41:45<12:36:31,  8.59s/it, gpt_loss=0.381, loss_mean=0.369][A
+Train step of epoch 0:  18%|█▊        | 1151/6434 [2:41:54<12:36:31,  8.59s/it, gpt_loss=0.397, loss_mean=0.372][A
+Train step of epoch 0:  18%|█▊        | 1152/6434 [2:41:54<12:42:39,  8.66s/it, gpt_loss=0.397, loss_mean=0.372][A
+Train step of epoch 0:  18%|█▊        | 1152/6434 [2:42:01<12:42:39,  8.66s/it, gpt_loss=0.296, loss_mean=0.364][A
+Train step of epoch 0:  18%|█▊        | 1153/6434 [2:42:01<12:13:15,  8.33s/it, gpt_loss=0.296, loss_mean=0.364][A
+Train step of epoch 0:  18%|█▊        | 1153/6434 [2:42:09<12:13:15,  8.33s/it, gpt_loss=0.4, loss_mean=0.368]  [A
+Train step of epoch 0:  18%|█▊        | 1154/6434 [2:42:09<12:04:30,  8.23s/it, gpt_loss=0.4, loss_mean=0.368][A
+Train step of epoch 0:  18%|█▊        | 1154/6434 [2:42:17<12:04:30,  8.23s/it, gpt_loss=0.304, loss_mean=0.361][A
+Train step of epoch 0:  18%|█▊        | 1155/6434 [2:42:17<11:47:12,  8.04s/it, gpt_loss=0.304, loss_mean=0.361][A
+Train step of epoch 0:  18%|█▊        | 1155/6434 [2:42:26<11:47:12,  8.04s/it, gpt_loss=0.309, loss_mean=0.356][A
+Train step of epoch 0:  18%|█▊        | 1156/6434 [2:42:26<12:19:35,  8.41s/it, gpt_loss=0.309, loss_mean=0.356][A
+Train step of epoch 0:  18%|█▊        | 1156/6434 [2:42:34<12:19:35,  8.41s/it, gpt_loss=0.319, loss_mean=0.353][A
+Train step of epoch 0:  18%|█▊        | 1157/6434 [2:42:34<12:20:39,  8.42s/it, gpt_loss=0.319, loss_mean=0.353][A
+Train step of epoch 0:  18%|█▊        | 1157/6434 [2:42:42<12:20:39,  8.42s/it, gpt_loss=0.334, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1158/6434 [2:42:42<12:06:19,  8.26s/it, gpt_loss=0.334, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1158/6434 [2:42:50<12:06:19,  8.26s/it, gpt_loss=0.383, loss_mean=0.354][A
+Train step of epoch 0:  18%|█▊        | 1159/6434 [2:42:50<11:45:49,  8.03s/it, gpt_loss=0.383, loss_mean=0.354][A
+[LID Router Debug] Step: 1160
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [10, 1, 7, 6, 4, 0, 1, 0, 0, 2]
+Active Experts in Batch: {0, 1, 2, 4, 6, 7, 10}
+
+Train step of epoch 0:  18%|█▊        | 1159/6434 [2:42:59<11:45:49,  8.03s/it, gpt_loss=0.292, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1160/6434 [2:42:59<12:05:30,  8.25s/it, gpt_loss=0.292, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1160/6434 [2:43:08<12:05:30,  8.25s/it, gpt_loss=0.385, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1161/6434 [2:43:08<12:28:57,  8.52s/it, gpt_loss=0.385, loss_mean=0.351][A
+Train step of epoch 0:  18%|█▊        | 1161/6434 [2:43:16<12:28:57,  8.52s/it, gpt_loss=0.315, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1162/6434 [2:43:16<12:25:40,  8.49s/it, gpt_loss=0.315, loss_mean=0.348][A
+Train step of epoch 0:  18%|█▊        | 1162/6434 [2:43:26<12:25:40,  8.49s/it, gpt_loss=0.295, loss_mean=0.343][A
+Train step of epoch 0:  18%|█▊        | 1163/6434 [2:43:26<12:55:34,  8.83s/it, gpt_loss=0.295, loss_mean=0.343][A
+Train step of epoch 0:  18%|█▊        | 1163/6434 [2:43:34<12:55:34,  8.83s/it, gpt_loss=0.273, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1164/6434 [2:43:34<12:27:52,  8.51s/it, gpt_loss=0.273, loss_mean=0.336][A
+Train step of epoch 0:  18%|█▊        | 1164/6434 [2:43:41<12:27:52,  8.51s/it, gpt_loss=0.277, loss_mean=0.33] [A
+Train step of epoch 0:  18%|█▊        | 1165/6434 [2:43:41<11:53:22,  8.12s/it, gpt_loss=0.277, loss_mean=0.33][A
+Train step of epoch 0:  18%|█▊        | 1165/6434 [2:43:50<11:53:22,  8.12s/it, gpt_loss=0.318, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1166/6434 [2:43:50<12:11:47,  8.33s/it, gpt_loss=0.318, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1166/6434 [2:43:58<12:11:47,  8.33s/it, gpt_loss=0.318, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1167/6434 [2:43:58<12:08:18,  8.30s/it, gpt_loss=0.318, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1167/6434 [2:44:07<12:08:18,  8.30s/it, gpt_loss=0.271, loss_mean=0.322][A
+Train step of epoch 0:  18%|█▊        | 1168/6434 [2:44:07<12:20:27,  8.44s/it, gpt_loss=0.271, loss_mean=0.322][A
+Train step of epoch 0:  18%|█▊        | 1168/6434 [2:44:16<12:20:27,  8.44s/it, gpt_loss=0.38, loss_mean=0.328] [A
+Train step of epoch 0:  18%|█▊        | 1169/6434 [2:44:16<12:48:17,  8.76s/it, gpt_loss=0.38, loss_mean=0.328][A
+[LID Router Debug] Step: 1170
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [4, 5, 5, 5, 4, 5, 2, 4, 4, 5]
+Active Experts in Batch: {2, 4, 5}
+
+Train step of epoch 0:  18%|█▊        | 1169/6434 [2:44:24<12:48:17,  8.76s/it, gpt_loss=0.344, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1170/6434 [2:44:24<12:34:54,  8.60s/it, gpt_loss=0.344, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1170/6434 [2:44:33<12:34:54,  8.60s/it, gpt_loss=0.3, loss_mean=0.326]  [A
+Train step of epoch 0:  18%|█▊        | 1171/6434 [2:44:33<12:33:47,  8.59s/it, gpt_loss=0.3, loss_mean=0.326][A
+Train step of epoch 0:  18%|█▊        | 1171/6434 [2:44:41<12:33:47,  8.59s/it, gpt_loss=0.326, loss_mean=0.326][A
+Train step of epoch 0:  18%|█▊        | 1172/6434 [2:44:41<12:25:11,  8.50s/it, gpt_loss=0.326, loss_mean=0.326][A
+Train step of epoch 0:  18%|█▊        | 1172/6434 [2:44:50<12:25:11,  8.50s/it, gpt_loss=0.352, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1173/6434 [2:44:50<12:27:12,  8.52s/it, gpt_loss=0.352, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1173/6434 [2:44:58<12:27:12,  8.52s/it, gpt_loss=0.316, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1174/6434 [2:44:58<12:17:16,  8.41s/it, gpt_loss=0.316, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1174/6434 [2:45:06<12:17:16,  8.41s/it, gpt_loss=0.339, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1175/6434 [2:45:06<12:11:36,  8.35s/it, gpt_loss=0.339, loss_mean=0.329][A
+Train step of epoch 0:  18%|█▊        | 1175/6434 [2:45:15<12:11:36,  8.35s/it, gpt_loss=0.309, loss_mean=0.327][A
+Train step of epoch 0:  18%|█▊        | 1176/6434 [2:45:15<12:23:13,  8.48s/it, gpt_loss=0.309, loss_mean=0.327][A
+Train step of epoch 0:  18%|█▊        | 1176/6434 [2:45:23<12:23:13,  8.48s/it, gpt_loss=0.381, loss_mean=0.332][A
+Train step of epoch 0:  18%|█▊        | 1177/6434 [2:45:23<12:12:27,  8.36s/it, gpt_loss=0.381, loss_mean=0.332][A
+Train step of epoch 0:  18%|█▊        | 1177/6434 [2:45:30<12:12:27,  8.36s/it, gpt_loss=0.343, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1178/6434 [2:45:30<11:48:31,  8.09s/it, gpt_loss=0.343, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1178/6434 [2:45:39<11:48:31,  8.09s/it, gpt_loss=0.312, loss_mean=0.331][A
+Train step of epoch 0:  18%|█▊        | 1179/6434 [2:45:39<11:50:24,  8.11s/it, gpt_loss=0.312, loss_mean=0.331][A
+[LID Router Debug] Step: 1180
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [0, 4, 6, 5, 9, 0, 3, 6, 1, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  18%|█▊        | 1179/6434 [2:45:47<11:50:24,  8.11s/it, gpt_loss=0.327, loss_mean=0.331][A
+Train step of epoch 0:  18%|█▊        | 1180/6434 [2:45:47<11:55:18,  8.17s/it, gpt_loss=0.327, loss_mean=0.331][A
+Train step of epoch 0:  18%|█▊        | 1180/6434 [2:45:55<11:55:18,  8.17s/it, gpt_loss=0.268, loss_mean=0.324][A
+Train step of epoch 0:  18%|█▊        | 1181/6434 [2:45:55<11:46:11,  8.07s/it, gpt_loss=0.268, loss_mean=0.324][A
+Train step of epoch 0:  18%|█▊        | 1181/6434 [2:46:03<11:46:11,  8.07s/it, gpt_loss=0.312, loss_mean=0.323][A
+Train step of epoch 0:  18%|█▊        | 1182/6434 [2:46:03<11:58:29,  8.21s/it, gpt_loss=0.312, loss_mean=0.323][A
+Train step of epoch 0:  18%|█▊        | 1182/6434 [2:46:12<11:58:29,  8.21s/it, gpt_loss=0.349, loss_mean=0.326][A
+Train step of epoch 0:  18%|█▊        | 1183/6434 [2:46:12<12:09:45,  8.34s/it, gpt_loss=0.349, loss_mean=0.326][A
+Train step of epoch 0:  18%|█▊        | 1183/6434 [2:46:21<12:09:45,  8.34s/it, gpt_loss=0.345, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1184/6434 [2:46:21<12:23:13,  8.49s/it, gpt_loss=0.345, loss_mean=0.328][A
+Train step of epoch 0:  18%|█▊        | 1184/6434 [2:46:30<12:23:13,  8.49s/it, gpt_loss=0.369, loss_mean=0.332][A
+Train step of epoch 0:  18%|█▊        | 1185/6434 [2:46:30<12:43:27,  8.73s/it, gpt_loss=0.369, loss_mean=0.332][A
+Train step of epoch 0:  18%|█▊        | 1185/6434 [2:46:38<12:43:27,  8.73s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1186/6434 [2:46:38<12:22:57,  8.49s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  18%|█▊        | 1186/6434 [2:46:46<12:22:57,  8.49s/it, gpt_loss=0.4, loss_mean=0.34]   [A
+Train step of epoch 0:  18%|█▊        | 1187/6434 [2:46:46<12:22:28,  8.49s/it, gpt_loss=0.4, loss_mean=0.34][A
+Train step of epoch 0:  18%|█▊        | 1187/6434 [2:46:55<12:22:28,  8.49s/it, gpt_loss=0.353, loss_mean=0.341][A
+Train step of epoch 0:  18%|█▊        | 1188/6434 [2:46:55<12:31:36,  8.60s/it, gpt_loss=0.353, loss_mean=0.341][A
+Train step of epoch 0:  18%|█▊        | 1188/6434 [2:47:04<12:31:36,  8.60s/it, gpt_loss=0.391, loss_mean=0.346][A
+Train step of epoch 0:  18%|█▊        | 1189/6434 [2:47:04<12:31:04,  8.59s/it, gpt_loss=0.391, loss_mean=0.346][A
+[LID Router Debug] Step: 1190
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 1, 5, 9, 9, 0, 0, 2, 8, 0]
+Active Experts in Batch: {0, 1, 2, 5, 8, 9}
+
+Train step of epoch 0:  18%|█▊        | 1189/6434 [2:47:14<12:31:04,  8.59s/it, gpt_loss=0.458, loss_mean=0.357][A
+Train step of epoch 0:  18%|█▊        | 1190/6434 [2:47:14<13:07:39,  9.01s/it, gpt_loss=0.458, loss_mean=0.357][A
+Train step of epoch 0:  18%|█▊        | 1190/6434 [2:47:22<13:07:39,  9.01s/it, gpt_loss=0.306, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▊        | 1191/6434 [2:47:22<12:31:12,  8.60s/it, gpt_loss=0.306, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▊        | 1191/6434 [2:47:30<12:31:12,  8.60s/it, gpt_loss=0.362, loss_mean=0.353][A
+Train step of epoch 0:  19%|█▊        | 1192/6434 [2:47:30<12:30:10,  8.59s/it, gpt_loss=0.362, loss_mean=0.353][A
+Train step of epoch 0:  19%|█▊        | 1192/6434 [2:47:39<12:30:10,  8.59s/it, gpt_loss=0.302, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1193/6434 [2:47:39<12:28:07,  8.56s/it, gpt_loss=0.302, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1193/6434 [2:47:47<12:28:07,  8.56s/it, gpt_loss=0.417, loss_mean=0.355][A
+Train step of epoch 0:  19%|█▊        | 1194/6434 [2:47:47<12:19:43,  8.47s/it, gpt_loss=0.417, loss_mean=0.355][A
+Train step of epoch 0:  19%|█▊        | 1194/6434 [2:47:55<12:19:43,  8.47s/it, gpt_loss=0.282, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1195/6434 [2:47:55<12:18:06,  8.45s/it, gpt_loss=0.282, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1195/6434 [2:48:04<12:18:06,  8.45s/it, gpt_loss=0.32, loss_mean=0.345] [A
+Train step of epoch 0:  19%|█▊        | 1196/6434 [2:48:04<12:16:15,  8.43s/it, gpt_loss=0.32, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▊        | 1196/6434 [2:48:13<12:16:15,  8.43s/it, gpt_loss=0.343, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▊        | 1197/6434 [2:48:13<12:45:35,  8.77s/it, gpt_loss=0.343, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▊        | 1197/6434 [2:48:22<12:45:35,  8.77s/it, gpt_loss=0.28, loss_mean=0.338] [A
+Train step of epoch 0:  19%|█▊        | 1198/6434 [2:48:22<12:39:08,  8.70s/it, gpt_loss=0.28, loss_mean=0.338][A
+Train step of epoch 0:  19%|█▊        | 1198/6434 [2:48:30<12:39:08,  8.70s/it, gpt_loss=0.381, loss_mean=0.343][A
+Train step of epoch 0:  19%|█▊        | 1199/6434 [2:48:30<12:25:51,  8.55s/it, gpt_loss=0.381, loss_mean=0.343][A
+[LID Router Debug] Step: 1200
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [4, 1, 2, 1, 2, 1, 6, 2, 9, 2]
+Active Experts in Batch: {1, 2, 4, 6, 9}
+[2026-02-06 18:44:43,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[1.9863527563692662e-05, 1.9863527563692662e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 18:44:43,200] [INFO] [timer.py:260:stop] epoch=0/micro_step=1200/global_step=600, RunningAvgSamplesPerSec=4.758080757947174, CurrSamplesPerSec=4.712133682104068, MemAllocated=12.67GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  19%|█▊        | 1199/6434 [2:48:39<12:25:51,  8.55s/it, gpt_loss=0.405, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▊        | 1200/6434 [2:48:39<12:32:32,  8.63s/it, gpt_loss=0.405, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▊        | 1200/6434 [2:48:47<12:32:32,  8.63s/it, gpt_loss=0.25, loss_mean=0.339] [A
+Train step of epoch 0:  19%|█▊        | 1201/6434 [2:48:47<12:17:43,  8.46s/it, gpt_loss=0.25, loss_mean=0.339][A
+Train step of epoch 0:  19%|█▊        | 1201/6434 [2:48:56<12:17:43,  8.46s/it, gpt_loss=0.413, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▊        | 1202/6434 [2:48:56<12:24:05,  8.53s/it, gpt_loss=0.413, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▊        | 1202/6434 [2:49:03<12:24:05,  8.53s/it, gpt_loss=0.361, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1203/6434 [2:49:03<12:01:41,  8.28s/it, gpt_loss=0.361, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▊        | 1203/6434 [2:49:11<12:01:41,  8.28s/it, gpt_loss=0.322, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▊        | 1204/6434 [2:49:11<11:50:43,  8.15s/it, gpt_loss=0.322, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▊        | 1204/6434 [2:49:19<11:50:43,  8.15s/it, gpt_loss=0.331, loss_mean=0.344][A
+Train step of epoch 0:  19%|█▊        | 1205/6434 [2:49:19<11:57:19,  8.23s/it, gpt_loss=0.331, loss_mean=0.344][A
+Train step of epoch 0:  19%|█▊        | 1205/6434 [2:49:29<11:57:19,  8.23s/it, gpt_loss=0.404, loss_mean=0.35] [A
+Train step of epoch 0:  19%|█▊        | 1206/6434 [2:49:29<12:25:44,  8.56s/it, gpt_loss=0.404, loss_mean=0.35][A
+Train step of epoch 0:  19%|█▊        | 1206/6434 [2:49:37<12:25:44,  8.56s/it, gpt_loss=0.373, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▉        | 1207/6434 [2:49:37<12:14:57,  8.44s/it, gpt_loss=0.373, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▉        | 1207/6434 [2:49:46<12:14:57,  8.44s/it, gpt_loss=0.36, loss_mean=0.353] [A
+Train step of epoch 0:  19%|█▉        | 1208/6434 [2:49:46<12:19:18,  8.49s/it, gpt_loss=0.36, loss_mean=0.353][A
+Train step of epoch 0:  19%|█▉        | 1208/6434 [2:49:54<12:19:18,  8.49s/it, gpt_loss=0.328, loss_mean=0.35][A
+Train step of epoch 0:  19%|█▉        | 1209/6434 [2:49:54<12:12:19,  8.41s/it, gpt_loss=0.328, loss_mean=0.35][A
+[LID Router Debug] Step: 1210
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [1, 9, 3, 0, 9, 9, 4, 2, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  19%|█▉        | 1209/6434 [2:50:02<12:12:19,  8.41s/it, gpt_loss=0.481, loss_mean=0.364][A
+Train step of epoch 0:  19%|█▉        | 1210/6434 [2:50:02<12:10:43,  8.39s/it, gpt_loss=0.481, loss_mean=0.364][A
+Train step of epoch 0:  19%|█▉        | 1210/6434 [2:50:11<12:10:43,  8.39s/it, gpt_loss=0.336, loss_mean=0.361][A
+Train step of epoch 0:  19%|█▉        | 1211/6434 [2:50:11<12:31:43,  8.64s/it, gpt_loss=0.336, loss_mean=0.361][A
+Train step of epoch 0:  19%|█▉        | 1211/6434 [2:50:19<12:31:43,  8.64s/it, gpt_loss=0.318, loss_mean=0.357][A
+Train step of epoch 0:  19%|█▉        | 1212/6434 [2:50:19<12:14:24,  8.44s/it, gpt_loss=0.318, loss_mean=0.357][A
+Train step of epoch 0:  19%|█▉        | 1212/6434 [2:50:28<12:14:24,  8.44s/it, gpt_loss=0.371, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1213/6434 [2:50:28<12:09:26,  8.38s/it, gpt_loss=0.371, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1213/6434 [2:50:35<12:09:26,  8.38s/it, gpt_loss=0.316, loss_mean=0.354][A
+Train step of epoch 0:  19%|█▉        | 1214/6434 [2:50:35<11:50:35,  8.17s/it, gpt_loss=0.316, loss_mean=0.354][A
+Train step of epoch 0:  19%|█▉        | 1214/6434 [2:50:43<11:50:35,  8.17s/it, gpt_loss=0.358, loss_mean=0.354][A
+Train step of epoch 0:  19%|█▉        | 1215/6434 [2:50:43<11:47:50,  8.14s/it, gpt_loss=0.358, loss_mean=0.354][A
+Train step of epoch 0:  19%|█▉        | 1215/6434 [2:50:52<11:47:50,  8.14s/it, gpt_loss=0.283, loss_mean=0.347][A
+Train step of epoch 0:  19%|█▉        | 1216/6434 [2:50:52<12:04:08,  8.33s/it, gpt_loss=0.283, loss_mean=0.347][A
+Train step of epoch 0:  19%|█▉        | 1216/6434 [2:51:00<12:04:08,  8.33s/it, gpt_loss=0.375, loss_mean=0.35] [A
+Train step of epoch 0:  19%|█▉        | 1217/6434 [2:51:00<11:57:58,  8.26s/it, gpt_loss=0.375, loss_mean=0.35][A
+Train step of epoch 0:  19%|█▉        | 1217/6434 [2:51:08<11:57:58,  8.26s/it, gpt_loss=0.34, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1218/6434 [2:51:08<11:37:18,  8.02s/it, gpt_loss=0.34, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1218/6434 [2:51:16<11:37:18,  8.02s/it, gpt_loss=0.344, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1219/6434 [2:51:16<11:35:40,  8.00s/it, gpt_loss=0.344, loss_mean=0.349][A
+[LID Router Debug] Step: 1220
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [4, 9, 0, 5, 3, 3, 5, 1, 1, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  19%|█▉        | 1219/6434 [2:51:24<11:35:40,  8.00s/it, gpt_loss=0.342, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▉        | 1220/6434 [2:51:24<11:40:50,  8.06s/it, gpt_loss=0.342, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▉        | 1220/6434 [2:51:31<11:40:50,  8.06s/it, gpt_loss=0.302, loss_mean=0.343][A
+Train step of epoch 0:  19%|█▉        | 1221/6434 [2:51:31<11:30:52,  7.95s/it, gpt_loss=0.302, loss_mean=0.343][A
+Train step of epoch 0:  19%|█▉        | 1221/6434 [2:51:39<11:30:52,  7.95s/it, gpt_loss=0.368, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1222/6434 [2:51:39<11:22:13,  7.85s/it, gpt_loss=0.368, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1222/6434 [2:51:48<11:22:13,  7.85s/it, gpt_loss=0.299, loss_mean=0.341][A
+Train step of epoch 0:  19%|█▉        | 1223/6434 [2:51:48<11:38:41,  8.04s/it, gpt_loss=0.299, loss_mean=0.341][A
+Train step of epoch 0:  19%|█▉        | 1223/6434 [2:51:56<11:38:41,  8.04s/it, gpt_loss=0.403, loss_mean=0.347][A
+Train step of epoch 0:  19%|█▉        | 1224/6434 [2:51:56<11:50:02,  8.18s/it, gpt_loss=0.403, loss_mean=0.347][A
+Train step of epoch 0:  19%|█▉        | 1224/6434 [2:52:04<11:50:02,  8.18s/it, gpt_loss=0.312, loss_mean=0.344][A
+Train step of epoch 0:  19%|█▉        | 1225/6434 [2:52:04<11:49:59,  8.18s/it, gpt_loss=0.312, loss_mean=0.344][A
+Train step of epoch 0:  19%|█▉        | 1225/6434 [2:52:12<11:49:59,  8.18s/it, gpt_loss=0.499, loss_mean=0.359][A
+Train step of epoch 0:  19%|█▉        | 1226/6434 [2:52:12<11:49:49,  8.18s/it, gpt_loss=0.499, loss_mean=0.359][A
+Train step of epoch 0:  19%|█▉        | 1226/6434 [2:52:20<11:49:49,  8.18s/it, gpt_loss=0.352, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1227/6434 [2:52:20<11:41:04,  8.08s/it, gpt_loss=0.352, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1227/6434 [2:52:28<11:41:04,  8.08s/it, gpt_loss=0.397, loss_mean=0.362][A
+Train step of epoch 0:  19%|█▉        | 1228/6434 [2:52:28<11:41:28,  8.08s/it, gpt_loss=0.397, loss_mean=0.362][A
+Train step of epoch 0:  19%|█▉        | 1228/6434 [2:52:38<11:41:28,  8.08s/it, gpt_loss=0.283, loss_mean=0.354][A
+Train step of epoch 0:  19%|█▉        | 1229/6434 [2:52:38<12:08:36,  8.40s/it, gpt_loss=0.283, loss_mean=0.354][A
+[LID Router Debug] Step: 1230
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [5, 0, 4, 3, 3, 9, 2, 3, 0, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  19%|█▉        | 1229/6434 [2:52:46<12:08:36,  8.40s/it, gpt_loss=0.339, loss_mean=0.353][A
+Train step of epoch 0:  19%|█▉        | 1230/6434 [2:52:46<12:07:00,  8.38s/it, gpt_loss=0.339, loss_mean=0.353][A
+Train step of epoch 0:  19%|█▉        | 1230/6434 [2:52:54<12:07:00,  8.38s/it, gpt_loss=0.312, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1231/6434 [2:52:54<12:07:22,  8.39s/it, gpt_loss=0.312, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1231/6434 [2:53:02<12:07:22,  8.39s/it, gpt_loss=0.409, loss_mean=0.355][A
+Train step of epoch 0:  19%|█▉        | 1232/6434 [2:53:02<11:53:53,  8.23s/it, gpt_loss=0.409, loss_mean=0.355][A
+Train step of epoch 0:  19%|█▉        | 1232/6434 [2:53:11<11:53:53,  8.23s/it, gpt_loss=0.326, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▉        | 1233/6434 [2:53:11<12:07:31,  8.39s/it, gpt_loss=0.326, loss_mean=0.352][A
+Train step of epoch 0:  19%|█▉        | 1233/6434 [2:53:20<12:07:31,  8.39s/it, gpt_loss=0.38, loss_mean=0.355] [A
+Train step of epoch 0:  19%|█▉        | 1234/6434 [2:53:20<12:31:42,  8.67s/it, gpt_loss=0.38, loss_mean=0.355][A
+Train step of epoch 0:  19%|█▉        | 1234/6434 [2:53:29<12:31:42,  8.67s/it, gpt_loss=0.3, loss_mean=0.349] [A
+Train step of epoch 0:  19%|█▉        | 1235/6434 [2:53:29<12:28:50,  8.64s/it, gpt_loss=0.3, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1235/6434 [2:53:37<12:28:50,  8.64s/it, gpt_loss=0.331, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▉        | 1236/6434 [2:53:37<12:19:47,  8.54s/it, gpt_loss=0.331, loss_mean=0.348][A
+Train step of epoch 0:  19%|█▉        | 1236/6434 [2:53:45<12:19:47,  8.54s/it, gpt_loss=0.336, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1237/6434 [2:53:45<12:06:35,  8.39s/it, gpt_loss=0.336, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1237/6434 [2:53:53<12:06:35,  8.39s/it, gpt_loss=0.33, loss_mean=0.345] [A
+Train step of epoch 0:  19%|█▉        | 1238/6434 [2:53:53<12:04:13,  8.36s/it, gpt_loss=0.33, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▉        | 1238/6434 [2:54:01<12:04:13,  8.36s/it, gpt_loss=0.387, loss_mean=0.349][A
+Train step of epoch 0:  19%|█▉        | 1239/6434 [2:54:01<11:43:12,  8.12s/it, gpt_loss=0.387, loss_mean=0.349][A
+[LID Router Debug] Step: 1240
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 1, 9, 1, 4, 9, 1, 4, 2, 6]
+Active Experts in Batch: {1, 2, 4, 6, 9}
+
+Train step of epoch 0:  19%|█▉        | 1239/6434 [2:54:09<11:43:12,  8.12s/it, gpt_loss=0.315, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1240/6434 [2:54:09<11:50:07,  8.20s/it, gpt_loss=0.315, loss_mean=0.346][A
+Train step of epoch 0:  19%|█▉        | 1240/6434 [2:54:17<11:50:07,  8.20s/it, gpt_loss=0.343, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▉        | 1241/6434 [2:54:17<11:29:18,  7.96s/it, gpt_loss=0.343, loss_mean=0.345][A
+Train step of epoch 0:  19%|█▉        | 1241/6434 [2:54:26<11:29:18,  7.96s/it, gpt_loss=0.267, loss_mean=0.337][A
+Train step of epoch 0:  19%|█▉        | 1242/6434 [2:54:26<11:49:29,  8.20s/it, gpt_loss=0.267, loss_mean=0.337][A
+Train step of epoch 0:  19%|█▉        | 1242/6434 [2:54:36<11:49:29,  8.20s/it, gpt_loss=0.383, loss_mean=0.342][A
+Train step of epoch 0:  19%|█▉        | 1243/6434 [2:54:36<12:39:14,  8.78s/it, gpt_loss=0.383, loss_mean=0.342][A
+Train step of epoch 0:  19%|█▉        | 1243/6434 [2:54:44<12:39:14,  8.78s/it, gpt_loss=0.561, loss_mean=0.364][A
+Train step of epoch 0:  19%|█▉        | 1244/6434 [2:54:44<12:24:37,  8.61s/it, gpt_loss=0.561, loss_mean=0.364][A
+Train step of epoch 0:  19%|█▉        | 1244/6434 [2:54:52<12:24:37,  8.61s/it, gpt_loss=0.391, loss_mean=0.367][A
+Train step of epoch 0:  19%|█▉        | 1245/6434 [2:54:52<12:07:12,  8.41s/it, gpt_loss=0.391, loss_mean=0.367][A
+Train step of epoch 0:  19%|█▉        | 1245/6434 [2:54:59<12:07:12,  8.41s/it, gpt_loss=0.416, loss_mean=0.372][A
+Train step of epoch 0:  19%|█▉        | 1246/6434 [2:54:59<11:47:18,  8.18s/it, gpt_loss=0.416, loss_mean=0.372][A
+Train step of epoch 0:  19%|█▉        | 1246/6434 [2:55:09<11:47:18,  8.18s/it, gpt_loss=0.311, loss_mean=0.365][A
+Train step of epoch 0:  19%|█▉        | 1247/6434 [2:55:09<12:15:12,  8.50s/it, gpt_loss=0.311, loss_mean=0.365][A
+Train step of epoch 0:  19%|█▉        | 1247/6434 [2:55:18<12:15:12,  8.50s/it, gpt_loss=0.413, loss_mean=0.37] [A
+Train step of epoch 0:  19%|█▉        | 1248/6434 [2:55:18<12:27:39,  8.65s/it, gpt_loss=0.413, loss_mean=0.37][A
+Train step of epoch 0:  19%|█▉        | 1248/6434 [2:55:26<12:27:39,  8.65s/it, gpt_loss=0.404, loss_mean=0.374][A
+Train step of epoch 0:  19%|█▉        | 1249/6434 [2:55:26<12:22:42,  8.59s/it, gpt_loss=0.404, loss_mean=0.374][A
+[LID Router Debug] Step: 1250
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [5, 9, 1, 1, 0, 6, 9, 1, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  19%|█▉        | 1249/6434 [2:55:34<12:22:42,  8.59s/it, gpt_loss=0.419, loss_mean=0.378][A
+Train step of epoch 0:  19%|█▉        | 1250/6434 [2:55:34<11:53:12,  8.25s/it, gpt_loss=0.419, loss_mean=0.378][A
+Train step of epoch 0:  19%|█▉        | 1250/6434 [2:55:41<11:53:12,  8.25s/it, gpt_loss=0.349, loss_mean=0.375][A
+Train step of epoch 0:  19%|█▉        | 1251/6434 [2:55:41<11:36:07,  8.06s/it, gpt_loss=0.349, loss_mean=0.375][A
+Train step of epoch 0:  19%|█▉        | 1251/6434 [2:55:50<11:36:07,  8.06s/it, gpt_loss=0.305, loss_mean=0.368][A
+Train step of epoch 0:  19%|█▉        | 1252/6434 [2:55:50<11:54:49,  8.28s/it, gpt_loss=0.305, loss_mean=0.368][A
+Train step of epoch 0:  19%|█▉        | 1252/6434 [2:55:59<11:54:49,  8.28s/it, gpt_loss=0.265, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1253/6434 [2:55:59<12:13:24,  8.49s/it, gpt_loss=0.265, loss_mean=0.358][A
+Train step of epoch 0:  19%|█▉        | 1253/6434 [2:56:06<12:13:24,  8.49s/it, gpt_loss=0.412, loss_mean=0.363][A
+Train step of epoch 0:  19%|█▉        | 1254/6434 [2:56:06<11:43:56,  8.15s/it, gpt_loss=0.412, loss_mean=0.363][A
+Train step of epoch 0:  19%|█▉        | 1254/6434 [2:56:15<11:43:56,  8.15s/it, gpt_loss=0.32, loss_mean=0.359] [A
+Train step of epoch 0:  20%|█▉        | 1255/6434 [2:56:15<11:46:10,  8.18s/it, gpt_loss=0.32, loss_mean=0.359][A
+Train step of epoch 0:  20%|█▉        | 1255/6434 [2:56:24<11:46:10,  8.18s/it, gpt_loss=0.286, loss_mean=0.352][A
+Train step of epoch 0:  20%|█▉        | 1256/6434 [2:56:24<12:18:47,  8.56s/it, gpt_loss=0.286, loss_mean=0.352][A
+Train step of epoch 0:  20%|█▉        | 1256/6434 [2:56:33<12:18:47,  8.56s/it, gpt_loss=0.326, loss_mean=0.349][A
+Train step of epoch 0:  20%|█▉        | 1257/6434 [2:56:33<12:14:45,  8.52s/it, gpt_loss=0.326, loss_mean=0.349][A
+Train step of epoch 0:  20%|█▉        | 1257/6434 [2:56:41<12:14:45,  8.52s/it, gpt_loss=0.307, loss_mean=0.345][A
+Train step of epoch 0:  20%|█▉        | 1258/6434 [2:56:41<12:08:05,  8.44s/it, gpt_loss=0.307, loss_mean=0.345][A
+Train step of epoch 0:  20%|█▉        | 1258/6434 [2:56:49<12:08:05,  8.44s/it, gpt_loss=0.317, loss_mean=0.342][A
+Train step of epoch 0:  20%|█▉        | 1259/6434 [2:56:49<11:57:37,  8.32s/it, gpt_loss=0.317, loss_mean=0.342][A
+[LID Router Debug] Step: 1260
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [5, 9, 1, 0, 3, 1, 11, 2, 5, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9, 11}
+
+Train step of epoch 0:  20%|█▉        | 1259/6434 [2:56:57<11:57:37,  8.32s/it, gpt_loss=0.396, loss_mean=0.348][A
+Train step of epoch 0:  20%|█▉        | 1260/6434 [2:56:57<12:03:45,  8.39s/it, gpt_loss=0.396, loss_mean=0.348][A
+Train step of epoch 0:  20%|█▉        | 1260/6434 [2:57:05<12:03:45,  8.39s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  20%|█▉        | 1261/6434 [2:57:05<11:50:10,  8.24s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  20%|█▉        | 1261/6434 [2:57:12<11:50:10,  8.24s/it, gpt_loss=0.331, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1262/6434 [2:57:12<11:23:03,  7.92s/it, gpt_loss=0.331, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1262/6434 [2:57:20<11:23:03,  7.92s/it, gpt_loss=0.337, loss_mean=0.346][A
+Train step of epoch 0:  20%|█▉        | 1263/6434 [2:57:20<11:13:10,  7.81s/it, gpt_loss=0.337, loss_mean=0.346][A
+Train step of epoch 0:  20%|█▉        | 1263/6434 [2:57:28<11:13:10,  7.81s/it, gpt_loss=0.469, loss_mean=0.358][A
+Train step of epoch 0:  20%|█▉        | 1264/6434 [2:57:28<11:16:23,  7.85s/it, gpt_loss=0.469, loss_mean=0.358][A
+Train step of epoch 0:  20%|█▉        | 1264/6434 [2:57:37<11:16:23,  7.85s/it, gpt_loss=0.295, loss_mean=0.352][A
+Train step of epoch 0:  20%|█▉        | 1265/6434 [2:57:37<11:37:55,  8.10s/it, gpt_loss=0.295, loss_mean=0.352][A
+Train step of epoch 0:  20%|█▉        | 1265/6434 [2:57:45<11:37:55,  8.10s/it, gpt_loss=0.292, loss_mean=0.346][A
+Train step of epoch 0:  20%|█▉        | 1266/6434 [2:57:45<11:36:32,  8.09s/it, gpt_loss=0.292, loss_mean=0.346][A
+Train step of epoch 0:  20%|█▉        | 1266/6434 [2:57:52<11:36:32,  8.09s/it, gpt_loss=0.316, loss_mean=0.343][A
+Train step of epoch 0:  20%|█▉        | 1267/6434 [2:57:52<11:24:13,  7.95s/it, gpt_loss=0.316, loss_mean=0.343][A
+Train step of epoch 0:  20%|█▉        | 1267/6434 [2:58:00<11:24:13,  7.95s/it, gpt_loss=0.455, loss_mean=0.354][A
+Train step of epoch 0:  20%|█▉        | 1268/6434 [2:58:00<11:28:35,  8.00s/it, gpt_loss=0.455, loss_mean=0.354][A
+Train step of epoch 0:  20%|█▉        | 1268/6434 [2:58:08<11:28:35,  8.00s/it, gpt_loss=0.364, loss_mean=0.355][A
+Train step of epoch 0:  20%|█▉        | 1269/6434 [2:58:08<11:24:41,  7.95s/it, gpt_loss=0.364, loss_mean=0.355][A
+[LID Router Debug] Step: 1270
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [1, 4, 3, 9, 0, 1, 3, 1, 4, 4]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:  20%|█▉        | 1269/6434 [2:58:15<11:24:41,  7.95s/it, gpt_loss=0.274, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1270/6434 [2:58:15<11:01:07,  7.68s/it, gpt_loss=0.274, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1270/6434 [2:58:24<11:01:07,  7.68s/it, gpt_loss=0.246, loss_mean=0.337][A
+Train step of epoch 0:  20%|█▉        | 1271/6434 [2:58:24<11:31:19,  8.03s/it, gpt_loss=0.246, loss_mean=0.337][A
+Train step of epoch 0:  20%|█▉        | 1271/6434 [2:58:34<11:31:19,  8.03s/it, gpt_loss=0.331, loss_mean=0.336][A
+Train step of epoch 0:  20%|█▉        | 1272/6434 [2:58:34<12:16:56,  8.57s/it, gpt_loss=0.331, loss_mean=0.336][A
+Train step of epoch 0:  20%|█▉        | 1272/6434 [2:58:42<12:16:56,  8.57s/it, gpt_loss=0.328, loss_mean=0.335][A
+Train step of epoch 0:  20%|█▉        | 1273/6434 [2:58:42<12:07:09,  8.45s/it, gpt_loss=0.328, loss_mean=0.335][A
+Train step of epoch 0:  20%|█▉        | 1273/6434 [2:58:51<12:07:09,  8.45s/it, gpt_loss=0.271, loss_mean=0.329][A
+Train step of epoch 0:  20%|█▉        | 1274/6434 [2:58:51<12:08:53,  8.48s/it, gpt_loss=0.271, loss_mean=0.329][A
+Train step of epoch 0:  20%|█▉        | 1274/6434 [2:59:00<12:08:53,  8.48s/it, gpt_loss=0.381, loss_mean=0.334][A
+Train step of epoch 0:  20%|█▉        | 1275/6434 [2:59:00<12:23:16,  8.64s/it, gpt_loss=0.381, loss_mean=0.334][A
+Train step of epoch 0:  20%|█▉        | 1275/6434 [2:59:09<12:23:16,  8.64s/it, gpt_loss=0.344, loss_mean=0.335][A
+Train step of epoch 0:  20%|█▉        | 1276/6434 [2:59:09<12:36:08,  8.80s/it, gpt_loss=0.344, loss_mean=0.335][A
+Train step of epoch 0:  20%|█▉        | 1276/6434 [2:59:18<12:36:08,  8.80s/it, gpt_loss=0.376, loss_mean=0.339][A
+Train step of epoch 0:  20%|█▉        | 1277/6434 [2:59:18<12:32:39,  8.76s/it, gpt_loss=0.376, loss_mean=0.339][A
+Train step of epoch 0:  20%|█▉        | 1277/6434 [2:59:25<12:32:39,  8.76s/it, gpt_loss=0.4, loss_mean=0.345]  [A
+Train step of epoch 0:  20%|█▉        | 1278/6434 [2:59:25<11:59:29,  8.37s/it, gpt_loss=0.4, loss_mean=0.345][A
+Train step of epoch 0:  20%|█▉        | 1278/6434 [2:59:35<11:59:29,  8.37s/it, gpt_loss=0.348, loss_mean=0.346][A
+Train step of epoch 0:  20%|█▉        | 1279/6434 [2:59:35<12:30:44,  8.74s/it, gpt_loss=0.348, loss_mean=0.346][A
+[LID Router Debug] Step: 1280
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [1, 5, 3, 3, 2, 1, 4, 2, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  20%|█▉        | 1279/6434 [2:59:43<12:30:44,  8.74s/it, gpt_loss=0.302, loss_mean=0.341][A
+Train step of epoch 0:  20%|█▉        | 1280/6434 [2:59:43<12:14:36,  8.55s/it, gpt_loss=0.302, loss_mean=0.341][A
+Train step of epoch 0:  20%|█▉        | 1280/6434 [2:59:51<12:14:36,  8.55s/it, gpt_loss=0.358, loss_mean=0.343][A
+Train step of epoch 0:  20%|█▉        | 1281/6434 [2:59:51<11:56:57,  8.35s/it, gpt_loss=0.358, loss_mean=0.343][A
+Train step of epoch 0:  20%|█▉        | 1281/6434 [2:59:59<11:56:57,  8.35s/it, gpt_loss=0.424, loss_mean=0.351][A
+Train step of epoch 0:  20%|█▉        | 1282/6434 [2:59:59<11:57:16,  8.35s/it, gpt_loss=0.424, loss_mean=0.351][A
+Train step of epoch 0:  20%|█▉        | 1282/6434 [3:00:07<11:57:16,  8.35s/it, gpt_loss=0.418, loss_mean=0.358][A
+Train step of epoch 0:  20%|█▉        | 1283/6434 [3:00:07<11:48:40,  8.25s/it, gpt_loss=0.418, loss_mean=0.358][A
+Train step of epoch 0:  20%|█▉        | 1283/6434 [3:00:15<11:48:40,  8.25s/it, gpt_loss=0.248, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1284/6434 [3:00:15<11:43:06,  8.19s/it, gpt_loss=0.248, loss_mean=0.347][A
+Train step of epoch 0:  20%|█▉        | 1284/6434 [3:00:23<11:43:06,  8.19s/it, gpt_loss=0.294, loss_mean=0.341][A
+Train step of epoch 0:  20%|█▉        | 1285/6434 [3:00:23<11:40:59,  8.17s/it, gpt_loss=0.294, loss_mean=0.341][A
+Train step of epoch 0:  20%|█▉        | 1285/6434 [3:00:31<11:40:59,  8.17s/it, gpt_loss=0.289, loss_mean=0.336][A
+Train step of epoch 0:  20%|█▉        | 1286/6434 [3:00:31<11:36:43,  8.12s/it, gpt_loss=0.289, loss_mean=0.336][A
+Train step of epoch 0:  20%|█▉        | 1286/6434 [3:00:40<11:36:43,  8.12s/it, gpt_loss=0.311, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1287/6434 [3:00:40<11:53:24,  8.32s/it, gpt_loss=0.311, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1287/6434 [3:00:50<11:53:24,  8.32s/it, gpt_loss=0.379, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1288/6434 [3:00:50<12:28:46,  8.73s/it, gpt_loss=0.379, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1288/6434 [3:00:59<12:28:46,  8.73s/it, gpt_loss=0.277, loss_mean=0.332][A
+Train step of epoch 0:  20%|██        | 1289/6434 [3:00:59<12:47:11,  8.95s/it, gpt_loss=0.277, loss_mean=0.332][A
+[LID Router Debug] Step: 1290
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [9, 9, 9, 6, 6, 1, 4, 2, 5, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  20%|██        | 1289/6434 [3:01:08<12:47:11,  8.95s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  20%|██        | 1290/6434 [3:01:08<12:49:34,  8.98s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  20%|██        | 1290/6434 [3:01:17<12:49:34,  8.98s/it, gpt_loss=0.281, loss_mean=0.328][A
+Train step of epoch 0:  20%|██        | 1291/6434 [3:01:17<12:45:17,  8.93s/it, gpt_loss=0.281, loss_mean=0.328][A
+Train step of epoch 0:  20%|██        | 1291/6434 [3:01:24<12:45:17,  8.93s/it, gpt_loss=0.43, loss_mean=0.338] [A
+Train step of epoch 0:  20%|██        | 1292/6434 [3:01:24<11:57:11,  8.37s/it, gpt_loss=0.43, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1292/6434 [3:01:33<11:57:11,  8.37s/it, gpt_loss=0.391, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1293/6434 [3:01:33<12:15:41,  8.59s/it, gpt_loss=0.391, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1293/6434 [3:01:44<12:15:41,  8.59s/it, gpt_loss=0.307, loss_mean=0.34] [A
+Train step of epoch 0:  20%|██        | 1294/6434 [3:01:44<13:07:59,  9.20s/it, gpt_loss=0.307, loss_mean=0.34][A
+Train step of epoch 0:  20%|██        | 1294/6434 [3:01:52<13:07:59,  9.20s/it, gpt_loss=0.32, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1295/6434 [3:01:52<12:56:29,  9.07s/it, gpt_loss=0.32, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1295/6434 [3:02:02<12:56:29,  9.07s/it, gpt_loss=0.389, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1296/6434 [3:02:02<13:07:16,  9.19s/it, gpt_loss=0.389, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1296/6434 [3:02:11<13:07:16,  9.19s/it, gpt_loss=0.342, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1297/6434 [3:02:11<13:05:23,  9.17s/it, gpt_loss=0.342, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1297/6434 [3:02:19<13:05:23,  9.17s/it, gpt_loss=0.373, loss_mean=0.346][A
+Train step of epoch 0:  20%|██        | 1298/6434 [3:02:19<12:33:39,  8.80s/it, gpt_loss=0.373, loss_mean=0.346][A
+Train step of epoch 0:  20%|██        | 1298/6434 [3:02:27<12:33:39,  8.80s/it, gpt_loss=0.313, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1299/6434 [3:02:27<12:12:39,  8.56s/it, gpt_loss=0.313, loss_mean=0.343][A
+[LID Router Debug] Step: 1300
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 3, 9, 1, 3, 4, 5, 5, 2, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  20%|██        | 1299/6434 [3:02:35<12:12:39,  8.56s/it, gpt_loss=0.305, loss_mean=0.339][A
+Train step of epoch 0:  20%|██        | 1300/6434 [3:02:35<11:55:04,  8.36s/it, gpt_loss=0.305, loss_mean=0.339][A
+Train step of epoch 0:  20%|██        | 1300/6434 [3:02:43<11:55:04,  8.36s/it, gpt_loss=0.377, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1301/6434 [3:02:43<11:42:33,  8.21s/it, gpt_loss=0.377, loss_mean=0.343][A
+Train step of epoch 0:  20%|██        | 1301/6434 [3:02:51<11:42:33,  8.21s/it, gpt_loss=0.292, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1302/6434 [3:02:51<11:45:55,  8.25s/it, gpt_loss=0.292, loss_mean=0.338][A
+Train step of epoch 0:  20%|██        | 1302/6434 [3:02:59<11:45:55,  8.25s/it, gpt_loss=0.297, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1303/6434 [3:02:59<11:43:30,  8.23s/it, gpt_loss=0.297, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1303/6434 [3:03:07<11:43:30,  8.23s/it, gpt_loss=0.246, loss_mean=0.325][A
+Train step of epoch 0:  20%|██        | 1304/6434 [3:03:07<11:35:46,  8.14s/it, gpt_loss=0.246, loss_mean=0.325][A
+Train step of epoch 0:  20%|██        | 1304/6434 [3:03:16<11:35:46,  8.14s/it, gpt_loss=0.363, loss_mean=0.329][A
+Train step of epoch 0:  20%|██        | 1305/6434 [3:03:16<11:40:27,  8.19s/it, gpt_loss=0.363, loss_mean=0.329][A
+Train step of epoch 0:  20%|██        | 1305/6434 [3:03:24<11:40:27,  8.19s/it, gpt_loss=0.396, loss_mean=0.335][A
+Train step of epoch 0:  20%|██        | 1306/6434 [3:03:24<11:51:40,  8.33s/it, gpt_loss=0.396, loss_mean=0.335][A
+Train step of epoch 0:  20%|██        | 1306/6434 [3:03:32<11:51:40,  8.33s/it, gpt_loss=0.287, loss_mean=0.331][A
+Train step of epoch 0:  20%|██        | 1307/6434 [3:03:32<11:47:04,  8.27s/it, gpt_loss=0.287, loss_mean=0.331][A
+Train step of epoch 0:  20%|██        | 1307/6434 [3:03:41<11:47:04,  8.27s/it, gpt_loss=0.252, loss_mean=0.323][A
+Train step of epoch 0:  20%|██        | 1308/6434 [3:03:41<11:51:23,  8.33s/it, gpt_loss=0.252, loss_mean=0.323][A
+Train step of epoch 0:  20%|██        | 1308/6434 [3:03:49<11:51:23,  8.33s/it, gpt_loss=0.326, loss_mean=0.323][A
+Train step of epoch 0:  20%|██        | 1309/6434 [3:03:49<11:36:35,  8.16s/it, gpt_loss=0.326, loss_mean=0.323][A
+[LID Router Debug] Step: 1310
+Batch Size: 10
+Audio Batch Size: 80
+LID Assignments: [0, 6, 0, 4, 0, 5, 0, 0, 5, 1]
+Active Experts in Batch: {0, 1, 4, 5, 6}
+
+Train step of epoch 0:  20%|██        | 1309/6434 [3:03:57<11:36:35,  8.16s/it, gpt_loss=0.286, loss_mean=0.319][A
+Train step of epoch 0:  20%|██        | 1310/6434 [3:03:57<11:39:25,  8.19s/it, gpt_loss=0.286, loss_mean=0.319][A
+Train step of epoch 0:  20%|██        | 1310/6434 [3:04:05<11:39:25,  8.19s/it, gpt_loss=0.338, loss_mean=0.321][A
+Train step of epoch 0:  20%|██        | 1311/6434 [3:04:05<11:43:50,  8.24s/it, gpt_loss=0.338, loss_mean=0.321][A
+Train step of epoch 0:  20%|██        | 1311/6434 [3:04:15<11:43:50,  8.24s/it, gpt_loss=0.403, loss_mean=0.329][A
+Train step of epoch 0:  20%|██        | 1312/6434 [3:04:15<12:24:07,  8.72s/it, gpt_loss=0.403, loss_mean=0.329][A
+Train step of epoch 0:  20%|██        | 1312/6434 [3:04:23<12:24:07,  8.72s/it, gpt_loss=0.372, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1313/6434 [3:04:23<12:09:34,  8.55s/it, gpt_loss=0.372, loss_mean=0.334][A
+Train step of epoch 0:  20%|██        | 1313/6434 [3:04:33<12:09:34,  8.55s/it, gpt_loss=0.29, loss_mean=0.329] [A
+Train step of epoch 0:  20%|██        | 1314/6434 [3:04:33<12:30:31,  8.80s/it, gpt_loss=0.29, loss_mean=0.329][A
+Train step of epoch 0:  20%|██        | 1314/6434 [3:04:40<12:30:31,  8.80s/it, gpt_loss=0.342, loss_mean=0.33][A
+Train step of epoch 0:  20%|██        | 1315/6434 [3:04:40<11:56:43,  8.40s/it, gpt_loss=0.342, loss_mean=0.33][A
+Train step of epoch 0:  20%|██        | 1315/6434 [3:04:48<11:56:43,  8.40s/it, gpt_loss=0.298, loss_mean=0.327][A
+Train step of epoch 0:  20%|██        | 1316/6434 [3:04:48<11:35:25,  8.15s/it, gpt_loss=0.298, loss_mean=0.327][A
+Train step of epoch 0:  20%|██        | 1316/6434 [3:04:55<11:35:25,  8.15s/it, gpt_loss=0.313, loss_mean=0.326][A
+Train step of epoch 0:  20%|██        | 1317/6434 [3:04:55<11:21:55,  8.00s/it, gpt_loss=0.313, loss_mean=0.326][A
+Train step of epoch 0:  20%|██        | 1317/6434 [3:05:04<11:21:55,  8.00s/it, gpt_loss=0.376, loss_mean=0.331][A
+Train step of epoch 0:  20%|██        | 1318/6434 [3:05:04<11:29:20,  8.08s/it, gpt_loss=0.376, loss_mean=0.331][A
+Train step of epoch 0:  20%|██        | 1318/6434 [3:05:12<11:29:20,  8.08s/it, gpt_loss=0.32, loss_mean=0.33]  [A
+Train step of epoch 0:  21%|██        | 1319/6434 [3:05:12<11:41:04,  8.22s/it, gpt_loss=0.32, loss_mean=0.33][A
+[LID Router Debug] Step: 1320
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [2, 4, 6, 5, 1, 2, 4, 1, 2, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  21%|██        | 1319/6434 [3:05:20<11:41:04,  8.22s/it, gpt_loss=0.319, loss_mean=0.329][A
+Train step of epoch 0:  21%|██        | 1320/6434 [3:05:20<11:33:32,  8.14s/it, gpt_loss=0.319, loss_mean=0.329][A
+Train step of epoch 0:  21%|██        | 1320/6434 [3:05:28<11:33:32,  8.14s/it, gpt_loss=0.364, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1321/6434 [3:05:28<11:38:07,  8.19s/it, gpt_loss=0.364, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1321/6434 [3:05:38<11:38:07,  8.19s/it, gpt_loss=0.334, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1322/6434 [3:05:38<12:17:03,  8.65s/it, gpt_loss=0.334, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1322/6434 [3:05:46<12:17:03,  8.65s/it, gpt_loss=0.305, loss_mean=0.33] [A
+Train step of epoch 0:  21%|██        | 1323/6434 [3:05:46<12:10:58,  8.58s/it, gpt_loss=0.305, loss_mean=0.33][A
+Train step of epoch 0:  21%|██        | 1323/6434 [3:05:55<12:10:58,  8.58s/it, gpt_loss=0.356, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1324/6434 [3:05:55<12:06:20,  8.53s/it, gpt_loss=0.356, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1324/6434 [3:06:03<12:06:20,  8.53s/it, gpt_loss=0.337, loss_mean=0.333][A
+Train step of epoch 0:  21%|██        | 1325/6434 [3:06:03<12:01:54,  8.48s/it, gpt_loss=0.337, loss_mean=0.333][A
+Train step of epoch 0:  21%|██        | 1325/6434 [3:06:11<12:01:54,  8.48s/it, gpt_loss=0.324, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1326/6434 [3:06:11<11:54:32,  8.39s/it, gpt_loss=0.324, loss_mean=0.332][A
+Train step of epoch 0:  21%|██        | 1326/6434 [3:06:20<11:54:32,  8.39s/it, gpt_loss=0.393, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1327/6434 [3:06:20<11:48:33,  8.32s/it, gpt_loss=0.393, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1327/6434 [3:06:28<11:48:33,  8.32s/it, gpt_loss=0.366, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1328/6434 [3:06:28<12:01:25,  8.48s/it, gpt_loss=0.366, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1328/6434 [3:06:36<12:01:25,  8.48s/it, gpt_loss=0.346, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1329/6434 [3:06:36<11:35:27,  8.17s/it, gpt_loss=0.346, loss_mean=0.341][A
+[LID Router Debug] Step: 1330
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [2, 5, 5, 9, 2, 0, 3, 2, 9, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  21%|██        | 1329/6434 [3:06:44<11:35:27,  8.17s/it, gpt_loss=0.307, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1330/6434 [3:06:44<11:28:49,  8.10s/it, gpt_loss=0.307, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1330/6434 [3:06:52<11:28:49,  8.10s/it, gpt_loss=0.393, loss_mean=0.343][A
+Train step of epoch 0:  21%|██        | 1331/6434 [3:06:52<11:38:47,  8.22s/it, gpt_loss=0.393, loss_mean=0.343][A
+Train step of epoch 0:  21%|██        | 1331/6434 [3:07:01<11:38:47,  8.22s/it, gpt_loss=0.323, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1332/6434 [3:07:01<11:51:04,  8.36s/it, gpt_loss=0.323, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1332/6434 [3:07:10<11:51:04,  8.36s/it, gpt_loss=0.314, loss_mean=0.339][A
+Train step of epoch 0:  21%|██        | 1333/6434 [3:07:10<12:06:47,  8.55s/it, gpt_loss=0.314, loss_mean=0.339][A
+Train step of epoch 0:  21%|██        | 1333/6434 [3:07:18<12:06:47,  8.55s/it, gpt_loss=0.406, loss_mean=0.345][A
+Train step of epoch 0:  21%|██        | 1334/6434 [3:07:18<11:51:45,  8.37s/it, gpt_loss=0.406, loss_mean=0.345][A
+Train step of epoch 0:  21%|██        | 1334/6434 [3:07:27<11:51:45,  8.37s/it, gpt_loss=0.332, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1335/6434 [3:07:27<12:12:14,  8.62s/it, gpt_loss=0.332, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1335/6434 [3:07:36<12:12:14,  8.62s/it, gpt_loss=0.32, loss_mean=0.342] [A
+Train step of epoch 0:  21%|██        | 1336/6434 [3:07:36<12:13:10,  8.63s/it, gpt_loss=0.32, loss_mean=0.342][A
+Train step of epoch 0:  21%|██        | 1336/6434 [3:07:44<12:13:10,  8.63s/it, gpt_loss=0.434, loss_mean=0.351][A
+Train step of epoch 0:  21%|██        | 1337/6434 [3:07:44<12:11:12,  8.61s/it, gpt_loss=0.434, loss_mean=0.351][A
+Train step of epoch 0:  21%|██        | 1337/6434 [3:07:54<12:11:12,  8.61s/it, gpt_loss=0.313, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1338/6434 [3:07:54<12:27:53,  8.81s/it, gpt_loss=0.313, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1338/6434 [3:08:03<12:27:53,  8.81s/it, gpt_loss=0.345, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1339/6434 [3:08:03<12:41:32,  8.97s/it, gpt_loss=0.345, loss_mean=0.347][A
+[LID Router Debug] Step: 1340
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [1, 3, 3, 0, 5, 5, 3, 1, 3, 9]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+Train step of epoch 0:  21%|██        | 1339/6434 [3:08:12<12:41:32,  8.97s/it, gpt_loss=0.345, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1340/6434 [3:08:12<12:34:16,  8.88s/it, gpt_loss=0.345, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1340/6434 [3:08:21<12:34:16,  8.88s/it, gpt_loss=0.319, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1341/6434 [3:08:21<12:34:31,  8.89s/it, gpt_loss=0.319, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1341/6434 [3:08:29<12:34:31,  8.89s/it, gpt_loss=0.35, loss_mean=0.345] [A
+Train step of epoch 0:  21%|██        | 1342/6434 [3:08:29<12:18:38,  8.70s/it, gpt_loss=0.35, loss_mean=0.345][A
+Train step of epoch 0:  21%|██        | 1342/6434 [3:08:38<12:18:38,  8.70s/it, gpt_loss=0.29, loss_mean=0.339][A
+Train step of epoch 0:  21%|██        | 1343/6434 [3:08:38<12:24:13,  8.77s/it, gpt_loss=0.29, loss_mean=0.339][A
+Train step of epoch 0:  21%|██        | 1343/6434 [3:08:46<12:24:13,  8.77s/it, gpt_loss=0.344, loss_mean=0.34][A
+Train step of epoch 0:  21%|██        | 1344/6434 [3:08:46<12:21:29,  8.74s/it, gpt_loss=0.344, loss_mean=0.34][A
+Train step of epoch 0:  21%|██        | 1344/6434 [3:08:55<12:21:29,  8.74s/it, gpt_loss=0.377, loss_mean=0.343][A
+Train step of epoch 0:  21%|██        | 1345/6434 [3:08:55<12:27:32,  8.81s/it, gpt_loss=0.377, loss_mean=0.343][A
+Train step of epoch 0:  21%|██        | 1345/6434 [3:09:04<12:27:32,  8.81s/it, gpt_loss=0.293, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1346/6434 [3:09:04<12:27:27,  8.81s/it, gpt_loss=0.293, loss_mean=0.338][A
+Train step of epoch 0:  21%|██        | 1346/6434 [3:09:12<12:27:27,  8.81s/it, gpt_loss=0.326, loss_mean=0.337][A
+Train step of epoch 0:  21%|██        | 1347/6434 [3:09:12<12:01:03,  8.50s/it, gpt_loss=0.326, loss_mean=0.337][A
+Train step of epoch 0:  21%|██        | 1347/6434 [3:09:21<12:01:03,  8.50s/it, gpt_loss=0.323, loss_mean=0.336][A
+Train step of epoch 0:  21%|██        | 1348/6434 [3:09:21<12:09:16,  8.60s/it, gpt_loss=0.323, loss_mean=0.336][A
+Train step of epoch 0:  21%|██        | 1348/6434 [3:09:30<12:09:16,  8.60s/it, gpt_loss=0.377, loss_mean=0.34] [A
+Train step of epoch 0:  21%|██        | 1349/6434 [3:09:30<12:12:21,  8.64s/it, gpt_loss=0.377, loss_mean=0.34][A
+[LID Router Debug] Step: 1350
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [3, 3, 9, 4, 4, 0, 0, 5, 3, 5]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+Train step of epoch 0:  21%|██        | 1349/6434 [3:09:38<12:12:21,  8.64s/it, gpt_loss=0.378, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1350/6434 [3:09:38<12:05:23,  8.56s/it, gpt_loss=0.378, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1350/6434 [3:09:46<12:05:23,  8.56s/it, gpt_loss=0.377, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1351/6434 [3:09:46<12:03:15,  8.54s/it, gpt_loss=0.377, loss_mean=0.347][A
+Train step of epoch 0:  21%|██        | 1351/6434 [3:09:54<12:03:15,  8.54s/it, gpt_loss=0.375, loss_mean=0.35] [A
+Train step of epoch 0:  21%|██        | 1352/6434 [3:09:54<11:38:13,  8.24s/it, gpt_loss=0.375, loss_mean=0.35][A
+Train step of epoch 0:  21%|██        | 1352/6434 [3:10:03<11:38:13,  8.24s/it, gpt_loss=0.316, loss_mean=0.346][A
+Train step of epoch 0:  21%|██        | 1353/6434 [3:10:03<12:00:55,  8.51s/it, gpt_loss=0.316, loss_mean=0.346][A
+Train step of epoch 0:  21%|██        | 1353/6434 [3:10:11<12:00:55,  8.51s/it, gpt_loss=0.553, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1354/6434 [3:10:11<11:36:50,  8.23s/it, gpt_loss=0.553, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1354/6434 [3:10:20<11:36:50,  8.23s/it, gpt_loss=0.367, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1355/6434 [3:10:20<12:12:40,  8.66s/it, gpt_loss=0.367, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1355/6434 [3:10:28<12:12:40,  8.66s/it, gpt_loss=0.315, loss_mean=0.362][A
+Train step of epoch 0:  21%|██        | 1356/6434 [3:10:28<11:40:33,  8.28s/it, gpt_loss=0.315, loss_mean=0.362][A
+Train step of epoch 0:  21%|██        | 1356/6434 [3:10:37<11:40:33,  8.28s/it, gpt_loss=0.412, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1357/6434 [3:10:37<12:04:17,  8.56s/it, gpt_loss=0.412, loss_mean=0.367][A
+Train step of epoch 0:  21%|██        | 1357/6434 [3:10:46<12:04:17,  8.56s/it, gpt_loss=0.301, loss_mean=0.36] [A
+Train step of epoch 0:  21%|██        | 1358/6434 [3:10:46<12:06:04,  8.58s/it, gpt_loss=0.301, loss_mean=0.36][A
+Train step of epoch 0:  21%|██        | 1358/6434 [3:10:54<12:06:04,  8.58s/it, gpt_loss=0.317, loss_mean=0.356][A
+Train step of epoch 0:  21%|██        | 1359/6434 [3:10:54<12:10:16,  8.63s/it, gpt_loss=0.317, loss_mean=0.356][A
+[LID Router Debug] Step: 1360
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [9, 4, 2, 4, 6, 3, 2, 0, 3, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  21%|██        | 1359/6434 [3:11:03<12:10:16,  8.63s/it, gpt_loss=0.415, loss_mean=0.362][A
+Train step of epoch 0:  21%|██        | 1360/6434 [3:11:03<12:01:04,  8.53s/it, gpt_loss=0.415, loss_mean=0.362][A
+Train step of epoch 0:  21%|██        | 1360/6434 [3:11:12<12:01:04,  8.53s/it, gpt_loss=0.354, loss_mean=0.361][A
+Train step of epoch 0:  21%|██        | 1361/6434 [3:11:12<12:10:59,  8.65s/it, gpt_loss=0.354, loss_mean=0.361][A
+Train step of epoch 0:  21%|██        | 1361/6434 [3:11:20<12:10:59,  8.65s/it, gpt_loss=0.265, loss_mean=0.351][A
+Train step of epoch 0:  21%|██        | 1362/6434 [3:11:20<12:07:33,  8.61s/it, gpt_loss=0.265, loss_mean=0.351][A
+Train step of epoch 0:  21%|██        | 1362/6434 [3:11:28<12:07:33,  8.61s/it, gpt_loss=0.295, loss_mean=0.346][A
+Train step of epoch 0:  21%|██        | 1363/6434 [3:11:28<11:57:18,  8.49s/it, gpt_loss=0.295, loss_mean=0.346][A
+Train step of epoch 0:  21%|██        | 1363/6434 [3:11:36<11:57:18,  8.49s/it, gpt_loss=0.297, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1364/6434 [3:11:36<11:33:05,  8.20s/it, gpt_loss=0.297, loss_mean=0.341][A
+Train step of epoch 0:  21%|██        | 1364/6434 [3:11:43<11:33:05,  8.20s/it, gpt_loss=0.349, loss_mean=0.342][A
+Train step of epoch 0:  21%|██        | 1365/6434 [3:11:43<11:15:54,  8.00s/it, gpt_loss=0.349, loss_mean=0.342][A
+Train step of epoch 0:  21%|██        | 1365/6434 [3:11:52<11:15:54,  8.00s/it, gpt_loss=0.362, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1366/6434 [3:11:52<11:32:03,  8.19s/it, gpt_loss=0.362, loss_mean=0.344][A
+Train step of epoch 0:  21%|██        | 1366/6434 [3:12:01<11:32:03,  8.19s/it, gpt_loss=0.445, loss_mean=0.354][A
+Train step of epoch 0:  21%|██        | 1367/6434 [3:12:01<11:55:38,  8.47s/it, gpt_loss=0.445, loss_mean=0.354][A
+Train step of epoch 0:  21%|██        | 1367/6434 [3:12:10<11:55:38,  8.47s/it, gpt_loss=0.32, loss_mean=0.35]  [A
+Train step of epoch 0:  21%|██▏       | 1368/6434 [3:12:10<11:57:56,  8.50s/it, gpt_loss=0.32, loss_mean=0.35][A
+Train step of epoch 0:  21%|██▏       | 1368/6434 [3:12:19<11:57:56,  8.50s/it, gpt_loss=0.331, loss_mean=0.349][A
+Train step of epoch 0:  21%|██▏       | 1369/6434 [3:12:19<12:13:48,  8.69s/it, gpt_loss=0.331, loss_mean=0.349][A
+[LID Router Debug] Step: 1370
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [9, 4, 5, 1, 2, 0, 1, 3, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  21%|██▏       | 1369/6434 [3:12:26<12:13:48,  8.69s/it, gpt_loss=0.388, loss_mean=0.352][A
+Train step of epoch 0:  21%|██▏       | 1370/6434 [3:12:26<11:48:26,  8.39s/it, gpt_loss=0.388, loss_mean=0.352][A
+Train step of epoch 0:  21%|██▏       | 1370/6434 [3:12:35<11:48:26,  8.39s/it, gpt_loss=0.426, loss_mean=0.36] [A
+Train step of epoch 0:  21%|██▏       | 1371/6434 [3:12:35<11:50:23,  8.42s/it, gpt_loss=0.426, loss_mean=0.36][A
+Train step of epoch 0:  21%|██▏       | 1371/6434 [3:12:43<11:50:23,  8.42s/it, gpt_loss=0.389, loss_mean=0.363][A
+Train step of epoch 0:  21%|██▏       | 1372/6434 [3:12:43<11:32:03,  8.20s/it, gpt_loss=0.389, loss_mean=0.363][A
+Train step of epoch 0:  21%|██▏       | 1372/6434 [3:12:50<11:32:03,  8.20s/it, gpt_loss=0.264, loss_mean=0.353][A
+Train step of epoch 0:  21%|██▏       | 1373/6434 [3:12:50<11:10:52,  7.95s/it, gpt_loss=0.264, loss_mean=0.353][A
+Train step of epoch 0:  21%|██▏       | 1373/6434 [3:12:59<11:10:52,  7.95s/it, gpt_loss=0.366, loss_mean=0.354][A
+Train step of epoch 0:  21%|██▏       | 1374/6434 [3:12:59<11:33:37,  8.22s/it, gpt_loss=0.366, loss_mean=0.354][A
+Train step of epoch 0:  21%|██▏       | 1374/6434 [3:13:06<11:33:37,  8.22s/it, gpt_loss=0.419, loss_mean=0.361][A
+Train step of epoch 0:  21%|██▏       | 1375/6434 [3:13:06<11:09:18,  7.94s/it, gpt_loss=0.419, loss_mean=0.361][A
+Train step of epoch 0:  21%|██▏       | 1375/6434 [3:13:14<11:09:18,  7.94s/it, gpt_loss=0.321, loss_mean=0.357][A
+Train step of epoch 0:  21%|██▏       | 1376/6434 [3:13:14<11:08:32,  7.93s/it, gpt_loss=0.321, loss_mean=0.357][A
+Train step of epoch 0:  21%|██▏       | 1376/6434 [3:13:23<11:08:32,  7.93s/it, gpt_loss=0.282, loss_mean=0.349][A
+Train step of epoch 0:  21%|██▏       | 1377/6434 [3:13:23<11:32:52,  8.22s/it, gpt_loss=0.282, loss_mean=0.349][A
+Train step of epoch 0:  21%|██▏       | 1377/6434 [3:13:30<11:32:52,  8.22s/it, gpt_loss=0.359, loss_mean=0.35] [A
+Train step of epoch 0:  21%|██▏       | 1378/6434 [3:13:30<11:10:00,  7.95s/it, gpt_loss=0.359, loss_mean=0.35][A
+Train step of epoch 0:  21%|██▏       | 1378/6434 [3:13:38<11:10:00,  7.95s/it, gpt_loss=0.298, loss_mean=0.345][A
+Train step of epoch 0:  21%|██▏       | 1379/6434 [3:13:38<10:59:45,  7.83s/it, gpt_loss=0.298, loss_mean=0.345][A
+[LID Router Debug] Step: 1380
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [2, 3, 6, 1, 2, 8, 9, 0, 6, 3]
+Active Experts in Batch: {0, 1, 2, 3, 6, 8, 9}
+
+Train step of epoch 0:  21%|██▏       | 1379/6434 [3:13:47<10:59:45,  7.83s/it, gpt_loss=0.315, loss_mean=0.342][A
+Train step of epoch 0:  21%|██▏       | 1380/6434 [3:13:47<11:27:33,  8.16s/it, gpt_loss=0.315, loss_mean=0.342][A
+Train step of epoch 0:  21%|██▏       | 1380/6434 [3:13:56<11:27:33,  8.16s/it, gpt_loss=0.397, loss_mean=0.348][A
+Train step of epoch 0:  21%|██▏       | 1381/6434 [3:13:56<11:41:31,  8.33s/it, gpt_loss=0.397, loss_mean=0.348][A
+Train step of epoch 0:  21%|██▏       | 1381/6434 [3:14:03<11:41:31,  8.33s/it, gpt_loss=0.333, loss_mean=0.346][A
+Train step of epoch 0:  21%|██▏       | 1382/6434 [3:14:03<11:32:04,  8.22s/it, gpt_loss=0.333, loss_mean=0.346][A
+Train step of epoch 0:  21%|██▏       | 1382/6434 [3:14:13<11:32:04,  8.22s/it, gpt_loss=0.319, loss_mean=0.343][A
+Train step of epoch 0:  21%|██▏       | 1383/6434 [3:14:13<11:54:17,  8.48s/it, gpt_loss=0.319, loss_mean=0.343][A
+Train step of epoch 0:  21%|██▏       | 1383/6434 [3:14:21<11:54:17,  8.48s/it, gpt_loss=0.292, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1384/6434 [3:14:21<11:54:45,  8.49s/it, gpt_loss=0.292, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1384/6434 [3:14:28<11:54:45,  8.49s/it, gpt_loss=0.321, loss_mean=0.337][A
+Train step of epoch 0:  22%|██▏       | 1385/6434 [3:14:28<11:21:18,  8.10s/it, gpt_loss=0.321, loss_mean=0.337][A
+Train step of epoch 0:  22%|██▏       | 1385/6434 [3:14:37<11:21:18,  8.10s/it, gpt_loss=0.304, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1386/6434 [3:14:37<11:48:54,  8.43s/it, gpt_loss=0.304, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1386/6434 [3:14:46<11:48:54,  8.43s/it, gpt_loss=0.322, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1387/6434 [3:14:46<11:40:10,  8.32s/it, gpt_loss=0.322, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1387/6434 [3:14:54<11:40:10,  8.32s/it, gpt_loss=0.336, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1388/6434 [3:14:54<11:45:17,  8.39s/it, gpt_loss=0.336, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1388/6434 [3:15:03<11:45:17,  8.39s/it, gpt_loss=0.327, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1389/6434 [3:15:03<11:46:47,  8.41s/it, gpt_loss=0.327, loss_mean=0.332][A
+[LID Router Debug] Step: 1390
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [9, 2, 9, 3, 9, 6, 2, 1, 5, 9]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  22%|██▏       | 1389/6434 [3:15:11<11:46:47,  8.41s/it, gpt_loss=0.354, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1390/6434 [3:15:11<11:57:27,  8.53s/it, gpt_loss=0.354, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1390/6434 [3:15:21<11:57:27,  8.53s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1391/6434 [3:15:21<12:16:31,  8.76s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1391/6434 [3:15:28<12:16:31,  8.76s/it, gpt_loss=0.43, loss_mean=0.342] [A
+Train step of epoch 0:  22%|██▏       | 1392/6434 [3:15:28<11:44:49,  8.39s/it, gpt_loss=0.43, loss_mean=0.342][A
+Train step of epoch 0:  22%|██▏       | 1392/6434 [3:15:36<11:44:49,  8.39s/it, gpt_loss=0.378, loss_mean=0.346][A
+Train step of epoch 0:  22%|██▏       | 1393/6434 [3:15:36<11:38:14,  8.31s/it, gpt_loss=0.378, loss_mean=0.346][A
+Train step of epoch 0:  22%|██▏       | 1393/6434 [3:15:44<11:38:14,  8.31s/it, gpt_loss=0.334, loss_mean=0.345][A
+Train step of epoch 0:  22%|██▏       | 1394/6434 [3:15:44<11:30:18,  8.22s/it, gpt_loss=0.334, loss_mean=0.345][A
+Train step of epoch 0:  22%|██▏       | 1394/6434 [3:15:53<11:30:18,  8.22s/it, gpt_loss=0.548, loss_mean=0.365][A
+Train step of epoch 0:  22%|██▏       | 1395/6434 [3:15:53<11:44:29,  8.39s/it, gpt_loss=0.548, loss_mean=0.365][A
+Train step of epoch 0:  22%|██▏       | 1395/6434 [3:16:02<11:44:29,  8.39s/it, gpt_loss=0.395, loss_mean=0.368][A
+Train step of epoch 0:  22%|██▏       | 1396/6434 [3:16:02<11:59:17,  8.57s/it, gpt_loss=0.395, loss_mean=0.368][A
+Train step of epoch 0:  22%|██▏       | 1396/6434 [3:16:11<11:59:17,  8.57s/it, gpt_loss=0.349, loss_mean=0.366][A
+Train step of epoch 0:  22%|██▏       | 1397/6434 [3:16:11<12:08:05,  8.67s/it, gpt_loss=0.349, loss_mean=0.366][A
+Train step of epoch 0:  22%|██▏       | 1397/6434 [3:16:19<12:08:05,  8.67s/it, gpt_loss=0.289, loss_mean=0.358][A
+Train step of epoch 0:  22%|██▏       | 1398/6434 [3:16:19<12:03:51,  8.62s/it, gpt_loss=0.289, loss_mean=0.358][A
+Train step of epoch 0:  22%|██▏       | 1398/6434 [3:16:28<12:03:51,  8.62s/it, gpt_loss=0.301, loss_mean=0.353][A
+Train step of epoch 0:  22%|██▏       | 1399/6434 [3:16:28<11:54:51,  8.52s/it, gpt_loss=0.301, loss_mean=0.353][A
+[LID Router Debug] Step: 1400
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [2, 4, 5, 9, 4, 3, 0, 4, 9, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+[2026-02-06 19:12:40,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[1.980406566017246e-05, 1.980406566017246e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 19:12:40,888] [INFO] [timer.py:260:stop] epoch=0/micro_step=1400/global_step=700, RunningAvgSamplesPerSec=4.760853561168058, CurrSamplesPerSec=4.7359336050812075, MemAllocated=12.68GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  22%|██▏       | 1399/6434 [3:16:36<11:54:51,  8.52s/it, gpt_loss=0.366, loss_mean=0.354][A
+Train step of epoch 0:  22%|██▏       | 1400/6434 [3:16:36<11:58:00,  8.56s/it, gpt_loss=0.366, loss_mean=0.354][A
+Train step of epoch 0:  22%|██▏       | 1400/6434 [3:16:46<11:58:00,  8.56s/it, gpt_loss=0.309, loss_mean=0.349][A
+Train step of epoch 0:  22%|██▏       | 1401/6434 [3:16:46<12:13:31,  8.74s/it, gpt_loss=0.309, loss_mean=0.349][A
+Train step of epoch 0:  22%|██▏       | 1401/6434 [3:16:54<12:13:31,  8.74s/it, gpt_loss=0.29, loss_mean=0.343] [A
+Train step of epoch 0:  22%|██▏       | 1402/6434 [3:16:54<12:05:42,  8.65s/it, gpt_loss=0.29, loss_mean=0.343][A
+Train step of epoch 0:  22%|██▏       | 1402/6434 [3:17:02<12:05:42,  8.65s/it, gpt_loss=0.301, loss_mean=0.339][A
+Train step of epoch 0:  22%|██▏       | 1403/6434 [3:17:02<11:57:53,  8.56s/it, gpt_loss=0.301, loss_mean=0.339][A
+Train step of epoch 0:  22%|██▏       | 1403/6434 [3:17:11<11:57:53,  8.56s/it, gpt_loss=0.444, loss_mean=0.35] [A
+Train step of epoch 0:  22%|██▏       | 1404/6434 [3:17:11<11:58:37,  8.57s/it, gpt_loss=0.444, loss_mean=0.35][A
+Train step of epoch 0:  22%|██▏       | 1404/6434 [3:17:20<11:58:37,  8.57s/it, gpt_loss=0.291, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1405/6434 [3:17:20<11:57:29,  8.56s/it, gpt_loss=0.291, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1405/6434 [3:17:27<11:57:29,  8.56s/it, gpt_loss=0.373, loss_mean=0.347][A
+Train step of epoch 0:  22%|██▏       | 1406/6434 [3:17:27<11:41:31,  8.37s/it, gpt_loss=0.373, loss_mean=0.347][A
+Train step of epoch 0:  22%|██▏       | 1406/6434 [3:17:37<11:41:31,  8.37s/it, gpt_loss=0.324, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1407/6434 [3:17:37<12:11:30,  8.73s/it, gpt_loss=0.324, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1407/6434 [3:17:45<12:11:30,  8.73s/it, gpt_loss=0.329, loss_mean=0.343][A
+Train step of epoch 0:  22%|██▏       | 1408/6434 [3:17:45<11:44:39,  8.41s/it, gpt_loss=0.329, loss_mean=0.343][A
+Train step of epoch 0:  22%|██▏       | 1408/6434 [3:17:54<11:44:39,  8.41s/it, gpt_loss=0.356, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1409/6434 [3:17:54<11:55:57,  8.55s/it, gpt_loss=0.356, loss_mean=0.344][A
+[LID Router Debug] Step: 1410
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [9, 0, 0, 0, 0, 4, 1, 3, 4, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  22%|██▏       | 1409/6434 [3:18:03<11:55:57,  8.55s/it, gpt_loss=0.347, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1410/6434 [3:18:03<12:19:26,  8.83s/it, gpt_loss=0.347, loss_mean=0.344][A
+Train step of epoch 0:  22%|██▏       | 1410/6434 [3:18:11<12:19:26,  8.83s/it, gpt_loss=0.283, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1411/6434 [3:18:11<12:02:00,  8.62s/it, gpt_loss=0.283, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1411/6434 [3:18:20<12:02:00,  8.62s/it, gpt_loss=0.295, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1412/6434 [3:18:20<11:55:20,  8.55s/it, gpt_loss=0.295, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1412/6434 [3:18:27<11:55:20,  8.55s/it, gpt_loss=0.298, loss_mean=0.33] [A
+Train step of epoch 0:  22%|██▏       | 1413/6434 [3:18:27<11:31:42,  8.27s/it, gpt_loss=0.298, loss_mean=0.33][A
+Train step of epoch 0:  22%|██▏       | 1413/6434 [3:18:34<11:31:42,  8.27s/it, gpt_loss=0.289, loss_mean=0.326][A
+Train step of epoch 0:  22%|██▏       | 1414/6434 [3:18:34<10:58:02,  7.87s/it, gpt_loss=0.289, loss_mean=0.326][A
+Train step of epoch 0:  22%|██▏       | 1414/6434 [3:18:42<10:58:02,  7.87s/it, gpt_loss=0.38, loss_mean=0.332] [A
+Train step of epoch 0:  22%|██▏       | 1415/6434 [3:18:42<11:11:06,  8.02s/it, gpt_loss=0.38, loss_mean=0.332][A
+Train step of epoch 0:  22%|██▏       | 1415/6434 [3:18:51<11:11:06,  8.02s/it, gpt_loss=0.286, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1416/6434 [3:18:51<11:34:59,  8.31s/it, gpt_loss=0.286, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1416/6434 [3:18:59<11:34:59,  8.31s/it, gpt_loss=0.325, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1417/6434 [3:18:59<11:24:49,  8.19s/it, gpt_loss=0.325, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1417/6434 [3:19:08<11:24:49,  8.19s/it, gpt_loss=0.391, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1418/6434 [3:19:08<11:30:24,  8.26s/it, gpt_loss=0.391, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1418/6434 [3:19:16<11:30:24,  8.26s/it, gpt_loss=0.282, loss_mean=0.328][A
+Train step of epoch 0:  22%|██▏       | 1419/6434 [3:19:16<11:39:41,  8.37s/it, gpt_loss=0.282, loss_mean=0.328][A
+[LID Router Debug] Step: 1420
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [3, 9, 9, 4, 3, 6, 1, 5, 6, 5]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  22%|██▏       | 1419/6434 [3:19:25<11:39:41,  8.37s/it, gpt_loss=0.455, loss_mean=0.341][A
+Train step of epoch 0:  22%|██▏       | 1420/6434 [3:19:25<11:51:21,  8.51s/it, gpt_loss=0.455, loss_mean=0.341][A
+Train step of epoch 0:  22%|██▏       | 1420/6434 [3:19:34<11:51:21,  8.51s/it, gpt_loss=0.355, loss_mean=0.342][A
+Train step of epoch 0:  22%|██▏       | 1421/6434 [3:19:34<12:04:57,  8.68s/it, gpt_loss=0.355, loss_mean=0.342][A
+Train step of epoch 0:  22%|██▏       | 1421/6434 [3:19:43<12:04:57,  8.68s/it, gpt_loss=0.305, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1422/6434 [3:19:43<12:09:25,  8.73s/it, gpt_loss=0.305, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1422/6434 [3:19:51<12:09:25,  8.73s/it, gpt_loss=0.311, loss_mean=0.336][A
+Train step of epoch 0:  22%|██▏       | 1423/6434 [3:19:51<11:53:56,  8.55s/it, gpt_loss=0.311, loss_mean=0.336][A
+Train step of epoch 0:  22%|██▏       | 1423/6434 [3:20:00<11:53:56,  8.55s/it, gpt_loss=0.322, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1424/6434 [3:20:00<11:58:15,  8.60s/it, gpt_loss=0.322, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1424/6434 [3:20:08<11:58:15,  8.60s/it, gpt_loss=0.265, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1425/6434 [3:20:08<11:47:44,  8.48s/it, gpt_loss=0.265, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1425/6434 [3:20:17<11:47:44,  8.48s/it, gpt_loss=0.312, loss_mean=0.326][A
+Train step of epoch 0:  22%|██▏       | 1426/6434 [3:20:17<12:03:06,  8.66s/it, gpt_loss=0.312, loss_mean=0.326][A
+Train step of epoch 0:  22%|██▏       | 1426/6434 [3:20:26<12:03:06,  8.66s/it, gpt_loss=0.33, loss_mean=0.326] [A
+Train step of epoch 0:  22%|██▏       | 1427/6434 [3:20:26<12:14:28,  8.80s/it, gpt_loss=0.33, loss_mean=0.326][A
+Train step of epoch 0:  22%|██▏       | 1427/6434 [3:20:34<12:14:28,  8.80s/it, gpt_loss=0.34, loss_mean=0.328][A
+Train step of epoch 0:  22%|██▏       | 1428/6434 [3:20:34<11:40:41,  8.40s/it, gpt_loss=0.34, loss_mean=0.328][A
+Train step of epoch 0:  22%|██▏       | 1428/6434 [3:20:44<11:40:41,  8.40s/it, gpt_loss=0.287, loss_mean=0.324][A
+Train step of epoch 0:  22%|██▏       | 1429/6434 [3:20:44<12:12:10,  8.78s/it, gpt_loss=0.287, loss_mean=0.324][A
+[LID Router Debug] Step: 1430
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [2, 1, 0, 7, 2, 2, 5, 6, 4, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 7}
+
+Train step of epoch 0:  22%|██▏       | 1429/6434 [3:20:52<12:12:10,  8.78s/it, gpt_loss=0.393, loss_mean=0.33] [A
+Train step of epoch 0:  22%|██▏       | 1430/6434 [3:20:52<11:52:57,  8.55s/it, gpt_loss=0.393, loss_mean=0.33][A
+Train step of epoch 0:  22%|██▏       | 1430/6434 [3:21:00<11:52:57,  8.55s/it, gpt_loss=0.354, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1431/6434 [3:21:00<11:45:58,  8.47s/it, gpt_loss=0.354, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1431/6434 [3:21:08<11:45:58,  8.47s/it, gpt_loss=0.377, loss_mean=0.337][A
+Train step of epoch 0:  22%|██▏       | 1432/6434 [3:21:08<11:49:38,  8.51s/it, gpt_loss=0.377, loss_mean=0.337][A
+Train step of epoch 0:  22%|██▏       | 1432/6434 [3:21:17<11:49:38,  8.51s/it, gpt_loss=0.301, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1433/6434 [3:21:17<11:39:58,  8.40s/it, gpt_loss=0.301, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1433/6434 [3:21:25<11:39:58,  8.40s/it, gpt_loss=0.271, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1434/6434 [3:21:25<11:31:54,  8.30s/it, gpt_loss=0.271, loss_mean=0.327][A
+Train step of epoch 0:  22%|██▏       | 1434/6434 [3:21:33<11:31:54,  8.30s/it, gpt_loss=0.388, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1435/6434 [3:21:33<11:30:08,  8.28s/it, gpt_loss=0.388, loss_mean=0.333][A
+Train step of epoch 0:  22%|██▏       | 1435/6434 [3:21:42<11:30:08,  8.28s/it, gpt_loss=0.31, loss_mean=0.331] [A
+Train step of epoch 0:  22%|██▏       | 1436/6434 [3:21:42<11:49:21,  8.52s/it, gpt_loss=0.31, loss_mean=0.331][A
+Train step of epoch 0:  22%|██▏       | 1436/6434 [3:21:49<11:49:21,  8.52s/it, gpt_loss=0.333, loss_mean=0.331][A
+Train step of epoch 0:  22%|██▏       | 1437/6434 [3:21:49<11:21:55,  8.19s/it, gpt_loss=0.333, loss_mean=0.331][A
+Train step of epoch 0:  22%|██▏       | 1437/6434 [3:21:57<11:21:55,  8.19s/it, gpt_loss=0.306, loss_mean=0.329][A
+Train step of epoch 0:  22%|██▏       | 1438/6434 [3:21:57<11:13:59,  8.09s/it, gpt_loss=0.306, loss_mean=0.329][A
+Train step of epoch 0:  22%|██▏       | 1438/6434 [3:22:05<11:13:59,  8.09s/it, gpt_loss=0.378, loss_mean=0.334][A
+Train step of epoch 0:  22%|██▏       | 1439/6434 [3:22:05<11:05:24,  7.99s/it, gpt_loss=0.378, loss_mean=0.334][A
+[LID Router Debug] Step: 1440
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [2, 3, 5, 9, 3, 1, 3, 9, 4, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  22%|██▏       | 1439/6434 [3:22:13<11:05:24,  7.99s/it, gpt_loss=0.352, loss_mean=0.336][A
+Train step of epoch 0:  22%|██▏       | 1440/6434 [3:22:13<11:08:51,  8.04s/it, gpt_loss=0.352, loss_mean=0.336][A
+Train step of epoch 0:  22%|██▏       | 1440/6434 [3:22:21<11:08:51,  8.04s/it, gpt_loss=0.362, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1441/6434 [3:22:21<11:14:03,  8.10s/it, gpt_loss=0.362, loss_mean=0.338][A
+Train step of epoch 0:  22%|██▏       | 1441/6434 [3:22:30<11:14:03,  8.10s/it, gpt_loss=0.361, loss_mean=0.34] [A
+Train step of epoch 0:  22%|██▏       | 1442/6434 [3:22:30<11:24:12,  8.22s/it, gpt_loss=0.361, loss_mean=0.34][A
+Train step of epoch 0:  22%|██▏       | 1442/6434 [3:22:39<11:24:12,  8.22s/it, gpt_loss=0.345, loss_mean=0.341][A
+Train step of epoch 0:  22%|██▏       | 1443/6434 [3:22:39<11:34:40,  8.35s/it, gpt_loss=0.345, loss_mean=0.341][A
+Train step of epoch 0:  22%|██▏       | 1443/6434 [3:22:48<11:34:40,  8.35s/it, gpt_loss=0.34, loss_mean=0.341] [A
+Train step of epoch 0:  22%|██▏       | 1444/6434 [3:22:48<11:48:52,  8.52s/it, gpt_loss=0.34, loss_mean=0.341][A
+Train step of epoch 0:  22%|██▏       | 1444/6434 [3:22:55<11:48:52,  8.52s/it, gpt_loss=0.404, loss_mean=0.347][A
+Train step of epoch 0:  22%|██▏       | 1445/6434 [3:22:55<11:33:56,  8.35s/it, gpt_loss=0.404, loss_mean=0.347][A
+Train step of epoch 0:  22%|██▏       | 1445/6434 [3:23:05<11:33:56,  8.35s/it, gpt_loss=0.435, loss_mean=0.356][A
+Train step of epoch 0:  22%|██▏       | 1446/6434 [3:23:05<12:12:05,  8.81s/it, gpt_loss=0.435, loss_mean=0.356][A
+Train step of epoch 0:  22%|██▏       | 1446/6434 [3:23:13<12:12:05,  8.81s/it, gpt_loss=0.313, loss_mean=0.352][A
+Train step of epoch 0:  22%|██▏       | 1447/6434 [3:23:13<11:40:58,  8.43s/it, gpt_loss=0.313, loss_mean=0.352][A
+Train step of epoch 0:  22%|██▏       | 1447/6434 [3:23:22<11:40:58,  8.43s/it, gpt_loss=0.333, loss_mean=0.35] [A
+Train step of epoch 0:  23%|██▎       | 1448/6434 [3:23:22<11:50:07,  8.55s/it, gpt_loss=0.333, loss_mean=0.35][A
+Train step of epoch 0:  23%|██▎       | 1448/6434 [3:23:30<11:50:07,  8.55s/it, gpt_loss=0.312, loss_mean=0.346][A
+Train step of epoch 0:  23%|██▎       | 1449/6434 [3:23:30<11:38:33,  8.41s/it, gpt_loss=0.312, loss_mean=0.346][A
+[LID Router Debug] Step: 1450
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [3, 0, 4, 9, 5, 1, 6, 3, 1, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  23%|██▎       | 1449/6434 [3:23:38<11:38:33,  8.41s/it, gpt_loss=0.395, loss_mean=0.351][A
+Train step of epoch 0:  23%|██▎       | 1450/6434 [3:23:38<11:29:03,  8.30s/it, gpt_loss=0.395, loss_mean=0.351][A
+Train step of epoch 0:  23%|██▎       | 1450/6434 [3:23:49<11:29:03,  8.30s/it, gpt_loss=0.367, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1451/6434 [3:23:49<12:37:11,  9.12s/it, gpt_loss=0.367, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1451/6434 [3:23:57<12:37:11,  9.12s/it, gpt_loss=0.432, loss_mean=0.36] [A
+Train step of epoch 0:  23%|██▎       | 1452/6434 [3:23:57<12:14:49,  8.85s/it, gpt_loss=0.432, loss_mean=0.36][A
+Train step of epoch 0:  23%|██▎       | 1452/6434 [3:24:06<12:14:49,  8.85s/it, gpt_loss=0.372, loss_mean=0.362][A
+Train step of epoch 0:  23%|██▎       | 1453/6434 [3:24:06<12:11:51,  8.82s/it, gpt_loss=0.372, loss_mean=0.362][A
+Train step of epoch 0:  23%|██▎       | 1453/6434 [3:24:16<12:11:51,  8.82s/it, gpt_loss=0.258, loss_mean=0.351][A
+Train step of epoch 0:  23%|██▎       | 1454/6434 [3:24:16<12:43:58,  9.20s/it, gpt_loss=0.258, loss_mean=0.351][A
+Train step of epoch 0:  23%|██▎       | 1454/6434 [3:24:24<12:43:58,  9.20s/it, gpt_loss=0.327, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1455/6434 [3:24:24<12:25:49,  8.99s/it, gpt_loss=0.327, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1455/6434 [3:24:32<12:25:49,  8.99s/it, gpt_loss=0.389, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1456/6434 [3:24:32<11:43:12,  8.48s/it, gpt_loss=0.389, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1456/6434 [3:24:40<11:43:12,  8.48s/it, gpt_loss=0.343, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1457/6434 [3:24:40<11:29:19,  8.31s/it, gpt_loss=0.343, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1457/6434 [3:24:48<11:29:19,  8.31s/it, gpt_loss=0.383, loss_mean=0.355][A
+Train step of epoch 0:  23%|██▎       | 1458/6434 [3:24:48<11:37:47,  8.41s/it, gpt_loss=0.383, loss_mean=0.355][A
+Train step of epoch 0:  23%|██▎       | 1458/6434 [3:24:57<11:37:47,  8.41s/it, gpt_loss=0.354, loss_mean=0.355][A
+Train step of epoch 0:  23%|██▎       | 1459/6434 [3:24:57<11:46:13,  8.52s/it, gpt_loss=0.354, loss_mean=0.355][A
+[LID Router Debug] Step: 1460
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [2, 3, 9, 5, 0, 2, 2, 6, 9, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  23%|██▎       | 1459/6434 [3:25:05<11:46:13,  8.52s/it, gpt_loss=0.43, loss_mean=0.362] [A
+Train step of epoch 0:  23%|██▎       | 1460/6434 [3:25:05<11:33:41,  8.37s/it, gpt_loss=0.43, loss_mean=0.362][A
+Train step of epoch 0:  23%|██▎       | 1460/6434 [3:25:15<11:33:41,  8.37s/it, gpt_loss=0.336, loss_mean=0.36][A
+Train step of epoch 0:  23%|██▎       | 1461/6434 [3:25:15<12:02:36,  8.72s/it, gpt_loss=0.336, loss_mean=0.36][A
+Train step of epoch 0:  23%|██▎       | 1461/6434 [3:25:23<12:02:36,  8.72s/it, gpt_loss=0.341, loss_mean=0.358][A
+Train step of epoch 0:  23%|██▎       | 1462/6434 [3:25:23<12:01:21,  8.71s/it, gpt_loss=0.341, loss_mean=0.358][A
+Train step of epoch 0:  23%|██▎       | 1462/6434 [3:25:31<12:01:21,  8.71s/it, gpt_loss=0.305, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1463/6434 [3:25:31<11:31:57,  8.35s/it, gpt_loss=0.305, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1463/6434 [3:25:40<11:31:57,  8.35s/it, gpt_loss=0.273, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1464/6434 [3:25:40<11:43:29,  8.49s/it, gpt_loss=0.273, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1464/6434 [3:25:49<11:43:29,  8.49s/it, gpt_loss=0.332, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1465/6434 [3:25:49<12:05:36,  8.76s/it, gpt_loss=0.332, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1465/6434 [3:25:57<12:05:36,  8.76s/it, gpt_loss=0.286, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1466/6434 [3:25:57<11:54:43,  8.63s/it, gpt_loss=0.286, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1466/6434 [3:26:05<11:54:43,  8.63s/it, gpt_loss=0.354, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1467/6434 [3:26:05<11:41:50,  8.48s/it, gpt_loss=0.354, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1467/6434 [3:26:15<11:41:50,  8.48s/it, gpt_loss=0.364, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1468/6434 [3:26:15<11:58:24,  8.68s/it, gpt_loss=0.364, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1468/6434 [3:26:24<11:58:24,  8.68s/it, gpt_loss=0.364, loss_mean=0.344][A
+Train step of epoch 0:  23%|██▎       | 1469/6434 [3:26:24<12:21:52,  8.97s/it, gpt_loss=0.364, loss_mean=0.344][A
+[LID Router Debug] Step: 1470
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [9, 0, 3, 4, 9, 2, 0, 3, 3, 4]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+Train step of epoch 0:  23%|██▎       | 1469/6434 [3:26:33<12:21:52,  8.97s/it, gpt_loss=0.385, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1470/6434 [3:26:33<12:21:54,  8.97s/it, gpt_loss=0.385, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1470/6434 [3:26:43<12:21:54,  8.97s/it, gpt_loss=0.252, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1471/6434 [3:26:43<12:33:07,  9.10s/it, gpt_loss=0.252, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1471/6434 [3:26:50<12:33:07,  9.10s/it, gpt_loss=0.345, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1472/6434 [3:26:50<12:02:07,  8.73s/it, gpt_loss=0.345, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1472/6434 [3:26:59<12:02:07,  8.73s/it, gpt_loss=0.445, loss_mean=0.35] [A
+Train step of epoch 0:  23%|██▎       | 1473/6434 [3:26:59<11:45:35,  8.53s/it, gpt_loss=0.445, loss_mean=0.35][A
+Train step of epoch 0:  23%|██▎       | 1473/6434 [3:27:07<11:45:35,  8.53s/it, gpt_loss=0.371, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1474/6434 [3:27:07<11:35:54,  8.42s/it, gpt_loss=0.371, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1474/6434 [3:27:16<11:35:54,  8.42s/it, gpt_loss=0.36, loss_mean=0.353] [A
+Train step of epoch 0:  23%|██▎       | 1475/6434 [3:27:16<11:46:31,  8.55s/it, gpt_loss=0.36, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1475/6434 [3:27:25<11:46:31,  8.55s/it, gpt_loss=0.311, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1476/6434 [3:27:25<12:06:11,  8.79s/it, gpt_loss=0.311, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1476/6434 [3:27:33<12:06:11,  8.79s/it, gpt_loss=0.307, loss_mean=0.344][A
+Train step of epoch 0:  23%|██▎       | 1477/6434 [3:27:33<12:01:03,  8.73s/it, gpt_loss=0.307, loss_mean=0.344][A
+Train step of epoch 0:  23%|██▎       | 1477/6434 [3:27:43<12:01:03,  8.73s/it, gpt_loss=0.289, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1478/6434 [3:27:43<12:22:03,  8.98s/it, gpt_loss=0.289, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1478/6434 [3:27:52<12:22:03,  8.98s/it, gpt_loss=0.351, loss_mean=0.34] [A
+Train step of epoch 0:  23%|██▎       | 1479/6434 [3:27:52<12:12:56,  8.88s/it, gpt_loss=0.351, loss_mean=0.34][A
+[LID Router Debug] Step: 1480
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [2, 1, 5, 9, 9, 0, 4, 4, 4, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  23%|██▎       | 1479/6434 [3:28:00<12:12:56,  8.88s/it, gpt_loss=0.284, loss_mean=0.334][A
+Train step of epoch 0:  23%|██▎       | 1480/6434 [3:28:00<11:58:59,  8.71s/it, gpt_loss=0.284, loss_mean=0.334][A
+Train step of epoch 0:  23%|██▎       | 1480/6434 [3:28:09<11:58:59,  8.71s/it, gpt_loss=0.355, loss_mean=0.336][A
+Train step of epoch 0:  23%|██▎       | 1481/6434 [3:28:09<11:58:24,  8.70s/it, gpt_loss=0.355, loss_mean=0.336][A
+Train step of epoch 0:  23%|██▎       | 1481/6434 [3:28:17<11:58:24,  8.70s/it, gpt_loss=0.373, loss_mean=0.34] [A
+Train step of epoch 0:  23%|██▎       | 1482/6434 [3:28:17<11:57:50,  8.70s/it, gpt_loss=0.373, loss_mean=0.34][A
+Train step of epoch 0:  23%|██▎       | 1482/6434 [3:28:26<11:57:50,  8.70s/it, gpt_loss=0.407, loss_mean=0.347][A
+Train step of epoch 0:  23%|██▎       | 1483/6434 [3:28:26<11:46:43,  8.56s/it, gpt_loss=0.407, loss_mean=0.347][A
+Train step of epoch 0:  23%|██▎       | 1483/6434 [3:28:34<11:46:43,  8.56s/it, gpt_loss=0.377, loss_mean=0.35] [A
+Train step of epoch 0:  23%|██▎       | 1484/6434 [3:28:34<11:40:46,  8.49s/it, gpt_loss=0.377, loss_mean=0.35][A
+Train step of epoch 0:  23%|██▎       | 1484/6434 [3:28:42<11:40:46,  8.49s/it, gpt_loss=0.343, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1485/6434 [3:28:42<11:34:37,  8.42s/it, gpt_loss=0.343, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1485/6434 [3:28:52<11:34:37,  8.42s/it, gpt_loss=0.35, loss_mean=0.349] [A
+Train step of epoch 0:  23%|██▎       | 1486/6434 [3:28:52<12:01:21,  8.75s/it, gpt_loss=0.35, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1486/6434 [3:29:01<12:01:21,  8.75s/it, gpt_loss=0.348, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1487/6434 [3:29:01<12:18:31,  8.96s/it, gpt_loss=0.348, loss_mean=0.349][A
+Train step of epoch 0:  23%|██▎       | 1487/6434 [3:29:09<12:18:31,  8.96s/it, gpt_loss=0.381, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1488/6434 [3:29:09<11:38:42,  8.48s/it, gpt_loss=0.381, loss_mean=0.352][A
+Train step of epoch 0:  23%|██▎       | 1488/6434 [3:29:17<11:38:42,  8.48s/it, gpt_loss=0.355, loss_mean=0.353][A
+Train step of epoch 0:  23%|██▎       | 1489/6434 [3:29:17<11:33:19,  8.41s/it, gpt_loss=0.355, loss_mean=0.353][A
+[LID Router Debug] Step: 1490
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [2, 0, 5, 4, 4, 2, 1, 6, 6, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  23%|██▎       | 1489/6434 [3:29:25<11:33:19,  8.41s/it, gpt_loss=0.261, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1490/6434 [3:29:25<11:29:48,  8.37s/it, gpt_loss=0.261, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1490/6434 [3:29:34<11:29:48,  8.37s/it, gpt_loss=0.31, loss_mean=0.34]  [A
+Train step of epoch 0:  23%|██▎       | 1491/6434 [3:29:34<11:35:25,  8.44s/it, gpt_loss=0.31, loss_mean=0.34][A
+Train step of epoch 0:  23%|██▎       | 1491/6434 [3:29:42<11:35:25,  8.44s/it, gpt_loss=0.313, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1492/6434 [3:29:42<11:22:38,  8.29s/it, gpt_loss=0.313, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1492/6434 [3:29:49<11:22:38,  8.29s/it, gpt_loss=0.331, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1493/6434 [3:29:49<11:08:09,  8.11s/it, gpt_loss=0.331, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1493/6434 [3:29:58<11:08:09,  8.11s/it, gpt_loss=0.338, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1494/6434 [3:29:58<11:10:04,  8.14s/it, gpt_loss=0.338, loss_mean=0.337][A
+Train step of epoch 0:  23%|██▎       | 1494/6434 [3:30:05<11:10:04,  8.14s/it, gpt_loss=0.359, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1495/6434 [3:30:05<11:05:20,  8.08s/it, gpt_loss=0.359, loss_mean=0.339][A
+Train step of epoch 0:  23%|██▎       | 1495/6434 [3:30:13<11:05:20,  8.08s/it, gpt_loss=0.372, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1496/6434 [3:30:13<10:52:55,  7.93s/it, gpt_loss=0.372, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1496/6434 [3:30:22<10:52:55,  7.93s/it, gpt_loss=0.323, loss_mean=0.34] [A
+Train step of epoch 0:  23%|██▎       | 1497/6434 [3:30:22<11:23:57,  8.31s/it, gpt_loss=0.323, loss_mean=0.34][A
+Train step of epoch 0:  23%|██▎       | 1497/6434 [3:30:30<11:23:57,  8.31s/it, gpt_loss=0.298, loss_mean=0.336][A
+Train step of epoch 0:  23%|██▎       | 1498/6434 [3:30:30<11:13:58,  8.19s/it, gpt_loss=0.298, loss_mean=0.336][A
+Train step of epoch 0:  23%|██▎       | 1498/6434 [3:30:39<11:13:58,  8.19s/it, gpt_loss=0.377, loss_mean=0.34] [A
+Train step of epoch 0:  23%|██▎       | 1499/6434 [3:30:39<11:32:08,  8.42s/it, gpt_loss=0.377, loss_mean=0.34][A
+[LID Router Debug] Step: 1500
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [5, 5, 9, 9, 2, 2, 5, 3, 2, 0]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  23%|██▎       | 1499/6434 [3:30:48<11:32:08,  8.42s/it, gpt_loss=0.344, loss_mean=0.341][A
+Train step of epoch 0:  23%|██▎       | 1500/6434 [3:30:48<11:38:28,  8.49s/it, gpt_loss=0.344, loss_mean=0.341][A
+Train step of epoch 0:  23%|██▎       | 1500/6434 [3:30:56<11:38:28,  8.49s/it, gpt_loss=0.355, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1501/6434 [3:30:56<11:24:01,  8.32s/it, gpt_loss=0.355, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1501/6434 [3:31:06<11:24:01,  8.32s/it, gpt_loss=0.403, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1502/6434 [3:31:06<12:12:10,  8.91s/it, gpt_loss=0.403, loss_mean=0.348][A
+Train step of epoch 0:  23%|██▎       | 1502/6434 [3:31:16<12:12:10,  8.91s/it, gpt_loss=0.327, loss_mean=0.346][A
+Train step of epoch 0:  23%|██▎       | 1503/6434 [3:31:16<12:30:17,  9.13s/it, gpt_loss=0.327, loss_mean=0.346][A
+Train step of epoch 0:  23%|██▎       | 1503/6434 [3:31:24<12:30:17,  9.13s/it, gpt_loss=0.313, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1504/6434 [3:31:24<12:12:29,  8.91s/it, gpt_loss=0.313, loss_mean=0.343][A
+Train step of epoch 0:  23%|██▎       | 1504/6434 [3:31:33<12:12:29,  8.91s/it, gpt_loss=0.369, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1505/6434 [3:31:33<12:17:47,  8.98s/it, gpt_loss=0.369, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1505/6434 [3:31:42<12:17:47,  8.98s/it, gpt_loss=0.277, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1506/6434 [3:31:42<12:11:03,  8.90s/it, gpt_loss=0.277, loss_mean=0.338][A
+Train step of epoch 0:  23%|██▎       | 1506/6434 [3:31:50<12:11:03,  8.90s/it, gpt_loss=0.378, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1507/6434 [3:31:50<11:48:44,  8.63s/it, gpt_loss=0.378, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1507/6434 [3:31:58<11:48:44,  8.63s/it, gpt_loss=0.368, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1508/6434 [3:31:58<11:34:43,  8.46s/it, gpt_loss=0.368, loss_mean=0.345][A
+Train step of epoch 0:  23%|██▎       | 1508/6434 [3:32:05<11:34:43,  8.46s/it, gpt_loss=0.312, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1509/6434 [3:32:05<11:11:32,  8.18s/it, gpt_loss=0.312, loss_mean=0.342][A
+[LID Router Debug] Step: 1510
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [7, 9, 4, 3, 4, 5, 8, 2, 0, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 7, 8, 9}
+
+Train step of epoch 0:  23%|██▎       | 1509/6434 [3:32:14<11:11:32,  8.18s/it, gpt_loss=0.347, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1510/6434 [3:32:14<11:11:32,  8.18s/it, gpt_loss=0.347, loss_mean=0.342][A
+Train step of epoch 0:  23%|██▎       | 1510/6434 [3:32:22<11:11:32,  8.18s/it, gpt_loss=0.263, loss_mean=0.334][A
+Train step of epoch 0:  23%|██▎       | 1511/6434 [3:32:22<11:23:49,  8.33s/it, gpt_loss=0.263, loss_mean=0.334][A
+Train step of epoch 0:  23%|██▎       | 1511/6434 [3:32:31<11:23:49,  8.33s/it, gpt_loss=0.283, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▎       | 1512/6434 [3:32:31<11:22:52,  8.32s/it, gpt_loss=0.283, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▎       | 1512/6434 [3:32:39<11:22:52,  8.32s/it, gpt_loss=0.288, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▎       | 1513/6434 [3:32:39<11:33:00,  8.45s/it, gpt_loss=0.288, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▎       | 1513/6434 [3:32:47<11:33:00,  8.45s/it, gpt_loss=0.347, loss_mean=0.327][A
+Train step of epoch 0:  24%|██▎       | 1514/6434 [3:32:47<11:15:37,  8.24s/it, gpt_loss=0.347, loss_mean=0.327][A
+Train step of epoch 0:  24%|██▎       | 1514/6434 [3:32:55<11:15:37,  8.24s/it, gpt_loss=0.413, loss_mean=0.336][A
+Train step of epoch 0:  24%|██▎       | 1515/6434 [3:32:55<11:06:58,  8.14s/it, gpt_loss=0.413, loss_mean=0.336][A
+Train step of epoch 0:  24%|██▎       | 1515/6434 [3:33:03<11:06:58,  8.14s/it, gpt_loss=0.326, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▎       | 1516/6434 [3:33:03<11:02:31,  8.08s/it, gpt_loss=0.326, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▎       | 1516/6434 [3:33:11<11:02:31,  8.08s/it, gpt_loss=0.313, loss_mean=0.333][A
+Train step of epoch 0:  24%|██▎       | 1517/6434 [3:33:11<11:03:32,  8.10s/it, gpt_loss=0.313, loss_mean=0.333][A
+Train step of epoch 0:  24%|██▎       | 1517/6434 [3:33:21<11:03:32,  8.10s/it, gpt_loss=0.259, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▎       | 1518/6434 [3:33:21<11:38:08,  8.52s/it, gpt_loss=0.259, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▎       | 1518/6434 [3:33:29<11:38:08,  8.52s/it, gpt_loss=0.413, loss_mean=0.334][A
+Train step of epoch 0:  24%|██▎       | 1519/6434 [3:33:29<11:45:44,  8.62s/it, gpt_loss=0.413, loss_mean=0.334][A
+[LID Router Debug] Step: 1520
+Batch Size: 10
+Audio Batch Size: 133
+LID Assignments: [8, 0, 6, 2, 3, 9, 3, 4, 6, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 8, 9}
+
+Train step of epoch 0:  24%|██▎       | 1519/6434 [3:33:38<11:45:44,  8.62s/it, gpt_loss=0.463, loss_mean=0.347][A
+Train step of epoch 0:  24%|██▎       | 1520/6434 [3:33:38<11:48:40,  8.65s/it, gpt_loss=0.463, loss_mean=0.347][A
+Train step of epoch 0:  24%|██▎       | 1520/6434 [3:33:47<11:48:40,  8.65s/it, gpt_loss=0.368, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▎       | 1521/6434 [3:33:47<11:55:59,  8.74s/it, gpt_loss=0.368, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▎       | 1521/6434 [3:33:56<11:55:59,  8.74s/it, gpt_loss=0.289, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▎       | 1522/6434 [3:33:56<11:59:35,  8.79s/it, gpt_loss=0.289, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▎       | 1522/6434 [3:34:04<11:59:35,  8.79s/it, gpt_loss=0.347, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▎       | 1523/6434 [3:34:04<11:47:44,  8.65s/it, gpt_loss=0.347, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▎       | 1523/6434 [3:34:13<11:47:44,  8.65s/it, gpt_loss=0.326, loss_mean=0.342][A
+Train step of epoch 0:  24%|██▎       | 1524/6434 [3:34:13<11:39:18,  8.55s/it, gpt_loss=0.326, loss_mean=0.342][A
+Train step of epoch 0:  24%|██▎       | 1524/6434 [3:34:22<11:39:18,  8.55s/it, gpt_loss=0.27, loss_mean=0.335] [A
+Train step of epoch 0:  24%|██▎       | 1525/6434 [3:34:22<11:46:54,  8.64s/it, gpt_loss=0.27, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▎       | 1525/6434 [3:34:30<11:46:54,  8.64s/it, gpt_loss=0.271, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▎       | 1526/6434 [3:34:30<11:41:14,  8.57s/it, gpt_loss=0.271, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▎       | 1526/6434 [3:34:38<11:41:14,  8.57s/it, gpt_loss=0.369, loss_mean=0.332][A
+Train step of epoch 0:  24%|██▎       | 1527/6434 [3:34:38<11:19:54,  8.31s/it, gpt_loss=0.369, loss_mean=0.332][A
+Train step of epoch 0:  24%|██▎       | 1527/6434 [3:34:46<11:19:54,  8.31s/it, gpt_loss=0.31, loss_mean=0.33]  [A
+Train step of epoch 0:  24%|██▎       | 1528/6434 [3:34:46<11:28:18,  8.42s/it, gpt_loss=0.31, loss_mean=0.33][A
+Train step of epoch 0:  24%|██▎       | 1528/6434 [3:34:55<11:28:18,  8.42s/it, gpt_loss=0.312, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1529/6434 [3:34:55<11:27:25,  8.41s/it, gpt_loss=0.312, loss_mean=0.328][A
+[LID Router Debug] Step: 1530
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [6, 2, 6, 9, 3, 9, 4, 9, 6, 1]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  24%|██▍       | 1529/6434 [3:35:03<11:27:25,  8.41s/it, gpt_loss=0.35, loss_mean=0.33]  [A
+Train step of epoch 0:  24%|██▍       | 1530/6434 [3:35:03<11:17:18,  8.29s/it, gpt_loss=0.35, loss_mean=0.33][A
+Train step of epoch 0:  24%|██▍       | 1530/6434 [3:35:11<11:17:18,  8.29s/it, gpt_loss=0.314, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1531/6434 [3:35:11<11:28:19,  8.42s/it, gpt_loss=0.314, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1531/6434 [3:35:20<11:28:19,  8.42s/it, gpt_loss=0.317, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1532/6434 [3:35:20<11:27:47,  8.42s/it, gpt_loss=0.317, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1532/6434 [3:35:28<11:27:47,  8.42s/it, gpt_loss=0.33, loss_mean=0.328] [A
+Train step of epoch 0:  24%|██▍       | 1533/6434 [3:35:28<11:15:25,  8.27s/it, gpt_loss=0.33, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1533/6434 [3:35:36<11:15:25,  8.27s/it, gpt_loss=0.287, loss_mean=0.324][A
+Train step of epoch 0:  24%|██▍       | 1534/6434 [3:35:36<11:23:27,  8.37s/it, gpt_loss=0.287, loss_mean=0.324][A
+Train step of epoch 0:  24%|██▍       | 1534/6434 [3:35:45<11:23:27,  8.37s/it, gpt_loss=0.36, loss_mean=0.327] [A
+Train step of epoch 0:  24%|██▍       | 1535/6434 [3:35:45<11:40:21,  8.58s/it, gpt_loss=0.36, loss_mean=0.327][A
+Train step of epoch 0:  24%|██▍       | 1535/6434 [3:35:54<11:40:21,  8.58s/it, gpt_loss=0.31, loss_mean=0.326][A
+Train step of epoch 0:  24%|██▍       | 1536/6434 [3:35:54<11:51:18,  8.71s/it, gpt_loss=0.31, loss_mean=0.326][A
+Train step of epoch 0:  24%|██▍       | 1536/6434 [3:36:03<11:51:18,  8.71s/it, gpt_loss=0.362, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1537/6434 [3:36:03<11:45:33,  8.64s/it, gpt_loss=0.362, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1537/6434 [3:36:10<11:45:33,  8.64s/it, gpt_loss=0.379, loss_mean=0.334][A
+Train step of epoch 0:  24%|██▍       | 1538/6434 [3:36:10<11:01:43,  8.11s/it, gpt_loss=0.379, loss_mean=0.334][A
+Train step of epoch 0:  24%|██▍       | 1538/6434 [3:36:18<11:01:43,  8.11s/it, gpt_loss=0.341, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▍       | 1539/6434 [3:36:18<10:57:52,  8.06s/it, gpt_loss=0.341, loss_mean=0.335][A
+[LID Router Debug] Step: 1540
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [3, 3, 2, 4, 5, 4, 0, 1, 6, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  24%|██▍       | 1539/6434 [3:36:25<10:57:52,  8.06s/it, gpt_loss=0.311, loss_mean=0.332][A
+Train step of epoch 0:  24%|██▍       | 1540/6434 [3:36:25<10:46:10,  7.92s/it, gpt_loss=0.311, loss_mean=0.332][A
+Train step of epoch 0:  24%|██▍       | 1540/6434 [3:36:33<10:46:10,  7.92s/it, gpt_loss=0.302, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1541/6434 [3:36:33<10:45:59,  7.92s/it, gpt_loss=0.302, loss_mean=0.329][A
+Train step of epoch 0:  24%|██▍       | 1541/6434 [3:36:41<10:45:59,  7.92s/it, gpt_loss=0.44, loss_mean=0.34]  [A
+Train step of epoch 0:  24%|██▍       | 1542/6434 [3:36:41<10:46:47,  7.93s/it, gpt_loss=0.44, loss_mean=0.34][A
+Train step of epoch 0:  24%|██▍       | 1542/6434 [3:36:50<10:46:47,  7.93s/it, gpt_loss=0.356, loss_mean=0.342][A
+Train step of epoch 0:  24%|██▍       | 1543/6434 [3:36:50<11:06:03,  8.17s/it, gpt_loss=0.356, loss_mean=0.342][A
+Train step of epoch 0:  24%|██▍       | 1543/6434 [3:36:59<11:06:03,  8.17s/it, gpt_loss=0.379, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1544/6434 [3:36:59<11:14:43,  8.28s/it, gpt_loss=0.379, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1544/6434 [3:37:07<11:14:43,  8.28s/it, gpt_loss=0.379, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1545/6434 [3:37:07<11:14:10,  8.27s/it, gpt_loss=0.379, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1545/6434 [3:37:14<11:14:10,  8.27s/it, gpt_loss=0.346, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1546/6434 [3:37:14<10:50:24,  7.98s/it, gpt_loss=0.346, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1546/6434 [3:37:22<10:50:24,  7.98s/it, gpt_loss=0.347, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1547/6434 [3:37:22<10:41:10,  7.87s/it, gpt_loss=0.347, loss_mean=0.349][A
+Train step of epoch 0:  24%|██▍       | 1547/6434 [3:37:31<10:41:10,  7.87s/it, gpt_loss=0.292, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▍       | 1548/6434 [3:37:31<11:07:34,  8.20s/it, gpt_loss=0.292, loss_mean=0.343][A
+Train step of epoch 0:  24%|██▍       | 1548/6434 [3:37:39<11:07:34,  8.20s/it, gpt_loss=0.29, loss_mean=0.338] [A
+Train step of epoch 0:  24%|██▍       | 1549/6434 [3:37:39<11:15:27,  8.30s/it, gpt_loss=0.29, loss_mean=0.338][A
+[LID Router Debug] Step: 1550
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 1, 5, 3, 9, 2, 1, 0, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  24%|██▍       | 1549/6434 [3:37:46<11:15:27,  8.30s/it, gpt_loss=0.338, loss_mean=0.338][A
+Train step of epoch 0:  24%|██▍       | 1550/6434 [3:37:46<10:51:34,  8.00s/it, gpt_loss=0.338, loss_mean=0.338][A
+Train step of epoch 0:  24%|██▍       | 1550/6434 [3:37:55<10:51:34,  8.00s/it, gpt_loss=0.327, loss_mean=0.337][A
+Train step of epoch 0:  24%|██▍       | 1551/6434 [3:37:55<10:53:11,  8.03s/it, gpt_loss=0.327, loss_mean=0.337][A
+Train step of epoch 0:  24%|██▍       | 1551/6434 [3:38:03<10:53:11,  8.03s/it, gpt_loss=0.332, loss_mean=0.336][A
+Train step of epoch 0:  24%|██▍       | 1552/6434 [3:38:03<10:55:35,  8.06s/it, gpt_loss=0.332, loss_mean=0.336][A
+Train step of epoch 0:  24%|██▍       | 1552/6434 [3:38:12<10:55:35,  8.06s/it, gpt_loss=0.436, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1553/6434 [3:38:12<11:32:59,  8.52s/it, gpt_loss=0.436, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1553/6434 [3:38:22<11:32:59,  8.52s/it, gpt_loss=0.432, loss_mean=0.355][A
+Train step of epoch 0:  24%|██▍       | 1554/6434 [3:38:22<11:56:12,  8.81s/it, gpt_loss=0.432, loss_mean=0.355][A
+Train step of epoch 0:  24%|██▍       | 1554/6434 [3:38:30<11:56:12,  8.81s/it, gpt_loss=0.309, loss_mean=0.35] [A
+Train step of epoch 0:  24%|██▍       | 1555/6434 [3:38:30<11:53:37,  8.78s/it, gpt_loss=0.309, loss_mean=0.35][A
+Train step of epoch 0:  24%|██▍       | 1555/6434 [3:38:39<11:53:37,  8.78s/it, gpt_loss=0.289, loss_mean=0.344][A
+Train step of epoch 0:  24%|██▍       | 1556/6434 [3:38:39<11:40:44,  8.62s/it, gpt_loss=0.289, loss_mean=0.344][A
+Train step of epoch 0:  24%|██▍       | 1556/6434 [3:38:48<11:40:44,  8.62s/it, gpt_loss=0.446, loss_mean=0.354][A
+Train step of epoch 0:  24%|██▍       | 1557/6434 [3:38:48<12:03:54,  8.91s/it, gpt_loss=0.446, loss_mean=0.354][A
+Train step of epoch 0:  24%|██▍       | 1557/6434 [3:38:56<12:03:54,  8.91s/it, gpt_loss=0.345, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1558/6434 [3:38:56<11:33:53,  8.54s/it, gpt_loss=0.345, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1558/6434 [3:39:04<11:33:53,  8.54s/it, gpt_loss=0.339, loss_mean=0.352][A
+Train step of epoch 0:  24%|██▍       | 1559/6434 [3:39:04<11:24:26,  8.42s/it, gpt_loss=0.339, loss_mean=0.352][A
+[LID Router Debug] Step: 1560
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [3, 5, 0, 6, 4, 4, 4, 1, 1, 0]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6}
+
+Train step of epoch 0:  24%|██▍       | 1559/6434 [3:39:14<11:24:26,  8.42s/it, gpt_loss=0.347, loss_mean=0.351][A
+Train step of epoch 0:  24%|██▍       | 1560/6434 [3:39:14<11:57:46,  8.84s/it, gpt_loss=0.347, loss_mean=0.351][A
+Train step of epoch 0:  24%|██▍       | 1560/6434 [3:39:22<11:57:46,  8.84s/it, gpt_loss=0.368, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1561/6434 [3:39:22<11:35:08,  8.56s/it, gpt_loss=0.368, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1561/6434 [3:39:31<11:35:08,  8.56s/it, gpt_loss=0.352, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1562/6434 [3:39:31<11:48:44,  8.73s/it, gpt_loss=0.352, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1562/6434 [3:39:38<11:48:44,  8.73s/it, gpt_loss=0.372, loss_mean=0.355][A
+Train step of epoch 0:  24%|██▍       | 1563/6434 [3:39:38<11:08:48,  8.24s/it, gpt_loss=0.372, loss_mean=0.355][A
+Train step of epoch 0:  24%|██▍       | 1563/6434 [3:39:47<11:08:48,  8.24s/it, gpt_loss=0.341, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1564/6434 [3:39:47<11:17:48,  8.35s/it, gpt_loss=0.341, loss_mean=0.353][A
+Train step of epoch 0:  24%|██▍       | 1564/6434 [3:39:57<11:17:48,  8.35s/it, gpt_loss=0.281, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1565/6434 [3:39:57<11:59:48,  8.87s/it, gpt_loss=0.281, loss_mean=0.346][A
+Train step of epoch 0:  24%|██▍       | 1565/6434 [3:40:05<11:59:48,  8.87s/it, gpt_loss=0.293, loss_mean=0.341][A
+Train step of epoch 0:  24%|██▍       | 1566/6434 [3:40:05<11:39:36,  8.62s/it, gpt_loss=0.293, loss_mean=0.341][A
+Train step of epoch 0:  24%|██▍       | 1566/6434 [3:40:13<11:39:36,  8.62s/it, gpt_loss=0.324, loss_mean=0.339][A
+Train step of epoch 0:  24%|██▍       | 1567/6434 [3:40:13<11:18:57,  8.37s/it, gpt_loss=0.324, loss_mean=0.339][A
+Train step of epoch 0:  24%|██▍       | 1567/6434 [3:40:21<11:18:57,  8.37s/it, gpt_loss=0.298, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▍       | 1568/6434 [3:40:21<11:13:24,  8.30s/it, gpt_loss=0.298, loss_mean=0.335][A
+Train step of epoch 0:  24%|██▍       | 1568/6434 [3:40:29<11:13:24,  8.30s/it, gpt_loss=0.328, loss_mean=0.334][A
+Train step of epoch 0:  24%|██▍       | 1569/6434 [3:40:29<11:13:09,  8.30s/it, gpt_loss=0.328, loss_mean=0.334][A
+[LID Router Debug] Step: 1570
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [4, 2, 5, 0, 5, 5, 9, 1, 0, 6]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  24%|██▍       | 1569/6434 [3:40:39<11:13:09,  8.30s/it, gpt_loss=0.301, loss_mean=0.331][A
+Train step of epoch 0:  24%|██▍       | 1570/6434 [3:40:39<11:43:05,  8.67s/it, gpt_loss=0.301, loss_mean=0.331][A
+Train step of epoch 0:  24%|██▍       | 1570/6434 [3:40:47<11:43:05,  8.67s/it, gpt_loss=0.305, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1571/6434 [3:40:47<11:47:11,  8.73s/it, gpt_loss=0.305, loss_mean=0.328][A
+Train step of epoch 0:  24%|██▍       | 1571/6434 [3:40:57<11:47:11,  8.73s/it, gpt_loss=0.3, loss_mean=0.326]  [A
+Train step of epoch 0:  24%|██▍       | 1572/6434 [3:40:57<12:00:29,  8.89s/it, gpt_loss=0.3, loss_mean=0.326][A
+Train step of epoch 0:  24%|██▍       | 1572/6434 [3:41:04<12:00:29,  8.89s/it, gpt_loss=0.334, loss_mean=0.326][A
+Train step of epoch 0:  24%|██▍       | 1573/6434 [3:41:04<11:31:59,  8.54s/it, gpt_loss=0.334, loss_mean=0.326][A
+Train step of epoch 0:  24%|██▍       | 1573/6434 [3:41:13<11:31:59,  8.54s/it, gpt_loss=0.317, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▍       | 1574/6434 [3:41:13<11:34:07,  8.57s/it, gpt_loss=0.317, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▍       | 1574/6434 [3:41:21<11:34:07,  8.57s/it, gpt_loss=0.312, loss_mean=0.324][A
+Train step of epoch 0:  24%|██▍       | 1575/6434 [3:41:21<11:13:10,  8.31s/it, gpt_loss=0.312, loss_mean=0.324][A
+Train step of epoch 0:  24%|██▍       | 1575/6434 [3:41:30<11:13:10,  8.31s/it, gpt_loss=0.33, loss_mean=0.325] [A
+Train step of epoch 0:  24%|██▍       | 1576/6434 [3:41:30<11:36:20,  8.60s/it, gpt_loss=0.33, loss_mean=0.325][A
+Train step of epoch 0:  24%|██▍       | 1576/6434 [3:41:37<11:36:20,  8.60s/it, gpt_loss=0.537, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▍       | 1577/6434 [3:41:37<11:07:13,  8.24s/it, gpt_loss=0.537, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▍       | 1577/6434 [3:41:48<11:07:13,  8.24s/it, gpt_loss=0.411, loss_mean=0.352][A
+Train step of epoch 0:  25%|██▍       | 1578/6434 [3:41:48<12:11:59,  9.04s/it, gpt_loss=0.411, loss_mean=0.352][A
+Train step of epoch 0:  25%|██▍       | 1578/6434 [3:41:56<12:11:59,  9.04s/it, gpt_loss=0.352, loss_mean=0.352][A
+Train step of epoch 0:  25%|██▍       | 1579/6434 [3:41:56<11:35:07,  8.59s/it, gpt_loss=0.352, loss_mean=0.352][A
+[LID Router Debug] Step: 1580
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 2, 5, 3, 8, 4, 0, 0, 4, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  25%|██▍       | 1579/6434 [3:42:05<11:35:07,  8.59s/it, gpt_loss=0.344, loss_mean=0.351][A
+Train step of epoch 0:  25%|██▍       | 1580/6434 [3:42:05<11:44:48,  8.71s/it, gpt_loss=0.344, loss_mean=0.351][A
+Train step of epoch 0:  25%|██▍       | 1580/6434 [3:42:13<11:44:48,  8.71s/it, gpt_loss=0.281, loss_mean=0.344][A
+Train step of epoch 0:  25%|██▍       | 1581/6434 [3:42:13<11:41:27,  8.67s/it, gpt_loss=0.281, loss_mean=0.344][A
+Train step of epoch 0:  25%|██▍       | 1581/6434 [3:42:22<11:41:27,  8.67s/it, gpt_loss=0.285, loss_mean=0.339][A
+Train step of epoch 0:  25%|██▍       | 1582/6434 [3:42:22<11:30:40,  8.54s/it, gpt_loss=0.285, loss_mean=0.339][A
+Train step of epoch 0:  25%|██▍       | 1582/6434 [3:42:30<11:30:40,  8.54s/it, gpt_loss=0.321, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1583/6434 [3:42:30<11:25:06,  8.47s/it, gpt_loss=0.321, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1583/6434 [3:42:39<11:25:06,  8.47s/it, gpt_loss=0.241, loss_mean=0.327][A
+Train step of epoch 0:  25%|██▍       | 1584/6434 [3:42:39<11:36:11,  8.61s/it, gpt_loss=0.241, loss_mean=0.327][A
+Train step of epoch 0:  25%|██▍       | 1584/6434 [3:42:47<11:36:11,  8.61s/it, gpt_loss=0.29, loss_mean=0.323] [A
+Train step of epoch 0:  25%|██▍       | 1585/6434 [3:42:47<11:34:01,  8.59s/it, gpt_loss=0.29, loss_mean=0.323][A
+Train step of epoch 0:  25%|██▍       | 1585/6434 [3:42:56<11:34:01,  8.59s/it, gpt_loss=0.304, loss_mean=0.322][A
+Train step of epoch 0:  25%|██▍       | 1586/6434 [3:42:56<11:43:10,  8.70s/it, gpt_loss=0.304, loss_mean=0.322][A
+Train step of epoch 0:  25%|██▍       | 1586/6434 [3:43:05<11:43:10,  8.70s/it, gpt_loss=0.394, loss_mean=0.329][A
+Train step of epoch 0:  25%|██▍       | 1587/6434 [3:43:05<11:38:16,  8.64s/it, gpt_loss=0.394, loss_mean=0.329][A
+Train step of epoch 0:  25%|██▍       | 1587/6434 [3:43:14<11:38:16,  8.64s/it, gpt_loss=0.285, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▍       | 1588/6434 [3:43:14<11:40:48,  8.68s/it, gpt_loss=0.285, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▍       | 1588/6434 [3:43:21<11:40:48,  8.68s/it, gpt_loss=0.488, loss_mean=0.341][A
+Train step of epoch 0:  25%|██▍       | 1589/6434 [3:43:21<11:14:02,  8.35s/it, gpt_loss=0.488, loss_mean=0.341][A
+[LID Router Debug] Step: 1590
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [5, 4, 2, 1, 5, 1, 3, 0, 9, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  25%|██▍       | 1589/6434 [3:43:30<11:14:02,  8.35s/it, gpt_loss=0.314, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▍       | 1590/6434 [3:43:30<11:23:00,  8.46s/it, gpt_loss=0.314, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▍       | 1590/6434 [3:43:38<11:23:00,  8.46s/it, gpt_loss=0.333, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1591/6434 [3:43:38<11:18:58,  8.41s/it, gpt_loss=0.333, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1591/6434 [3:43:47<11:18:58,  8.41s/it, gpt_loss=0.349, loss_mean=0.339][A
+Train step of epoch 0:  25%|██▍       | 1592/6434 [3:43:47<11:28:06,  8.53s/it, gpt_loss=0.349, loss_mean=0.339][A
+Train step of epoch 0:  25%|██▍       | 1592/6434 [3:43:56<11:28:06,  8.53s/it, gpt_loss=0.373, loss_mean=0.342][A
+Train step of epoch 0:  25%|██▍       | 1593/6434 [3:43:56<11:42:10,  8.70s/it, gpt_loss=0.373, loss_mean=0.342][A
+Train step of epoch 0:  25%|██▍       | 1593/6434 [3:44:05<11:42:10,  8.70s/it, gpt_loss=0.325, loss_mean=0.34] [A
+Train step of epoch 0:  25%|██▍       | 1594/6434 [3:44:05<11:38:59,  8.67s/it, gpt_loss=0.325, loss_mean=0.34][A
+Train step of epoch 0:  25%|██▍       | 1594/6434 [3:44:13<11:38:59,  8.67s/it, gpt_loss=0.304, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1595/6434 [3:44:13<11:37:02,  8.64s/it, gpt_loss=0.304, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▍       | 1595/6434 [3:44:21<11:37:02,  8.64s/it, gpt_loss=0.276, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▍       | 1596/6434 [3:44:21<11:23:29,  8.48s/it, gpt_loss=0.276, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▍       | 1596/6434 [3:44:29<11:23:29,  8.48s/it, gpt_loss=0.34, loss_mean=0.331] [A
+Train step of epoch 0:  25%|██▍       | 1597/6434 [3:44:29<11:04:54,  8.25s/it, gpt_loss=0.34, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▍       | 1597/6434 [3:44:37<11:04:54,  8.25s/it, gpt_loss=0.27, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▍       | 1598/6434 [3:44:37<11:02:41,  8.22s/it, gpt_loss=0.27, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▍       | 1598/6434 [3:44:46<11:02:41,  8.22s/it, gpt_loss=0.277, loss_mean=0.32][A
+Train step of epoch 0:  25%|██▍       | 1599/6434 [3:44:46<11:11:22,  8.33s/it, gpt_loss=0.277, loss_mean=0.32][A
+[LID Router Debug] Step: 1600
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [0, 9, 0, 0, 0, 5, 9, 6, 3, 3]
+Active Experts in Batch: {0, 3, 5, 6, 9}
+[2026-02-06 19:40:58,187] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[1.9734003988885815e-05, 1.9734003988885815e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 19:40:58,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=1600/global_step=800, RunningAvgSamplesPerSec=4.7559146323597865, CurrSamplesPerSec=4.900299882107494, MemAllocated=12.38GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  25%|██▍       | 1599/6434 [3:44:54<11:11:22,  8.33s/it, gpt_loss=0.314, loss_mean=0.32][A
+Train step of epoch 0:  25%|██▍       | 1600/6434 [3:44:54<10:57:36,  8.16s/it, gpt_loss=0.314, loss_mean=0.32][A
+Train step of epoch 0:  25%|██▍       | 1600/6434 [3:45:02<10:57:36,  8.16s/it, gpt_loss=0.307, loss_mean=0.319][A
+Train step of epoch 0:  25%|██▍       | 1601/6434 [3:45:02<10:48:30,  8.05s/it, gpt_loss=0.307, loss_mean=0.319][A
+Train step of epoch 0:  25%|██▍       | 1601/6434 [3:45:10<10:48:30,  8.05s/it, gpt_loss=0.304, loss_mean=0.317][A
+Train step of epoch 0:  25%|██▍       | 1602/6434 [3:45:10<10:56:00,  8.15s/it, gpt_loss=0.304, loss_mean=0.317][A
+Train step of epoch 0:  25%|██▍       | 1602/6434 [3:45:18<10:56:00,  8.15s/it, gpt_loss=0.275, loss_mean=0.313][A
+Train step of epoch 0:  25%|██▍       | 1603/6434 [3:45:18<10:58:20,  8.18s/it, gpt_loss=0.275, loss_mean=0.313][A
+Train step of epoch 0:  25%|██▍       | 1603/6434 [3:45:27<10:58:20,  8.18s/it, gpt_loss=0.38, loss_mean=0.32]  [A
+Train step of epoch 0:  25%|██▍       | 1604/6434 [3:45:27<11:20:10,  8.45s/it, gpt_loss=0.38, loss_mean=0.32][A
+Train step of epoch 0:  25%|██▍       | 1604/6434 [3:45:36<11:20:10,  8.45s/it, gpt_loss=0.376, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▍       | 1605/6434 [3:45:36<11:26:40,  8.53s/it, gpt_loss=0.376, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▍       | 1605/6434 [3:45:45<11:26:40,  8.53s/it, gpt_loss=0.388, loss_mean=0.332][A
+Train step of epoch 0:  25%|██▍       | 1606/6434 [3:45:45<11:32:20,  8.60s/it, gpt_loss=0.388, loss_mean=0.332][A
+Train step of epoch 0:  25%|██▍       | 1606/6434 [3:45:53<11:32:20,  8.60s/it, gpt_loss=0.318, loss_mean=0.33] [A
+Train step of epoch 0:  25%|██▍       | 1607/6434 [3:45:53<11:16:14,  8.41s/it, gpt_loss=0.318, loss_mean=0.33][A
+Train step of epoch 0:  25%|██▍       | 1607/6434 [3:46:01<11:16:14,  8.41s/it, gpt_loss=0.355, loss_mean=0.333][A
+Train step of epoch 0:  25%|██▍       | 1608/6434 [3:46:01<11:03:38,  8.25s/it, gpt_loss=0.355, loss_mean=0.333][A
+Train step of epoch 0:  25%|██▍       | 1608/6434 [3:46:09<11:03:38,  8.25s/it, gpt_loss=0.29, loss_mean=0.329] [A
+Train step of epoch 0:  25%|██▌       | 1609/6434 [3:46:09<11:08:54,  8.32s/it, gpt_loss=0.29, loss_mean=0.329][A
+[LID Router Debug] Step: 1610
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [3, 3, 4, 1, 5, 2, 1, 5, 4, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+Train step of epoch 0:  25%|██▌       | 1609/6434 [3:46:18<11:08:54,  8.32s/it, gpt_loss=0.346, loss_mean=0.33][A
+Train step of epoch 0:  25%|██▌       | 1610/6434 [3:46:18<11:19:28,  8.45s/it, gpt_loss=0.346, loss_mean=0.33][A
+Train step of epoch 0:  25%|██▌       | 1610/6434 [3:46:27<11:19:28,  8.45s/it, gpt_loss=0.267, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▌       | 1611/6434 [3:46:27<11:27:58,  8.56s/it, gpt_loss=0.267, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▌       | 1611/6434 [3:46:35<11:27:58,  8.56s/it, gpt_loss=0.464, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1612/6434 [3:46:35<11:18:36,  8.44s/it, gpt_loss=0.464, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1612/6434 [3:46:43<11:18:36,  8.44s/it, gpt_loss=0.329, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▌       | 1613/6434 [3:46:43<11:16:15,  8.42s/it, gpt_loss=0.329, loss_mean=0.337][A
+Train step of epoch 0:  25%|██▌       | 1613/6434 [3:46:52<11:16:15,  8.42s/it, gpt_loss=0.221, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▌       | 1614/6434 [3:46:52<11:19:49,  8.46s/it, gpt_loss=0.221, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▌       | 1614/6434 [3:47:01<11:19:49,  8.46s/it, gpt_loss=0.395, loss_mean=0.332][A
+Train step of epoch 0:  25%|██▌       | 1615/6434 [3:47:01<11:50:02,  8.84s/it, gpt_loss=0.395, loss_mean=0.332][A
+Train step of epoch 0:  25%|██▌       | 1615/6434 [3:47:09<11:50:02,  8.84s/it, gpt_loss=0.308, loss_mean=0.33] [A
+Train step of epoch 0:  25%|██▌       | 1616/6434 [3:47:09<11:30:32,  8.60s/it, gpt_loss=0.308, loss_mean=0.33][A
+Train step of epoch 0:  25%|██▌       | 1616/6434 [3:47:17<11:30:32,  8.60s/it, gpt_loss=0.409, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1617/6434 [3:47:17<11:10:59,  8.36s/it, gpt_loss=0.409, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1617/6434 [3:47:26<11:10:59,  8.36s/it, gpt_loss=0.342, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1618/6434 [3:47:26<11:10:22,  8.35s/it, gpt_loss=0.342, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1618/6434 [3:47:34<11:10:22,  8.35s/it, gpt_loss=0.405, loss_mean=0.345][A
+Train step of epoch 0:  25%|██▌       | 1619/6434 [3:47:34<11:20:21,  8.48s/it, gpt_loss=0.405, loss_mean=0.345][A
+[LID Router Debug] Step: 1620
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [9, 9, 5, 6, 2, 1, 3, 2, 4, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  25%|██▌       | 1619/6434 [3:47:43<11:20:21,  8.48s/it, gpt_loss=0.382, loss_mean=0.349][A
+Train step of epoch 0:  25%|██▌       | 1620/6434 [3:47:43<11:17:53,  8.45s/it, gpt_loss=0.382, loss_mean=0.349][A
+Train step of epoch 0:  25%|██▌       | 1620/6434 [3:47:52<11:17:53,  8.45s/it, gpt_loss=0.43, loss_mean=0.357] [A
+Train step of epoch 0:  25%|██▌       | 1621/6434 [3:47:52<11:31:02,  8.61s/it, gpt_loss=0.43, loss_mean=0.357][A
+Train step of epoch 0:  25%|██▌       | 1621/6434 [3:47:59<11:31:02,  8.61s/it, gpt_loss=0.395, loss_mean=0.361][A
+Train step of epoch 0:  25%|██▌       | 1622/6434 [3:47:59<11:01:43,  8.25s/it, gpt_loss=0.395, loss_mean=0.361][A
+Train step of epoch 0:  25%|██▌       | 1622/6434 [3:48:08<11:01:43,  8.25s/it, gpt_loss=0.375, loss_mean=0.362][A
+Train step of epoch 0:  25%|██▌       | 1623/6434 [3:48:08<11:13:37,  8.40s/it, gpt_loss=0.375, loss_mean=0.362][A
+Train step of epoch 0:  25%|██▌       | 1623/6434 [3:48:18<11:13:37,  8.40s/it, gpt_loss=0.388, loss_mean=0.365][A
+Train step of epoch 0:  25%|██▌       | 1624/6434 [3:48:18<11:48:58,  8.84s/it, gpt_loss=0.388, loss_mean=0.365][A
+Train step of epoch 0:  25%|██▌       | 1624/6434 [3:48:27<11:48:58,  8.84s/it, gpt_loss=0.306, loss_mean=0.359][A
+Train step of epoch 0:  25%|██▌       | 1625/6434 [3:48:27<11:51:18,  8.87s/it, gpt_loss=0.306, loss_mean=0.359][A
+Train step of epoch 0:  25%|██▌       | 1625/6434 [3:48:35<11:51:18,  8.87s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  25%|██▌       | 1626/6434 [3:48:35<11:35:48,  8.68s/it, gpt_loss=0.329, loss_mean=0.356][A
+Train step of epoch 0:  25%|██▌       | 1626/6434 [3:48:44<11:35:48,  8.68s/it, gpt_loss=0.295, loss_mean=0.35] [A
+Train step of epoch 0:  25%|██▌       | 1627/6434 [3:48:44<11:34:06,  8.66s/it, gpt_loss=0.295, loss_mean=0.35][A
+Train step of epoch 0:  25%|██▌       | 1627/6434 [3:48:53<11:34:06,  8.66s/it, gpt_loss=0.296, loss_mean=0.344][A
+Train step of epoch 0:  25%|██▌       | 1628/6434 [3:48:53<11:45:18,  8.81s/it, gpt_loss=0.296, loss_mean=0.344][A
+Train step of epoch 0:  25%|██▌       | 1628/6434 [3:49:01<11:45:18,  8.81s/it, gpt_loss=0.33, loss_mean=0.343] [A
+Train step of epoch 0:  25%|██▌       | 1629/6434 [3:49:01<11:32:33,  8.65s/it, gpt_loss=0.33, loss_mean=0.343][A
+[LID Router Debug] Step: 1630
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [9, 4, 5, 1, 9, 1, 5, 5, 9, 9]
+Active Experts in Batch: {9, 4, 5, 1}
+
+Train step of epoch 0:  25%|██▌       | 1629/6434 [3:49:09<11:32:33,  8.65s/it, gpt_loss=0.374, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▌       | 1630/6434 [3:49:09<11:18:02,  8.47s/it, gpt_loss=0.374, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▌       | 1630/6434 [3:49:18<11:18:02,  8.47s/it, gpt_loss=0.387, loss_mean=0.35] [A
+Train step of epoch 0:  25%|██▌       | 1631/6434 [3:49:18<11:18:12,  8.47s/it, gpt_loss=0.387, loss_mean=0.35][A
+Train step of epoch 0:  25%|██▌       | 1631/6434 [3:49:25<11:18:12,  8.47s/it, gpt_loss=0.32, loss_mean=0.347][A
+Train step of epoch 0:  25%|██▌       | 1632/6434 [3:49:25<10:54:40,  8.18s/it, gpt_loss=0.32, loss_mean=0.347][A
+Train step of epoch 0:  25%|██▌       | 1632/6434 [3:49:34<10:54:40,  8.18s/it, gpt_loss=0.291, loss_mean=0.341][A
+Train step of epoch 0:  25%|██▌       | 1633/6434 [3:49:34<11:10:05,  8.37s/it, gpt_loss=0.291, loss_mean=0.341][A
+Train step of epoch 0:  25%|██▌       | 1633/6434 [3:49:42<11:10:05,  8.37s/it, gpt_loss=0.391, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▌       | 1634/6434 [3:49:42<11:06:35,  8.33s/it, gpt_loss=0.391, loss_mean=0.346][A
+Train step of epoch 0:  25%|██▌       | 1634/6434 [3:49:50<11:06:35,  8.33s/it, gpt_loss=0.259, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1635/6434 [3:49:50<10:48:48,  8.11s/it, gpt_loss=0.259, loss_mean=0.338][A
+Train step of epoch 0:  25%|██▌       | 1635/6434 [3:49:58<10:48:48,  8.11s/it, gpt_loss=0.273, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▌       | 1636/6434 [3:49:58<10:49:45,  8.13s/it, gpt_loss=0.273, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▌       | 1636/6434 [3:50:07<10:49:45,  8.13s/it, gpt_loss=0.266, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▌       | 1637/6434 [3:50:07<11:08:03,  8.36s/it, gpt_loss=0.266, loss_mean=0.325][A
+Train step of epoch 0:  25%|██▌       | 1637/6434 [3:50:16<11:08:03,  8.36s/it, gpt_loss=0.312, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▌       | 1638/6434 [3:50:16<11:27:46,  8.60s/it, gpt_loss=0.312, loss_mean=0.324][A
+Train step of epoch 0:  25%|██▌       | 1638/6434 [3:50:24<11:27:46,  8.60s/it, gpt_loss=0.427, loss_mean=0.334][A
+Train step of epoch 0:  25%|██▌       | 1639/6434 [3:50:24<11:17:11,  8.47s/it, gpt_loss=0.427, loss_mean=0.334][A
+[LID Router Debug] Step: 1640
+Batch Size: 10
+Audio Batch Size: 150
+LID Assignments: [5, 3, 6, 2, 3, 4, 2, 2, 3, 5]
+Active Experts in Batch: {2, 3, 4, 5, 6}
+
+Train step of epoch 0:  25%|██▌       | 1639/6434 [3:50:33<11:17:11,  8.47s/it, gpt_loss=0.304, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▌       | 1640/6434 [3:50:33<11:36:01,  8.71s/it, gpt_loss=0.304, loss_mean=0.331][A
+Train step of epoch 0:  25%|██▌       | 1640/6434 [3:50:42<11:36:01,  8.71s/it, gpt_loss=0.335, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▌       | 1641/6434 [3:50:42<11:25:40,  8.58s/it, gpt_loss=0.335, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▌       | 1641/6434 [3:50:50<11:25:40,  8.58s/it, gpt_loss=0.379, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1642/6434 [3:50:50<11:31:04,  8.65s/it, gpt_loss=0.379, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1642/6434 [3:50:59<11:31:04,  8.65s/it, gpt_loss=0.33, loss_mean=0.335] [A
+Train step of epoch 0:  26%|██▌       | 1643/6434 [3:50:59<11:30:07,  8.64s/it, gpt_loss=0.33, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1643/6434 [3:51:06<11:30:07,  8.64s/it, gpt_loss=0.326, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1644/6434 [3:51:06<10:51:58,  8.17s/it, gpt_loss=0.326, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1644/6434 [3:51:14<10:51:58,  8.17s/it, gpt_loss=0.334, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1645/6434 [3:51:14<10:55:34,  8.21s/it, gpt_loss=0.334, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1645/6434 [3:51:21<10:55:34,  8.21s/it, gpt_loss=0.32, loss_mean=0.333] [A
+Train step of epoch 0:  26%|██▌       | 1646/6434 [3:51:21<10:25:31,  7.84s/it, gpt_loss=0.32, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▌       | 1646/6434 [3:51:30<10:25:31,  7.84s/it, gpt_loss=0.417, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1647/6434 [3:51:30<10:49:18,  8.14s/it, gpt_loss=0.417, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1647/6434 [3:51:38<10:49:18,  8.14s/it, gpt_loss=0.31, loss_mean=0.338] [A
+Train step of epoch 0:  26%|██▌       | 1648/6434 [3:51:38<10:35:38,  7.97s/it, gpt_loss=0.31, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1648/6434 [3:51:46<10:35:38,  7.97s/it, gpt_loss=0.31, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1649/6434 [3:51:46<10:34:06,  7.95s/it, gpt_loss=0.31, loss_mean=0.335][A
+[LID Router Debug] Step: 1650
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [9, 6, 3, 0, 2, 9, 3, 5, 4, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  26%|██▌       | 1649/6434 [3:51:54<10:34:06,  7.95s/it, gpt_loss=0.387, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1650/6434 [3:51:54<10:40:33,  8.03s/it, gpt_loss=0.387, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1650/6434 [3:52:03<10:40:33,  8.03s/it, gpt_loss=0.332, loss_mean=0.34] [A
+Train step of epoch 0:  26%|██▌       | 1651/6434 [3:52:03<11:09:48,  8.40s/it, gpt_loss=0.332, loss_mean=0.34][A
+Train step of epoch 0:  26%|██▌       | 1651/6434 [3:52:11<11:09:48,  8.40s/it, gpt_loss=0.323, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1652/6434 [3:52:11<11:03:09,  8.32s/it, gpt_loss=0.323, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1652/6434 [3:52:19<11:03:09,  8.32s/it, gpt_loss=0.314, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1653/6434 [3:52:19<10:48:15,  8.14s/it, gpt_loss=0.314, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1653/6434 [3:52:27<10:48:15,  8.14s/it, gpt_loss=0.369, loss_mean=0.339][A
+Train step of epoch 0:  26%|██▌       | 1654/6434 [3:52:27<10:47:28,  8.13s/it, gpt_loss=0.369, loss_mean=0.339][A
+Train step of epoch 0:  26%|██▌       | 1654/6434 [3:52:35<10:47:28,  8.13s/it, gpt_loss=0.48, loss_mean=0.353] [A
+Train step of epoch 0:  26%|██▌       | 1655/6434 [3:52:35<10:46:11,  8.11s/it, gpt_loss=0.48, loss_mean=0.353][A
+Train step of epoch 0:  26%|██▌       | 1655/6434 [3:52:43<10:46:11,  8.11s/it, gpt_loss=0.323, loss_mean=0.35][A
+Train step of epoch 0:  26%|██▌       | 1656/6434 [3:52:43<10:43:14,  8.08s/it, gpt_loss=0.323, loss_mean=0.35][A
+Train step of epoch 0:  26%|██▌       | 1656/6434 [3:52:52<10:43:14,  8.08s/it, gpt_loss=0.306, loss_mean=0.346][A
+Train step of epoch 0:  26%|██▌       | 1657/6434 [3:52:52<10:55:08,  8.23s/it, gpt_loss=0.306, loss_mean=0.346][A
+Train step of epoch 0:  26%|██▌       | 1657/6434 [3:53:01<10:55:08,  8.23s/it, gpt_loss=0.299, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1658/6434 [3:53:01<11:06:11,  8.37s/it, gpt_loss=0.299, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1658/6434 [3:53:09<11:06:11,  8.37s/it, gpt_loss=0.36, loss_mean=0.343] [A
+Train step of epoch 0:  26%|██▌       | 1659/6434 [3:53:09<11:08:18,  8.40s/it, gpt_loss=0.36, loss_mean=0.343][A
+[LID Router Debug] Step: 1660
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [4, 1, 0, 5, 6, 9, 1, 3, 6, 0]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  26%|██▌       | 1659/6434 [3:53:17<11:08:18,  8.40s/it, gpt_loss=0.28, loss_mean=0.337][A
+Train step of epoch 0:  26%|██▌       | 1660/6434 [3:53:17<10:58:56,  8.28s/it, gpt_loss=0.28, loss_mean=0.337][A
+Train step of epoch 0:  26%|██▌       | 1660/6434 [3:53:25<10:58:56,  8.28s/it, gpt_loss=0.4, loss_mean=0.343] [A
+Train step of epoch 0:  26%|██▌       | 1661/6434 [3:53:25<10:57:01,  8.26s/it, gpt_loss=0.4, loss_mean=0.343][A
+Train step of epoch 0:  26%|██▌       | 1661/6434 [3:53:33<10:57:01,  8.26s/it, gpt_loss=0.259, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1662/6434 [3:53:33<10:52:39,  8.21s/it, gpt_loss=0.259, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1662/6434 [3:53:42<10:52:39,  8.21s/it, gpt_loss=0.35, loss_mean=0.336] [A
+Train step of epoch 0:  26%|██▌       | 1663/6434 [3:53:42<10:56:39,  8.26s/it, gpt_loss=0.35, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1663/6434 [3:53:50<10:56:39,  8.26s/it, gpt_loss=0.268, loss_mean=0.329][A
+Train step of epoch 0:  26%|██▌       | 1664/6434 [3:53:50<10:58:01,  8.28s/it, gpt_loss=0.268, loss_mean=0.329][A
+Train step of epoch 0:  26%|██▌       | 1664/6434 [3:53:57<10:58:01,  8.28s/it, gpt_loss=0.317, loss_mean=0.328][A
+Train step of epoch 0:  26%|██▌       | 1665/6434 [3:53:57<10:32:41,  7.96s/it, gpt_loss=0.317, loss_mean=0.328][A
+Train step of epoch 0:  26%|██▌       | 1665/6434 [3:54:05<10:32:41,  7.96s/it, gpt_loss=0.315, loss_mean=0.327][A
+Train step of epoch 0:  26%|██▌       | 1666/6434 [3:54:05<10:30:09,  7.93s/it, gpt_loss=0.315, loss_mean=0.327][A
+Train step of epoch 0:  26%|██▌       | 1666/6434 [3:54:14<10:30:09,  7.93s/it, gpt_loss=0.459, loss_mean=0.34] [A
+Train step of epoch 0:  26%|██▌       | 1667/6434 [3:54:14<10:47:43,  8.15s/it, gpt_loss=0.459, loss_mean=0.34][A
+Train step of epoch 0:  26%|██▌       | 1667/6434 [3:54:23<10:47:43,  8.15s/it, gpt_loss=0.305, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1668/6434 [3:54:23<11:04:27,  8.36s/it, gpt_loss=0.305, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1668/6434 [3:54:30<11:04:27,  8.36s/it, gpt_loss=0.316, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1669/6434 [3:54:30<10:49:39,  8.18s/it, gpt_loss=0.316, loss_mean=0.334][A
+[LID Router Debug] Step: 1670
+Batch Size: 10
+Audio Batch Size: 171
+LID Assignments: [2, 9, 4, 4, 8, 8, 9, 9, 3, 0]
+Active Experts in Batch: {0, 2, 3, 4, 8, 9}
+
+Train step of epoch 0:  26%|██▌       | 1669/6434 [3:54:40<10:49:39,  8.18s/it, gpt_loss=0.35, loss_mean=0.336] [A
+Train step of epoch 0:  26%|██▌       | 1670/6434 [3:54:40<11:35:18,  8.76s/it, gpt_loss=0.35, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1670/6434 [3:54:49<11:35:18,  8.76s/it, gpt_loss=0.363, loss_mean=0.339][A
+Train step of epoch 0:  26%|██▌       | 1671/6434 [3:54:49<11:36:43,  8.78s/it, gpt_loss=0.363, loss_mean=0.339][A
+Train step of epoch 0:  26%|██▌       | 1671/6434 [3:54:58<11:36:43,  8.78s/it, gpt_loss=0.283, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▌       | 1672/6434 [3:54:58<11:27:58,  8.67s/it, gpt_loss=0.283, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▌       | 1672/6434 [3:55:05<11:27:58,  8.67s/it, gpt_loss=0.296, loss_mean=0.329][A
+Train step of epoch 0:  26%|██▌       | 1673/6434 [3:55:05<11:01:19,  8.33s/it, gpt_loss=0.296, loss_mean=0.329][A
+Train step of epoch 0:  26%|██▌       | 1673/6434 [3:55:13<11:01:19,  8.33s/it, gpt_loss=0.419, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1674/6434 [3:55:13<10:49:01,  8.18s/it, gpt_loss=0.419, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1674/6434 [3:55:20<10:49:01,  8.18s/it, gpt_loss=0.31, loss_mean=0.336] [A
+Train step of epoch 0:  26%|██▌       | 1675/6434 [3:55:20<10:22:23,  7.85s/it, gpt_loss=0.31, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1675/6434 [3:55:28<10:22:23,  7.85s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1676/6434 [3:55:28<10:28:50,  7.93s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▌       | 1676/6434 [3:55:36<10:28:50,  7.93s/it, gpt_loss=0.3, loss_mean=0.332]  [A
+Train step of epoch 0:  26%|██▌       | 1677/6434 [3:55:36<10:13:56,  7.74s/it, gpt_loss=0.3, loss_mean=0.332][A
+Train step of epoch 0:  26%|██▌       | 1677/6434 [3:55:44<10:13:56,  7.74s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▌       | 1678/6434 [3:55:44<10:23:51,  7.87s/it, gpt_loss=0.344, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▌       | 1678/6434 [3:55:52<10:23:51,  7.87s/it, gpt_loss=0.401, loss_mean=0.34] [A
+Train step of epoch 0:  26%|██▌       | 1679/6434 [3:55:52<10:36:48,  8.04s/it, gpt_loss=0.401, loss_mean=0.34][A
+[LID Router Debug] Step: 1680
+Batch Size: 10
+Audio Batch Size: 85
+LID Assignments: [1, 6, 0, 2, 0, 9, 4, 2, 4, 1]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+Train step of epoch 0:  26%|██▌       | 1679/6434 [3:56:00<10:36:48,  8.04s/it, gpt_loss=0.354, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1680/6434 [3:56:00<10:29:23,  7.94s/it, gpt_loss=0.354, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1680/6434 [3:56:07<10:29:23,  7.94s/it, gpt_loss=0.287, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1681/6434 [3:56:07<10:15:39,  7.77s/it, gpt_loss=0.287, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1681/6434 [3:56:16<10:15:39,  7.77s/it, gpt_loss=0.343, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1682/6434 [3:56:16<10:33:28,  8.00s/it, gpt_loss=0.343, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1682/6434 [3:56:24<10:33:28,  8.00s/it, gpt_loss=0.316, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1683/6434 [3:56:24<10:44:15,  8.14s/it, gpt_loss=0.316, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▌       | 1683/6434 [3:56:33<10:44:15,  8.14s/it, gpt_loss=0.251, loss_mean=0.326][A
+Train step of epoch 0:  26%|██▌       | 1684/6434 [3:56:33<10:48:26,  8.19s/it, gpt_loss=0.251, loss_mean=0.326][A
+Train step of epoch 0:  26%|██▌       | 1684/6434 [3:56:42<10:48:26,  8.19s/it, gpt_loss=0.43, loss_mean=0.336] [A
+Train step of epoch 0:  26%|██▌       | 1685/6434 [3:56:42<11:14:19,  8.52s/it, gpt_loss=0.43, loss_mean=0.336][A
+Train step of epoch 0:  26%|██▌       | 1685/6434 [3:56:50<11:14:19,  8.52s/it, gpt_loss=0.383, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1686/6434 [3:56:50<11:14:29,  8.52s/it, gpt_loss=0.383, loss_mean=0.341][A
+Train step of epoch 0:  26%|██▌       | 1686/6434 [3:56:59<11:14:29,  8.52s/it, gpt_loss=0.315, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1687/6434 [3:56:59<11:10:57,  8.48s/it, gpt_loss=0.315, loss_mean=0.338][A
+Train step of epoch 0:  26%|██▌       | 1687/6434 [3:57:07<11:10:57,  8.48s/it, gpt_loss=0.355, loss_mean=0.34] [A
+Train step of epoch 0:  26%|██▌       | 1688/6434 [3:57:07<10:53:56,  8.27s/it, gpt_loss=0.355, loss_mean=0.34][A
+Train step of epoch 0:  26%|██▌       | 1688/6434 [3:57:15<10:53:56,  8.27s/it, gpt_loss=0.293, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▋       | 1689/6434 [3:57:15<11:09:27,  8.47s/it, gpt_loss=0.293, loss_mean=0.335][A
+[LID Router Debug] Step: 1690
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 4, 9, 4, 4, 0, 9, 6, 0, 9]
+Active Experts in Batch: {0, 9, 4, 6}
+
+Train step of epoch 0:  26%|██▋       | 1689/6434 [3:57:23<11:09:27,  8.47s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▋       | 1690/6434 [3:57:23<10:48:24,  8.20s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  26%|██▋       | 1690/6434 [3:57:31<10:48:24,  8.20s/it, gpt_loss=0.283, loss_mean=0.33] [A
+Train step of epoch 0:  26%|██▋       | 1691/6434 [3:57:31<10:43:31,  8.14s/it, gpt_loss=0.283, loss_mean=0.33][A
+Train step of epoch 0:  26%|██▋       | 1691/6434 [3:57:39<10:43:31,  8.14s/it, gpt_loss=0.344, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▋       | 1692/6434 [3:57:39<10:46:37,  8.18s/it, gpt_loss=0.344, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▋       | 1692/6434 [3:57:47<10:46:37,  8.18s/it, gpt_loss=0.327, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▋       | 1693/6434 [3:57:47<10:28:51,  7.96s/it, gpt_loss=0.327, loss_mean=0.331][A
+Train step of epoch 0:  26%|██▋       | 1693/6434 [3:57:55<10:28:51,  7.96s/it, gpt_loss=0.359, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▋       | 1694/6434 [3:57:55<10:28:43,  7.96s/it, gpt_loss=0.359, loss_mean=0.334][A
+Train step of epoch 0:  26%|██▋       | 1694/6434 [3:58:03<10:28:43,  7.96s/it, gpt_loss=0.323, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▋       | 1695/6434 [3:58:03<10:38:16,  8.08s/it, gpt_loss=0.323, loss_mean=0.333][A
+Train step of epoch 0:  26%|██▋       | 1695/6434 [3:58:11<10:38:16,  8.08s/it, gpt_loss=0.307, loss_mean=0.33] [A
+Train step of epoch 0:  26%|██▋       | 1696/6434 [3:58:11<10:39:17,  8.10s/it, gpt_loss=0.307, loss_mean=0.33][A
+Train step of epoch 0:  26%|██▋       | 1696/6434 [3:58:20<10:39:17,  8.10s/it, gpt_loss=0.466, loss_mean=0.344][A
+Train step of epoch 0:  26%|██▋       | 1697/6434 [3:58:20<10:49:31,  8.23s/it, gpt_loss=0.466, loss_mean=0.344][A
+Train step of epoch 0:  26%|██▋       | 1697/6434 [3:58:28<10:49:31,  8.23s/it, gpt_loss=0.378, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1698/6434 [3:58:28<10:53:30,  8.28s/it, gpt_loss=0.378, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1698/6434 [3:58:36<10:53:30,  8.28s/it, gpt_loss=0.381, loss_mean=0.351][A
+Train step of epoch 0:  26%|██▋       | 1699/6434 [3:58:36<10:39:19,  8.10s/it, gpt_loss=0.381, loss_mean=0.351][A
+[LID Router Debug] Step: 1700
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [0, 3, 3, 2, 3, 4, 1, 1, 3, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  26%|██▋       | 1699/6434 [3:58:45<10:39:19,  8.10s/it, gpt_loss=0.269, loss_mean=0.342][A
+Train step of epoch 0:  26%|██▋       | 1700/6434 [3:58:45<11:01:21,  8.38s/it, gpt_loss=0.269, loss_mean=0.342][A
+Train step of epoch 0:  26%|██▋       | 1700/6434 [3:58:55<11:01:21,  8.38s/it, gpt_loss=0.353, loss_mean=0.343][A
+Train step of epoch 0:  26%|██▋       | 1701/6434 [3:58:55<11:31:46,  8.77s/it, gpt_loss=0.353, loss_mean=0.343][A
+Train step of epoch 0:  26%|██▋       | 1701/6434 [3:59:02<11:31:46,  8.77s/it, gpt_loss=0.376, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1702/6434 [3:59:02<11:08:16,  8.47s/it, gpt_loss=0.376, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1702/6434 [3:59:10<11:08:16,  8.47s/it, gpt_loss=0.355, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1703/6434 [3:59:10<10:49:37,  8.24s/it, gpt_loss=0.355, loss_mean=0.347][A
+Train step of epoch 0:  26%|██▋       | 1703/6434 [3:59:18<10:49:37,  8.24s/it, gpt_loss=0.298, loss_mean=0.342][A
+Train step of epoch 0:  26%|██▋       | 1704/6434 [3:59:18<10:34:10,  8.04s/it, gpt_loss=0.298, loss_mean=0.342][A
+Train step of epoch 0:  26%|██▋       | 1704/6434 [3:59:25<10:34:10,  8.04s/it, gpt_loss=0.371, loss_mean=0.345][A
+Train step of epoch 0:  26%|██▋       | 1705/6434 [3:59:25<10:26:07,  7.94s/it, gpt_loss=0.371, loss_mean=0.345][A
+Train step of epoch 0:  26%|██▋       | 1705/6434 [3:59:33<10:26:07,  7.94s/it, gpt_loss=0.281, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1706/6434 [3:59:33<10:27:25,  7.96s/it, gpt_loss=0.281, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1706/6434 [3:59:41<10:27:25,  7.96s/it, gpt_loss=0.296, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1707/6434 [3:59:41<10:25:28,  7.94s/it, gpt_loss=0.296, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1707/6434 [3:59:49<10:25:28,  7.94s/it, gpt_loss=0.34, loss_mean=0.335] [A
+Train step of epoch 0:  27%|██▋       | 1708/6434 [3:59:49<10:28:10,  7.98s/it, gpt_loss=0.34, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1708/6434 [3:59:57<10:28:10,  7.98s/it, gpt_loss=0.302, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1709/6434 [3:59:57<10:11:42,  7.77s/it, gpt_loss=0.302, loss_mean=0.332][A
+[LID Router Debug] Step: 1710
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [9, 2, 9, 2, 1, 4, 2, 5, 2, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  27%|██▋       | 1709/6434 [4:00:04<10:11:42,  7.77s/it, gpt_loss=0.355, loss_mean=0.334][A
+Train step of epoch 0:  27%|██▋       | 1710/6434 [4:00:04<10:01:46,  7.64s/it, gpt_loss=0.355, loss_mean=0.334][A
+Train step of epoch 0:  27%|██▋       | 1710/6434 [4:00:13<10:01:46,  7.64s/it, gpt_loss=0.348, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1711/6434 [4:00:13<10:45:01,  8.19s/it, gpt_loss=0.348, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1711/6434 [4:00:23<10:45:01,  8.19s/it, gpt_loss=0.331, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1712/6434 [4:00:23<11:16:36,  8.60s/it, gpt_loss=0.331, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1712/6434 [4:00:32<11:16:36,  8.60s/it, gpt_loss=0.388, loss_mean=0.34] [A
+Train step of epoch 0:  27%|██▋       | 1713/6434 [4:00:32<11:30:57,  8.78s/it, gpt_loss=0.388, loss_mean=0.34][A
+Train step of epoch 0:  27%|██▋       | 1713/6434 [4:00:41<11:30:57,  8.78s/it, gpt_loss=0.384, loss_mean=0.345][A
+Train step of epoch 0:  27%|██▋       | 1714/6434 [4:00:41<11:25:29,  8.71s/it, gpt_loss=0.384, loss_mean=0.345][A
+Train step of epoch 0:  27%|██▋       | 1714/6434 [4:00:49<11:25:29,  8.71s/it, gpt_loss=0.29, loss_mean=0.339] [A
+Train step of epoch 0:  27%|██▋       | 1715/6434 [4:00:49<11:04:41,  8.45s/it, gpt_loss=0.29, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1715/6434 [4:00:56<11:04:41,  8.45s/it, gpt_loss=0.31, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1716/6434 [4:00:56<10:34:08,  8.06s/it, gpt_loss=0.31, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1716/6434 [4:01:04<10:34:08,  8.06s/it, gpt_loss=0.295, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1717/6434 [4:01:04<10:48:30,  8.25s/it, gpt_loss=0.295, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1717/6434 [4:01:14<10:48:30,  8.25s/it, gpt_loss=0.289, loss_mean=0.328][A
+Train step of epoch 0:  27%|██▋       | 1718/6434 [4:01:14<11:16:56,  8.61s/it, gpt_loss=0.289, loss_mean=0.328][A
+Train step of epoch 0:  27%|██▋       | 1718/6434 [4:01:22<11:16:56,  8.61s/it, gpt_loss=0.391, loss_mean=0.334][A
+Train step of epoch 0:  27%|██▋       | 1719/6434 [4:01:22<10:58:11,  8.38s/it, gpt_loss=0.391, loss_mean=0.334][A
+[LID Router Debug] Step: 1720
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [4, 2, 6, 9, 0, 9, 0, 4, 0, 5]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  27%|██▋       | 1719/6434 [4:01:30<10:58:11,  8.38s/it, gpt_loss=0.326, loss_mean=0.333][A
+Train step of epoch 0:  27%|██▋       | 1720/6434 [4:01:30<10:52:23,  8.30s/it, gpt_loss=0.326, loss_mean=0.333][A
+Train step of epoch 0:  27%|██▋       | 1720/6434 [4:01:38<10:52:23,  8.30s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1721/6434 [4:01:38<10:44:46,  8.21s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1721/6434 [4:01:46<10:44:46,  8.21s/it, gpt_loss=0.371, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1722/6434 [4:01:46<10:57:02,  8.37s/it, gpt_loss=0.371, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1722/6434 [4:01:55<10:57:02,  8.37s/it, gpt_loss=0.294, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1723/6434 [4:01:55<10:58:38,  8.39s/it, gpt_loss=0.294, loss_mean=0.332][A
+Train step of epoch 0:  27%|██▋       | 1723/6434 [4:02:05<10:58:38,  8.39s/it, gpt_loss=0.357, loss_mean=0.334][A
+Train step of epoch 0:  27%|██▋       | 1724/6434 [4:02:05<11:42:57,  8.95s/it, gpt_loss=0.357, loss_mean=0.334][A
+Train step of epoch 0:  27%|██▋       | 1724/6434 [4:02:15<11:42:57,  8.95s/it, gpt_loss=0.264, loss_mean=0.327][A
+Train step of epoch 0:  27%|██▋       | 1725/6434 [4:02:15<11:55:47,  9.12s/it, gpt_loss=0.264, loss_mean=0.327][A
+Train step of epoch 0:  27%|██▋       | 1725/6434 [4:02:23<11:55:47,  9.12s/it, gpt_loss=0.315, loss_mean=0.326][A
+Train step of epoch 0:  27%|██▋       | 1726/6434 [4:02:23<11:34:32,  8.85s/it, gpt_loss=0.315, loss_mean=0.326][A
+Train step of epoch 0:  27%|██▋       | 1726/6434 [4:02:30<11:34:32,  8.85s/it, gpt_loss=0.328, loss_mean=0.326][A
+Train step of epoch 0:  27%|██▋       | 1727/6434 [4:02:30<10:52:57,  8.32s/it, gpt_loss=0.328, loss_mean=0.326][A
+Train step of epoch 0:  27%|██▋       | 1727/6434 [4:02:40<10:52:57,  8.32s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  27%|██▋       | 1728/6434 [4:02:40<11:29:22,  8.79s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  27%|██▋       | 1728/6434 [4:02:48<11:29:22,  8.79s/it, gpt_loss=0.348, loss_mean=0.329][A
+Train step of epoch 0:  27%|██▋       | 1729/6434 [4:02:48<11:03:17,  8.46s/it, gpt_loss=0.348, loss_mean=0.329][A
+[LID Router Debug] Step: 1730
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [6, 0, 4, 2, 1, 3, 1, 2, 9, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  27%|██▋       | 1729/6434 [4:02:57<11:03:17,  8.46s/it, gpt_loss=0.233, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1730/6434 [4:02:57<11:32:36,  8.83s/it, gpt_loss=0.233, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1730/6434 [4:03:05<11:32:36,  8.83s/it, gpt_loss=0.325, loss_mean=0.32] [A
+Train step of epoch 0:  27%|██▋       | 1731/6434 [4:03:05<11:05:41,  8.49s/it, gpt_loss=0.325, loss_mean=0.32][A
+Train step of epoch 0:  27%|██▋       | 1731/6434 [4:03:13<11:05:41,  8.49s/it, gpt_loss=0.271, loss_mean=0.315][A
+Train step of epoch 0:  27%|██▋       | 1732/6434 [4:03:13<10:55:30,  8.36s/it, gpt_loss=0.271, loss_mean=0.315][A
+Train step of epoch 0:  27%|██▋       | 1732/6434 [4:03:22<10:55:30,  8.36s/it, gpt_loss=0.321, loss_mean=0.315][A
+Train step of epoch 0:  27%|██▋       | 1733/6434 [4:03:22<11:02:33,  8.46s/it, gpt_loss=0.321, loss_mean=0.315][A
+Train step of epoch 0:  27%|██▋       | 1733/6434 [4:03:29<11:02:33,  8.46s/it, gpt_loss=0.375, loss_mean=0.321][A
+Train step of epoch 0:  27%|██▋       | 1734/6434 [4:03:29<10:38:58,  8.16s/it, gpt_loss=0.375, loss_mean=0.321][A
+Train step of epoch 0:  27%|██▋       | 1734/6434 [4:03:38<10:38:58,  8.16s/it, gpt_loss=0.294, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1735/6434 [4:03:38<10:45:47,  8.25s/it, gpt_loss=0.294, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1735/6434 [4:03:46<10:45:47,  8.25s/it, gpt_loss=0.311, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1736/6434 [4:03:46<10:51:13,  8.32s/it, gpt_loss=0.311, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1736/6434 [4:03:54<10:51:13,  8.32s/it, gpt_loss=0.257, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1737/6434 [4:03:54<10:45:55,  8.25s/it, gpt_loss=0.257, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1737/6434 [4:04:02<10:45:55,  8.25s/it, gpt_loss=0.287, loss_mean=0.309][A
+Train step of epoch 0:  27%|██▋       | 1738/6434 [4:04:02<10:45:40,  8.25s/it, gpt_loss=0.287, loss_mean=0.309][A
+Train step of epoch 0:  27%|██▋       | 1738/6434 [4:04:12<10:45:40,  8.25s/it, gpt_loss=0.277, loss_mean=0.306][A
+Train step of epoch 0:  27%|██▋       | 1739/6434 [4:04:12<11:16:40,  8.65s/it, gpt_loss=0.277, loss_mean=0.306][A
+[LID Router Debug] Step: 1740
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [3, 2, 3, 4, 3, 0, 9, 2, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  27%|██▋       | 1739/6434 [4:04:20<11:16:40,  8.65s/it, gpt_loss=0.275, loss_mean=0.303][A
+Train step of epoch 0:  27%|██▋       | 1740/6434 [4:04:20<11:06:30,  8.52s/it, gpt_loss=0.275, loss_mean=0.303][A
+Train step of epoch 0:  27%|██▋       | 1740/6434 [4:04:29<11:06:30,  8.52s/it, gpt_loss=0.398, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1741/6434 [4:04:29<10:59:53,  8.44s/it, gpt_loss=0.398, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1741/6434 [4:04:37<10:59:53,  8.44s/it, gpt_loss=0.307, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1742/6434 [4:04:37<11:00:18,  8.44s/it, gpt_loss=0.307, loss_mean=0.312][A
+Train step of epoch 0:  27%|██▋       | 1742/6434 [4:04:45<11:00:18,  8.44s/it, gpt_loss=0.32, loss_mean=0.313] [A
+Train step of epoch 0:  27%|██▋       | 1743/6434 [4:04:45<10:42:31,  8.22s/it, gpt_loss=0.32, loss_mean=0.313][A
+Train step of epoch 0:  27%|██▋       | 1743/6434 [4:04:54<10:42:31,  8.22s/it, gpt_loss=0.344, loss_mean=0.316][A
+Train step of epoch 0:  27%|██▋       | 1744/6434 [4:04:54<11:17:38,  8.67s/it, gpt_loss=0.344, loss_mean=0.316][A
+Train step of epoch 0:  27%|██▋       | 1744/6434 [4:05:02<11:17:38,  8.67s/it, gpt_loss=0.327, loss_mean=0.317][A
+Train step of epoch 0:  27%|██▋       | 1745/6434 [4:05:02<10:56:12,  8.40s/it, gpt_loss=0.327, loss_mean=0.317][A
+Train step of epoch 0:  27%|██▋       | 1745/6434 [4:05:10<10:56:12,  8.40s/it, gpt_loss=0.329, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1746/6434 [4:05:10<10:45:05,  8.26s/it, gpt_loss=0.329, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1746/6434 [4:05:19<10:45:05,  8.26s/it, gpt_loss=0.295, loss_mean=0.316][A
+Train step of epoch 0:  27%|██▋       | 1747/6434 [4:05:19<10:54:05,  8.37s/it, gpt_loss=0.295, loss_mean=0.316][A
+Train step of epoch 0:  27%|██▋       | 1747/6434 [4:05:27<10:54:05,  8.37s/it, gpt_loss=0.35, loss_mean=0.319] [A
+Train step of epoch 0:  27%|██▋       | 1748/6434 [4:05:27<10:45:26,  8.26s/it, gpt_loss=0.35, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1748/6434 [4:05:35<10:45:26,  8.26s/it, gpt_loss=0.334, loss_mean=0.321][A
+Train step of epoch 0:  27%|██▋       | 1749/6434 [4:05:35<10:39:02,  8.18s/it, gpt_loss=0.334, loss_mean=0.321][A
+[LID Router Debug] Step: 1750
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [5, 2, 6, 3, 8, 2, 5, 1, 4, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 8}
+
+Train step of epoch 0:  27%|██▋       | 1749/6434 [4:05:43<10:39:02,  8.18s/it, gpt_loss=0.301, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1750/6434 [4:05:43<10:40:44,  8.21s/it, gpt_loss=0.301, loss_mean=0.319][A
+Train step of epoch 0:  27%|██▋       | 1750/6434 [4:05:52<10:40:44,  8.21s/it, gpt_loss=0.31, loss_mean=0.318] [A
+Train step of epoch 0:  27%|██▋       | 1751/6434 [4:05:52<11:04:05,  8.51s/it, gpt_loss=0.31, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1751/6434 [4:06:00<11:04:05,  8.51s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1752/6434 [4:06:00<10:50:19,  8.33s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  27%|██▋       | 1752/6434 [4:06:09<10:50:19,  8.33s/it, gpt_loss=0.29, loss_mean=0.315] [A
+Train step of epoch 0:  27%|██▋       | 1753/6434 [4:06:09<10:57:48,  8.43s/it, gpt_loss=0.29, loss_mean=0.315][A
+Train step of epoch 0:  27%|██▋       | 1753/6434 [4:06:17<10:57:48,  8.43s/it, gpt_loss=0.302, loss_mean=0.314][A
+Train step of epoch 0:  27%|██▋       | 1754/6434 [4:06:17<10:44:16,  8.26s/it, gpt_loss=0.302, loss_mean=0.314][A
+Train step of epoch 0:  27%|██▋       | 1754/6434 [4:06:24<10:44:16,  8.26s/it, gpt_loss=0.4, loss_mean=0.322]  [A
+Train step of epoch 0:  27%|██▋       | 1755/6434 [4:06:24<10:30:28,  8.08s/it, gpt_loss=0.4, loss_mean=0.322][A
+Train step of epoch 0:  27%|██▋       | 1755/6434 [4:06:34<10:30:28,  8.08s/it, gpt_loss=0.376, loss_mean=0.328][A
+Train step of epoch 0:  27%|██▋       | 1756/6434 [4:06:34<11:00:30,  8.47s/it, gpt_loss=0.376, loss_mean=0.328][A
+Train step of epoch 0:  27%|██▋       | 1756/6434 [4:06:42<11:00:30,  8.47s/it, gpt_loss=0.355, loss_mean=0.331][A
+Train step of epoch 0:  27%|██▋       | 1757/6434 [4:06:42<11:00:56,  8.48s/it, gpt_loss=0.355, loss_mean=0.331][A
+Train step of epoch 0:  27%|██▋       | 1757/6434 [4:06:50<11:00:56,  8.48s/it, gpt_loss=0.416, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1758/6434 [4:06:50<10:37:50,  8.18s/it, gpt_loss=0.416, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1758/6434 [4:07:01<10:37:50,  8.18s/it, gpt_loss=0.365, loss_mean=0.342][A
+Train step of epoch 0:  27%|██▋       | 1759/6434 [4:07:01<11:41:31,  9.00s/it, gpt_loss=0.365, loss_mean=0.342][A
+[LID Router Debug] Step: 1760
+Batch Size: 10
+Audio Batch Size: 137
+LID Assignments: [6, 9, 5, 3, 9, 4, 2, 5, 4, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  27%|██▋       | 1759/6434 [4:07:10<11:41:31,  9.00s/it, gpt_loss=0.343, loss_mean=0.342][A
+Train step of epoch 0:  27%|██▋       | 1760/6434 [4:07:10<11:42:49,  9.02s/it, gpt_loss=0.343, loss_mean=0.342][A
+Train step of epoch 0:  27%|██▋       | 1760/6434 [4:07:18<11:42:49,  9.02s/it, gpt_loss=0.399, loss_mean=0.347][A
+Train step of epoch 0:  27%|██▋       | 1761/6434 [4:07:18<11:37:41,  8.96s/it, gpt_loss=0.399, loss_mean=0.347][A
+Train step of epoch 0:  27%|██▋       | 1761/6434 [4:07:26<11:37:41,  8.96s/it, gpt_loss=0.252, loss_mean=0.338][A
+Train step of epoch 0:  27%|██▋       | 1762/6434 [4:07:26<11:02:04,  8.50s/it, gpt_loss=0.252, loss_mean=0.338][A
+Train step of epoch 0:  27%|██▋       | 1762/6434 [4:07:35<11:02:04,  8.50s/it, gpt_loss=0.368, loss_mean=0.341][A
+Train step of epoch 0:  27%|██▋       | 1763/6434 [4:07:35<11:18:06,  8.71s/it, gpt_loss=0.368, loss_mean=0.341][A
+Train step of epoch 0:  27%|██▋       | 1763/6434 [4:07:43<11:18:06,  8.71s/it, gpt_loss=0.31, loss_mean=0.338] [A
+Train step of epoch 0:  27%|██▋       | 1764/6434 [4:07:43<10:55:18,  8.42s/it, gpt_loss=0.31, loss_mean=0.338][A
+Train step of epoch 0:  27%|██▋       | 1764/6434 [4:07:51<10:55:18,  8.42s/it, gpt_loss=0.367, loss_mean=0.341][A
+Train step of epoch 0:  27%|██▋       | 1765/6434 [4:07:51<10:58:49,  8.47s/it, gpt_loss=0.367, loss_mean=0.341][A
+Train step of epoch 0:  27%|██▋       | 1765/6434 [4:07:59<10:58:49,  8.47s/it, gpt_loss=0.327, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1766/6434 [4:07:59<10:42:22,  8.26s/it, gpt_loss=0.327, loss_mean=0.339][A
+Train step of epoch 0:  27%|██▋       | 1766/6434 [4:08:07<10:42:22,  8.26s/it, gpt_loss=0.302, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1767/6434 [4:08:07<10:34:13,  8.15s/it, gpt_loss=0.302, loss_mean=0.336][A
+Train step of epoch 0:  27%|██▋       | 1767/6434 [4:08:17<10:34:13,  8.15s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1768/6434 [4:08:17<11:13:12,  8.66s/it, gpt_loss=0.332, loss_mean=0.335][A
+Train step of epoch 0:  27%|██▋       | 1768/6434 [4:08:26<11:13:12,  8.66s/it, gpt_loss=0.296, loss_mean=0.331][A
+Train step of epoch 0:  27%|██▋       | 1769/6434 [4:08:26<11:12:12,  8.65s/it, gpt_loss=0.296, loss_mean=0.331][A
+[LID Router Debug] Step: 1770
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [3, 6, 3, 1, 4, 2, 1, 9, 2, 2]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  27%|██▋       | 1769/6434 [4:08:34<11:12:12,  8.65s/it, gpt_loss=0.3, loss_mean=0.328]  [A
+Train step of epoch 0:  28%|██▊       | 1770/6434 [4:08:34<10:56:34,  8.45s/it, gpt_loss=0.3, loss_mean=0.328][A
+Train step of epoch 0:  28%|██▊       | 1770/6434 [4:08:41<10:56:34,  8.45s/it, gpt_loss=0.326, loss_mean=0.328][A
+Train step of epoch 0:  28%|██▊       | 1771/6434 [4:08:41<10:43:33,  8.28s/it, gpt_loss=0.326, loss_mean=0.328][A
+Train step of epoch 0:  28%|██▊       | 1771/6434 [4:08:49<10:43:33,  8.28s/it, gpt_loss=0.3, loss_mean=0.325]  [A
+Train step of epoch 0:  28%|██▊       | 1772/6434 [4:08:49<10:36:39,  8.19s/it, gpt_loss=0.3, loss_mean=0.325][A
+Train step of epoch 0:  28%|██▊       | 1772/6434 [4:08:59<10:36:39,  8.19s/it, gpt_loss=0.426, loss_mean=0.335][A
+Train step of epoch 0:  28%|██▊       | 1773/6434 [4:08:59<11:10:51,  8.64s/it, gpt_loss=0.426, loss_mean=0.335][A
+Train step of epoch 0:  28%|██▊       | 1773/6434 [4:09:07<11:10:51,  8.64s/it, gpt_loss=0.313, loss_mean=0.333][A
+Train step of epoch 0:  28%|██▊       | 1774/6434 [4:09:07<10:57:52,  8.47s/it, gpt_loss=0.313, loss_mean=0.333][A
+Train step of epoch 0:  28%|██▊       | 1774/6434 [4:09:16<10:57:52,  8.47s/it, gpt_loss=0.297, loss_mean=0.33] [A
+Train step of epoch 0:  28%|██▊       | 1775/6434 [4:09:16<10:58:57,  8.49s/it, gpt_loss=0.297, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1775/6434 [4:09:24<10:58:57,  8.49s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1776/6434 [4:09:24<10:49:06,  8.36s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1776/6434 [4:09:32<10:49:06,  8.36s/it, gpt_loss=0.316, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1777/6434 [4:09:32<10:53:07,  8.41s/it, gpt_loss=0.316, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1777/6434 [4:09:41<10:53:07,  8.41s/it, gpt_loss=0.325, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1778/6434 [4:09:41<11:00:59,  8.52s/it, gpt_loss=0.325, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1778/6434 [4:09:49<11:00:59,  8.52s/it, gpt_loss=0.34, loss_mean=0.33]  [A
+Train step of epoch 0:  28%|██▊       | 1779/6434 [4:09:49<10:47:52,  8.35s/it, gpt_loss=0.34, loss_mean=0.33][A
+[LID Router Debug] Step: 1780
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [9, 6, 1, 4, 3, 3, 2, 1, 6, 3]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  28%|██▊       | 1779/6434 [4:09:58<10:47:52,  8.35s/it, gpt_loss=0.342, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1780/6434 [4:09:58<10:53:27,  8.42s/it, gpt_loss=0.342, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1780/6434 [4:10:07<10:53:27,  8.42s/it, gpt_loss=0.273, loss_mean=0.325][A
+Train step of epoch 0:  28%|██▊       | 1781/6434 [4:10:07<11:12:37,  8.67s/it, gpt_loss=0.273, loss_mean=0.325][A
+Train step of epoch 0:  28%|██▊       | 1781/6434 [4:10:16<11:12:37,  8.67s/it, gpt_loss=0.274, loss_mean=0.32] [A
+Train step of epoch 0:  28%|██▊       | 1782/6434 [4:10:16<11:19:53,  8.77s/it, gpt_loss=0.274, loss_mean=0.32][A
+Train step of epoch 0:  28%|██▊       | 1782/6434 [4:10:25<11:19:53,  8.77s/it, gpt_loss=0.435, loss_mean=0.332][A
+Train step of epoch 0:  28%|██▊       | 1783/6434 [4:10:25<11:26:42,  8.86s/it, gpt_loss=0.435, loss_mean=0.332][A
+Train step of epoch 0:  28%|██▊       | 1783/6434 [4:10:34<11:26:42,  8.86s/it, gpt_loss=0.371, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1784/6434 [4:10:34<11:20:07,  8.78s/it, gpt_loss=0.371, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1784/6434 [4:10:41<11:20:07,  8.78s/it, gpt_loss=0.385, loss_mean=0.341][A
+Train step of epoch 0:  28%|██▊       | 1785/6434 [4:10:41<10:55:33,  8.46s/it, gpt_loss=0.385, loss_mean=0.341][A
+Train step of epoch 0:  28%|██▊       | 1785/6434 [4:10:51<10:55:33,  8.46s/it, gpt_loss=0.315, loss_mean=0.338][A
+Train step of epoch 0:  28%|██▊       | 1786/6434 [4:10:51<11:18:27,  8.76s/it, gpt_loss=0.315, loss_mean=0.338][A
+Train step of epoch 0:  28%|██▊       | 1786/6434 [4:10:59<11:18:27,  8.76s/it, gpt_loss=0.31, loss_mean=0.335] [A
+Train step of epoch 0:  28%|██▊       | 1787/6434 [4:10:59<11:11:24,  8.67s/it, gpt_loss=0.31, loss_mean=0.335][A
+Train step of epoch 0:  28%|██▊       | 1787/6434 [4:11:07<11:11:24,  8.67s/it, gpt_loss=0.375, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1788/6434 [4:11:07<10:52:00,  8.42s/it, gpt_loss=0.375, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1788/6434 [4:11:16<10:52:00,  8.42s/it, gpt_loss=0.286, loss_mean=0.334][A
+Train step of epoch 0:  28%|██▊       | 1789/6434 [4:11:16<11:00:53,  8.54s/it, gpt_loss=0.286, loss_mean=0.334][A
+[LID Router Debug] Step: 1790
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [9, 5, 3, 9, 1, 6, 5, 2, 2, 2]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  28%|██▊       | 1789/6434 [4:11:24<11:00:53,  8.54s/it, gpt_loss=0.394, loss_mean=0.34] [A
+Train step of epoch 0:  28%|██▊       | 1790/6434 [4:11:24<10:51:34,  8.42s/it, gpt_loss=0.394, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1790/6434 [4:11:35<10:51:34,  8.42s/it, gpt_loss=0.259, loss_mean=0.332][A
+Train step of epoch 0:  28%|██▊       | 1791/6434 [4:11:35<12:03:39,  9.35s/it, gpt_loss=0.259, loss_mean=0.332][A
+Train step of epoch 0:  28%|██▊       | 1791/6434 [4:11:44<12:03:39,  9.35s/it, gpt_loss=0.318, loss_mean=0.33] [A
+Train step of epoch 0:  28%|██▊       | 1792/6434 [4:11:44<11:47:32,  9.15s/it, gpt_loss=0.318, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1792/6434 [4:11:53<11:47:32,  9.15s/it, gpt_loss=0.32, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1793/6434 [4:11:53<11:33:38,  8.97s/it, gpt_loss=0.32, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1793/6434 [4:12:01<11:33:38,  8.97s/it, gpt_loss=0.282, loss_mean=0.325][A
+Train step of epoch 0:  28%|██▊       | 1794/6434 [4:12:01<11:13:29,  8.71s/it, gpt_loss=0.282, loss_mean=0.325][A
+Train step of epoch 0:  28%|██▊       | 1794/6434 [4:12:09<11:13:29,  8.71s/it, gpt_loss=0.392, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1795/6434 [4:12:09<11:11:33,  8.69s/it, gpt_loss=0.392, loss_mean=0.331][A
+Train step of epoch 0:  28%|██▊       | 1795/6434 [4:12:17<11:11:33,  8.69s/it, gpt_loss=0.314, loss_mean=0.33] [A
+Train step of epoch 0:  28%|██▊       | 1796/6434 [4:12:17<10:37:40,  8.25s/it, gpt_loss=0.314, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1796/6434 [4:12:25<10:37:40,  8.25s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  28%|██▊       | 1797/6434 [4:12:25<10:44:46,  8.34s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  28%|██▊       | 1797/6434 [4:12:34<10:44:46,  8.34s/it, gpt_loss=0.285, loss_mean=0.323][A
+Train step of epoch 0:  28%|██▊       | 1798/6434 [4:12:34<10:56:04,  8.49s/it, gpt_loss=0.285, loss_mean=0.323][A
+Train step of epoch 0:  28%|██▊       | 1798/6434 [4:12:43<10:56:04,  8.49s/it, gpt_loss=0.386, loss_mean=0.329][A
+Train step of epoch 0:  28%|██▊       | 1799/6434 [4:12:43<11:04:03,  8.60s/it, gpt_loss=0.386, loss_mean=0.329][A
+[LID Router Debug] Step: 1800
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [3, 1, 9, 4, 6, 6, 9, 9, 0, 1]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+[2026-02-06 20:08:55,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[1.9653418297738813e-05, 1.9653418297738813e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 20:08:55,762] [INFO] [timer.py:260:stop] epoch=0/micro_step=1800/global_step=900, RunningAvgSamplesPerSec=4.758293243203502, CurrSamplesPerSec=4.651844543384789, MemAllocated=12.57GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  28%|██▊       | 1799/6434 [4:12:51<11:04:03,  8.60s/it, gpt_loss=0.323, loss_mean=0.328][A
+Train step of epoch 0:  28%|██▊       | 1800/6434 [4:12:51<10:59:00,  8.53s/it, gpt_loss=0.323, loss_mean=0.328][A
+Train step of epoch 0:  28%|██▊       | 1800/6434 [4:12:59<10:59:00,  8.53s/it, gpt_loss=0.339, loss_mean=0.33] [A
+Train step of epoch 0:  28%|██▊       | 1801/6434 [4:12:59<10:44:18,  8.34s/it, gpt_loss=0.339, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1801/6434 [4:13:07<10:44:18,  8.34s/it, gpt_loss=0.336, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1802/6434 [4:13:07<10:30:21,  8.17s/it, gpt_loss=0.336, loss_mean=0.33][A
+Train step of epoch 0:  28%|██▊       | 1802/6434 [4:13:15<10:30:21,  8.17s/it, gpt_loss=0.431, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1803/6434 [4:13:15<10:25:54,  8.11s/it, gpt_loss=0.431, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1803/6434 [4:13:24<10:25:54,  8.11s/it, gpt_loss=0.335, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1804/6434 [4:13:24<10:38:28,  8.27s/it, gpt_loss=0.335, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1804/6434 [4:13:32<10:38:28,  8.27s/it, gpt_loss=0.288, loss_mean=0.335][A
+Train step of epoch 0:  28%|██▊       | 1805/6434 [4:13:32<10:36:46,  8.25s/it, gpt_loss=0.288, loss_mean=0.335][A
+Train step of epoch 0:  28%|██▊       | 1805/6434 [4:13:40<10:36:46,  8.25s/it, gpt_loss=0.389, loss_mean=0.34] [A
+Train step of epoch 0:  28%|██▊       | 1806/6434 [4:13:40<10:36:10,  8.25s/it, gpt_loss=0.389, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1806/6434 [4:13:50<10:36:10,  8.25s/it, gpt_loss=0.354, loss_mean=0.341][A
+Train step of epoch 0:  28%|██▊       | 1807/6434 [4:13:50<11:06:15,  8.64s/it, gpt_loss=0.354, loss_mean=0.341][A
+Train step of epoch 0:  28%|██▊       | 1807/6434 [4:13:58<11:06:15,  8.64s/it, gpt_loss=0.36, loss_mean=0.343] [A
+Train step of epoch 0:  28%|██▊       | 1808/6434 [4:13:58<10:50:10,  8.43s/it, gpt_loss=0.36, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1808/6434 [4:14:06<10:50:10,  8.43s/it, gpt_loss=0.327, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1809/6434 [4:14:06<10:46:12,  8.38s/it, gpt_loss=0.327, loss_mean=0.342][A
+[LID Router Debug] Step: 1810
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 2, 3, 1, 2, 0, 4, 2, 4, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4}
+
+Train step of epoch 0:  28%|██▊       | 1809/6434 [4:14:14<10:46:12,  8.38s/it, gpt_loss=0.429, loss_mean=0.35] [A
+Train step of epoch 0:  28%|██▊       | 1810/6434 [4:14:14<10:35:16,  8.24s/it, gpt_loss=0.429, loss_mean=0.35][A
+Train step of epoch 0:  28%|██▊       | 1810/6434 [4:14:22<10:35:16,  8.24s/it, gpt_loss=0.272, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1811/6434 [4:14:22<10:46:36,  8.39s/it, gpt_loss=0.272, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1811/6434 [4:14:33<10:46:36,  8.39s/it, gpt_loss=0.337, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1812/6434 [4:14:33<11:36:49,  9.05s/it, gpt_loss=0.337, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1812/6434 [4:14:43<11:36:49,  9.05s/it, gpt_loss=0.321, loss_mean=0.34] [A
+Train step of epoch 0:  28%|██▊       | 1813/6434 [4:14:43<11:53:31,  9.26s/it, gpt_loss=0.321, loss_mean=0.34][A
+Train step of epoch 0:  28%|██▊       | 1813/6434 [4:14:51<11:53:31,  9.26s/it, gpt_loss=0.299, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1814/6434 [4:14:51<11:27:13,  8.93s/it, gpt_loss=0.299, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1814/6434 [4:14:59<11:27:13,  8.93s/it, gpt_loss=0.403, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1815/6434 [4:14:59<11:10:13,  8.71s/it, gpt_loss=0.403, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1815/6434 [4:15:08<11:10:13,  8.71s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1816/6434 [4:15:08<11:08:22,  8.68s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1816/6434 [4:15:16<11:08:22,  8.68s/it, gpt_loss=0.333, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1817/6434 [4:15:16<11:03:50,  8.63s/it, gpt_loss=0.333, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1817/6434 [4:15:25<11:03:50,  8.63s/it, gpt_loss=0.312, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1818/6434 [4:15:25<11:02:13,  8.61s/it, gpt_loss=0.312, loss_mean=0.336][A
+Train step of epoch 0:  28%|██▊       | 1818/6434 [4:15:33<11:02:13,  8.61s/it, gpt_loss=0.365, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1819/6434 [4:15:33<10:54:43,  8.51s/it, gpt_loss=0.365, loss_mean=0.339][A
+[LID Router Debug] Step: 1820
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [5, 5, 3, 2, 2, 5, 6, 2, 6, 5]
+Active Experts in Batch: {2, 3, 5, 6}
+
+Train step of epoch 0:  28%|██▊       | 1819/6434 [4:15:41<10:54:43,  8.51s/it, gpt_loss=0.428, loss_mean=0.348][A
+Train step of epoch 0:  28%|██▊       | 1820/6434 [4:15:41<10:40:06,  8.32s/it, gpt_loss=0.428, loss_mean=0.348][A
+Train step of epoch 0:  28%|██▊       | 1820/6434 [4:15:49<10:40:06,  8.32s/it, gpt_loss=0.441, loss_mean=0.357][A
+Train step of epoch 0:  28%|██▊       | 1821/6434 [4:15:49<10:23:47,  8.11s/it, gpt_loss=0.441, loss_mean=0.357][A
+Train step of epoch 0:  28%|██▊       | 1821/6434 [4:15:56<10:23:47,  8.11s/it, gpt_loss=0.334, loss_mean=0.355][A
+Train step of epoch 0:  28%|██▊       | 1822/6434 [4:15:56<10:05:42,  7.88s/it, gpt_loss=0.334, loss_mean=0.355][A
+Train step of epoch 0:  28%|██▊       | 1822/6434 [4:16:04<10:05:42,  7.88s/it, gpt_loss=0.366, loss_mean=0.356][A
+Train step of epoch 0:  28%|██▊       | 1823/6434 [4:16:04<10:18:17,  8.05s/it, gpt_loss=0.366, loss_mean=0.356][A
+Train step of epoch 0:  28%|██▊       | 1823/6434 [4:16:13<10:18:17,  8.05s/it, gpt_loss=0.291, loss_mean=0.35] [A
+Train step of epoch 0:  28%|██▊       | 1824/6434 [4:16:13<10:36:29,  8.28s/it, gpt_loss=0.291, loss_mean=0.35][A
+Train step of epoch 0:  28%|██▊       | 1824/6434 [4:16:21<10:36:29,  8.28s/it, gpt_loss=0.289, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1825/6434 [4:16:21<10:34:55,  8.27s/it, gpt_loss=0.289, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1825/6434 [4:16:30<10:34:55,  8.27s/it, gpt_loss=0.36, loss_mean=0.345] [A
+Train step of epoch 0:  28%|██▊       | 1826/6434 [4:16:30<10:36:49,  8.29s/it, gpt_loss=0.36, loss_mean=0.345][A
+Train step of epoch 0:  28%|██▊       | 1826/6434 [4:16:37<10:36:49,  8.29s/it, gpt_loss=0.375, loss_mean=0.348][A
+Train step of epoch 0:  28%|██▊       | 1827/6434 [4:16:37<10:09:22,  7.94s/it, gpt_loss=0.375, loss_mean=0.348][A
+Train step of epoch 0:  28%|██▊       | 1827/6434 [4:16:46<10:09:22,  7.94s/it, gpt_loss=0.352, loss_mean=0.349][A
+Train step of epoch 0:  28%|██▊       | 1828/6434 [4:16:46<10:42:56,  8.38s/it, gpt_loss=0.352, loss_mean=0.349][A
+Train step of epoch 0:  28%|██▊       | 1828/6434 [4:16:54<10:42:56,  8.38s/it, gpt_loss=0.326, loss_mean=0.346][A
+Train step of epoch 0:  28%|██▊       | 1829/6434 [4:16:54<10:17:03,  8.04s/it, gpt_loss=0.326, loss_mean=0.346][A
+[LID Router Debug] Step: 1830
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [4, 2, 2, 1, 0, 3, 2, 2, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4}
+
+Train step of epoch 0:  28%|██▊       | 1829/6434 [4:17:02<10:17:03,  8.04s/it, gpt_loss=0.303, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1830/6434 [4:17:02<10:31:40,  8.23s/it, gpt_loss=0.303, loss_mean=0.342][A
+Train step of epoch 0:  28%|██▊       | 1830/6434 [4:17:11<10:31:40,  8.23s/it, gpt_loss=0.29, loss_mean=0.337] [A
+Train step of epoch 0:  28%|██▊       | 1831/6434 [4:17:11<10:39:45,  8.34s/it, gpt_loss=0.29, loss_mean=0.337][A
+Train step of epoch 0:  28%|██▊       | 1831/6434 [4:17:20<10:39:45,  8.34s/it, gpt_loss=0.363, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1832/6434 [4:17:20<10:51:03,  8.49s/it, gpt_loss=0.363, loss_mean=0.339][A
+Train step of epoch 0:  28%|██▊       | 1832/6434 [4:17:28<10:51:03,  8.49s/it, gpt_loss=0.373, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1833/6434 [4:17:28<10:55:22,  8.55s/it, gpt_loss=0.373, loss_mean=0.343][A
+Train step of epoch 0:  28%|██▊       | 1833/6434 [4:17:37<10:55:22,  8.55s/it, gpt_loss=0.264, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▊       | 1834/6434 [4:17:37<10:50:12,  8.48s/it, gpt_loss=0.264, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▊       | 1834/6434 [4:17:45<10:50:12,  8.48s/it, gpt_loss=0.269, loss_mean=0.328][A
+Train step of epoch 0:  29%|██▊       | 1835/6434 [4:17:45<10:43:47,  8.40s/it, gpt_loss=0.269, loss_mean=0.328][A
+Train step of epoch 0:  29%|██▊       | 1835/6434 [4:17:54<10:43:47,  8.40s/it, gpt_loss=0.344, loss_mean=0.33] [A
+Train step of epoch 0:  29%|██▊       | 1836/6434 [4:17:54<10:58:54,  8.60s/it, gpt_loss=0.344, loss_mean=0.33][A
+Train step of epoch 0:  29%|██▊       | 1836/6434 [4:18:03<10:58:54,  8.60s/it, gpt_loss=0.27, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▊       | 1837/6434 [4:18:03<11:00:23,  8.62s/it, gpt_loss=0.27, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▊       | 1837/6434 [4:18:11<11:00:23,  8.62s/it, gpt_loss=0.273, loss_mean=0.319][A
+Train step of epoch 0:  29%|██▊       | 1838/6434 [4:18:11<10:56:44,  8.57s/it, gpt_loss=0.273, loss_mean=0.319][A
+Train step of epoch 0:  29%|██▊       | 1838/6434 [4:18:21<10:56:44,  8.57s/it, gpt_loss=0.326, loss_mean=0.319][A
+Train step of epoch 0:  29%|██▊       | 1839/6434 [4:18:21<11:25:45,  8.95s/it, gpt_loss=0.326, loss_mean=0.319][A
+[LID Router Debug] Step: 1840
+Batch Size: 10
+Audio Batch Size: 72
+LID Assignments: [2, 5, 1, 1, 5, 1, 9, 4, 0, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  29%|██▊       | 1839/6434 [4:18:29<11:25:45,  8.95s/it, gpt_loss=0.374, loss_mean=0.325][A
+Train step of epoch 0:  29%|██▊       | 1840/6434 [4:18:29<11:10:29,  8.76s/it, gpt_loss=0.374, loss_mean=0.325][A
+Train step of epoch 0:  29%|██▊       | 1840/6434 [4:18:37<11:10:29,  8.76s/it, gpt_loss=0.427, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▊       | 1841/6434 [4:18:37<10:56:16,  8.57s/it, gpt_loss=0.427, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▊       | 1841/6434 [4:18:46<10:56:16,  8.57s/it, gpt_loss=0.357, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▊       | 1842/6434 [4:18:46<10:53:15,  8.54s/it, gpt_loss=0.357, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▊       | 1842/6434 [4:18:55<10:53:15,  8.54s/it, gpt_loss=0.471, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▊       | 1843/6434 [4:18:55<11:02:16,  8.66s/it, gpt_loss=0.471, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▊       | 1843/6434 [4:19:02<11:02:16,  8.66s/it, gpt_loss=0.322, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▊       | 1844/6434 [4:19:02<10:40:38,  8.37s/it, gpt_loss=0.322, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▊       | 1844/6434 [4:19:12<10:40:38,  8.37s/it, gpt_loss=0.328, loss_mean=0.346][A
+Train step of epoch 0:  29%|██▊       | 1845/6434 [4:19:12<10:58:18,  8.61s/it, gpt_loss=0.328, loss_mean=0.346][A
+Train step of epoch 0:  29%|██▊       | 1845/6434 [4:19:19<10:58:18,  8.61s/it, gpt_loss=0.322, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▊       | 1846/6434 [4:19:19<10:32:52,  8.28s/it, gpt_loss=0.322, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▊       | 1846/6434 [4:19:27<10:32:52,  8.28s/it, gpt_loss=0.388, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▊       | 1847/6434 [4:19:27<10:19:37,  8.10s/it, gpt_loss=0.388, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▊       | 1847/6434 [4:19:35<10:19:37,  8.10s/it, gpt_loss=0.373, loss_mean=0.35] [A
+Train step of epoch 0:  29%|██▊       | 1848/6434 [4:19:35<10:28:20,  8.22s/it, gpt_loss=0.373, loss_mean=0.35][A
+Train step of epoch 0:  29%|██▊       | 1848/6434 [4:19:44<10:28:20,  8.22s/it, gpt_loss=0.276, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▊       | 1849/6434 [4:19:44<10:39:29,  8.37s/it, gpt_loss=0.276, loss_mean=0.343][A
+[LID Router Debug] Step: 1850
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [0, 3, 1, 0, 4, 3, 9, 9, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  29%|██▊       | 1849/6434 [4:19:54<10:39:29,  8.37s/it, gpt_loss=0.352, loss_mean=0.344][A
+Train step of epoch 0:  29%|██▉       | 1850/6434 [4:19:54<11:10:36,  8.78s/it, gpt_loss=0.352, loss_mean=0.344][A
+Train step of epoch 0:  29%|██▉       | 1850/6434 [4:20:02<11:10:36,  8.78s/it, gpt_loss=0.351, loss_mean=0.344][A
+Train step of epoch 0:  29%|██▉       | 1851/6434 [4:20:02<10:55:51,  8.59s/it, gpt_loss=0.351, loss_mean=0.344][A
+Train step of epoch 0:  29%|██▉       | 1851/6434 [4:20:10<10:55:51,  8.59s/it, gpt_loss=0.305, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1852/6434 [4:20:10<10:53:50,  8.56s/it, gpt_loss=0.305, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1852/6434 [4:20:18<10:53:50,  8.56s/it, gpt_loss=0.396, loss_mean=0.346][A
+Train step of epoch 0:  29%|██▉       | 1853/6434 [4:20:18<10:37:05,  8.34s/it, gpt_loss=0.396, loss_mean=0.346][A
+Train step of epoch 0:  29%|██▉       | 1853/6434 [4:20:26<10:37:05,  8.34s/it, gpt_loss=0.319, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1854/6434 [4:20:26<10:34:36,  8.31s/it, gpt_loss=0.319, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1854/6434 [4:20:36<10:34:36,  8.31s/it, gpt_loss=0.363, loss_mean=0.345][A
+Train step of epoch 0:  29%|██▉       | 1855/6434 [4:20:36<11:06:20,  8.73s/it, gpt_loss=0.363, loss_mean=0.345][A
+Train step of epoch 0:  29%|██▉       | 1855/6434 [4:20:46<11:06:20,  8.73s/it, gpt_loss=0.276, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1856/6434 [4:20:46<11:23:02,  8.95s/it, gpt_loss=0.276, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1856/6434 [4:20:55<11:23:02,  8.95s/it, gpt_loss=0.323, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▉       | 1857/6434 [4:20:55<11:21:35,  8.93s/it, gpt_loss=0.323, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▉       | 1857/6434 [4:21:03<11:21:35,  8.93s/it, gpt_loss=0.249, loss_mean=0.328][A
+Train step of epoch 0:  29%|██▉       | 1858/6434 [4:21:03<11:14:00,  8.84s/it, gpt_loss=0.249, loss_mean=0.328][A
+Train step of epoch 0:  29%|██▉       | 1858/6434 [4:21:11<11:14:00,  8.84s/it, gpt_loss=0.323, loss_mean=0.328][A
+Train step of epoch 0:  29%|██▉       | 1859/6434 [4:21:11<10:59:46,  8.65s/it, gpt_loss=0.323, loss_mean=0.328][A
+[LID Router Debug] Step: 1860
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 2, 2, 5, 4, 4, 1, 1, 0, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+Train step of epoch 0:  29%|██▉       | 1859/6434 [4:21:19<10:59:46,  8.65s/it, gpt_loss=0.31, loss_mean=0.326] [A
+Train step of epoch 0:  29%|██▉       | 1860/6434 [4:21:19<10:45:07,  8.46s/it, gpt_loss=0.31, loss_mean=0.326][A
+Train step of epoch 0:  29%|██▉       | 1860/6434 [4:21:28<10:45:07,  8.46s/it, gpt_loss=0.409, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1861/6434 [4:21:28<10:40:48,  8.41s/it, gpt_loss=0.409, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1861/6434 [4:21:36<10:40:48,  8.41s/it, gpt_loss=0.333, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1862/6434 [4:21:36<10:36:25,  8.35s/it, gpt_loss=0.333, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1862/6434 [4:21:45<10:36:25,  8.35s/it, gpt_loss=0.402, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1863/6434 [4:21:45<10:57:29,  8.63s/it, gpt_loss=0.402, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1863/6434 [4:21:54<10:57:29,  8.63s/it, gpt_loss=0.274, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1864/6434 [4:21:54<10:57:39,  8.63s/it, gpt_loss=0.274, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1864/6434 [4:22:02<10:57:39,  8.63s/it, gpt_loss=0.47, loss_mean=0.348] [A
+Train step of epoch 0:  29%|██▉       | 1865/6434 [4:22:02<10:53:50,  8.59s/it, gpt_loss=0.47, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▉       | 1865/6434 [4:22:11<10:53:50,  8.59s/it, gpt_loss=0.344, loss_mean=0.347][A
+Train step of epoch 0:  29%|██▉       | 1866/6434 [4:22:11<10:56:32,  8.62s/it, gpt_loss=0.344, loss_mean=0.347][A
+Train step of epoch 0:  29%|██▉       | 1866/6434 [4:22:20<10:56:32,  8.62s/it, gpt_loss=0.388, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▉       | 1867/6434 [4:22:20<11:12:31,  8.84s/it, gpt_loss=0.388, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▉       | 1867/6434 [4:22:28<11:12:31,  8.84s/it, gpt_loss=0.346, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▉       | 1868/6434 [4:22:28<10:51:48,  8.57s/it, gpt_loss=0.346, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▉       | 1868/6434 [4:22:37<10:51:48,  8.57s/it, gpt_loss=0.349, loss_mean=0.351][A
+Train step of epoch 0:  29%|██▉       | 1869/6434 [4:22:37<10:45:34,  8.49s/it, gpt_loss=0.349, loss_mean=0.351][A
+[LID Router Debug] Step: 1870
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [5, 9, 1, 5, 3, 9, 0, 9, 1, 5]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+Train step of epoch 0:  29%|██▉       | 1869/6434 [4:22:45<10:45:34,  8.49s/it, gpt_loss=0.328, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▉       | 1870/6434 [4:22:45<10:35:57,  8.36s/it, gpt_loss=0.328, loss_mean=0.348][A
+Train step of epoch 0:  29%|██▉       | 1870/6434 [4:22:54<10:35:57,  8.36s/it, gpt_loss=0.29, loss_mean=0.343] [A
+Train step of epoch 0:  29%|██▉       | 1871/6434 [4:22:54<11:01:54,  8.70s/it, gpt_loss=0.29, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1871/6434 [4:23:03<11:01:54,  8.70s/it, gpt_loss=0.322, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1872/6434 [4:23:03<11:09:46,  8.81s/it, gpt_loss=0.322, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1872/6434 [4:23:12<11:09:46,  8.81s/it, gpt_loss=0.317, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1873/6434 [4:23:12<11:06:34,  8.77s/it, gpt_loss=0.317, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1873/6434 [4:23:21<11:06:34,  8.77s/it, gpt_loss=0.369, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1874/6434 [4:23:21<11:05:21,  8.75s/it, gpt_loss=0.369, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1874/6434 [4:23:28<11:05:21,  8.75s/it, gpt_loss=0.303, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▉       | 1875/6434 [4:23:28<10:28:10,  8.27s/it, gpt_loss=0.303, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▉       | 1875/6434 [4:23:36<10:28:10,  8.27s/it, gpt_loss=0.364, loss_mean=0.34] [A
+Train step of epoch 0:  29%|██▉       | 1876/6434 [4:23:36<10:32:18,  8.32s/it, gpt_loss=0.364, loss_mean=0.34][A
+Train step of epoch 0:  29%|██▉       | 1876/6434 [4:23:44<10:32:18,  8.32s/it, gpt_loss=0.353, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1877/6434 [4:23:44<10:11:05,  8.05s/it, gpt_loss=0.353, loss_mean=0.341][A
+Train step of epoch 0:  29%|██▉       | 1877/6434 [4:23:52<10:11:05,  8.05s/it, gpt_loss=0.456, loss_mean=0.353][A
+Train step of epoch 0:  29%|██▉       | 1878/6434 [4:23:52<10:27:01,  8.26s/it, gpt_loss=0.456, loss_mean=0.353][A
+Train step of epoch 0:  29%|██▉       | 1878/6434 [4:24:00<10:27:01,  8.26s/it, gpt_loss=0.29, loss_mean=0.346] [A
+Train step of epoch 0:  29%|██▉       | 1879/6434 [4:24:00<10:08:08,  8.01s/it, gpt_loss=0.29, loss_mean=0.346][A
+[LID Router Debug] Step: 1880
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [2, 2, 3, 2, 2, 1, 3, 4, 4, 1]
+Active Experts in Batch: {1, 2, 3, 4}
+
+Train step of epoch 0:  29%|██▉       | 1879/6434 [4:24:08<10:08:08,  8.01s/it, gpt_loss=0.228, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▉       | 1880/6434 [4:24:08<10:22:53,  8.21s/it, gpt_loss=0.228, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▉       | 1880/6434 [4:24:17<10:22:53,  8.21s/it, gpt_loss=0.313, loss_mean=0.332][A
+Train step of epoch 0:  29%|██▉       | 1881/6434 [4:24:17<10:24:50,  8.23s/it, gpt_loss=0.313, loss_mean=0.332][A
+Train step of epoch 0:  29%|██▉       | 1881/6434 [4:24:25<10:24:50,  8.23s/it, gpt_loss=0.294, loss_mean=0.329][A
+Train step of epoch 0:  29%|██▉       | 1882/6434 [4:24:25<10:21:05,  8.19s/it, gpt_loss=0.294, loss_mean=0.329][A
+Train step of epoch 0:  29%|██▉       | 1882/6434 [4:24:33<10:21:05,  8.19s/it, gpt_loss=0.283, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▉       | 1883/6434 [4:24:33<10:28:29,  8.29s/it, gpt_loss=0.283, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▉       | 1883/6434 [4:24:42<10:28:29,  8.29s/it, gpt_loss=0.418, loss_mean=0.333][A
+Train step of epoch 0:  29%|██▉       | 1884/6434 [4:24:42<10:27:02,  8.27s/it, gpt_loss=0.418, loss_mean=0.333][A
+Train step of epoch 0:  29%|██▉       | 1884/6434 [4:24:49<10:27:02,  8.27s/it, gpt_loss=0.37, loss_mean=0.337] [A
+Train step of epoch 0:  29%|██▉       | 1885/6434 [4:24:49<10:13:20,  8.09s/it, gpt_loss=0.37, loss_mean=0.337][A
+Train step of epoch 0:  29%|██▉       | 1885/6434 [4:24:57<10:13:20,  8.09s/it, gpt_loss=0.261, loss_mean=0.329][A
+Train step of epoch 0:  29%|██▉       | 1886/6434 [4:24:57<10:16:40,  8.14s/it, gpt_loss=0.261, loss_mean=0.329][A
+Train step of epoch 0:  29%|██▉       | 1886/6434 [4:25:07<10:16:40,  8.14s/it, gpt_loss=0.279, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▉       | 1887/6434 [4:25:07<10:49:00,  8.56s/it, gpt_loss=0.279, loss_mean=0.324][A
+Train step of epoch 0:  29%|██▉       | 1887/6434 [4:25:16<10:49:00,  8.56s/it, gpt_loss=0.354, loss_mean=0.327][A
+Train step of epoch 0:  29%|██▉       | 1888/6434 [4:25:16<10:52:00,  8.61s/it, gpt_loss=0.354, loss_mean=0.327][A
+Train step of epoch 0:  29%|██▉       | 1888/6434 [4:25:24<10:52:00,  8.61s/it, gpt_loss=0.304, loss_mean=0.325][A
+Train step of epoch 0:  29%|██▉       | 1889/6434 [4:25:24<10:48:25,  8.56s/it, gpt_loss=0.304, loss_mean=0.325][A
+[LID Router Debug] Step: 1890
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [2, 9, 2, 5, 3, 9, 0, 2, 5, 5]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  29%|██▉       | 1889/6434 [4:25:32<10:48:25,  8.56s/it, gpt_loss=0.505, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1890/6434 [4:25:32<10:41:07,  8.47s/it, gpt_loss=0.505, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1890/6434 [4:25:40<10:41:07,  8.47s/it, gpt_loss=0.345, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1891/6434 [4:25:40<10:20:46,  8.20s/it, gpt_loss=0.345, loss_mean=0.343][A
+Train step of epoch 0:  29%|██▉       | 1891/6434 [4:25:49<10:20:46,  8.20s/it, gpt_loss=0.295, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1892/6434 [4:25:49<10:27:45,  8.29s/it, gpt_loss=0.295, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1892/6434 [4:25:58<10:27:45,  8.29s/it, gpt_loss=0.305, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▉       | 1893/6434 [4:25:58<10:47:21,  8.55s/it, gpt_loss=0.305, loss_mean=0.335][A
+Train step of epoch 0:  29%|██▉       | 1893/6434 [4:26:07<10:47:21,  8.55s/it, gpt_loss=0.403, loss_mean=0.342][A
+Train step of epoch 0:  29%|██▉       | 1894/6434 [4:26:07<11:03:59,  8.78s/it, gpt_loss=0.403, loss_mean=0.342][A
+Train step of epoch 0:  29%|██▉       | 1894/6434 [4:26:15<11:03:59,  8.78s/it, gpt_loss=0.259, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1895/6434 [4:26:15<10:48:59,  8.58s/it, gpt_loss=0.259, loss_mean=0.334][A
+Train step of epoch 0:  29%|██▉       | 1895/6434 [4:26:23<10:48:59,  8.58s/it, gpt_loss=0.375, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1896/6434 [4:26:23<10:27:52,  8.30s/it, gpt_loss=0.375, loss_mean=0.338][A
+Train step of epoch 0:  29%|██▉       | 1896/6434 [4:26:31<10:27:52,  8.30s/it, gpt_loss=0.358, loss_mean=0.34] [A
+Train step of epoch 0:  29%|██▉       | 1897/6434 [4:26:31<10:24:56,  8.26s/it, gpt_loss=0.358, loss_mean=0.34][A
+Train step of epoch 0:  29%|██▉       | 1897/6434 [4:26:39<10:24:56,  8.26s/it, gpt_loss=0.341, loss_mean=0.34][A
+Train step of epoch 0:  29%|██▉       | 1898/6434 [4:26:39<10:21:06,  8.22s/it, gpt_loss=0.341, loss_mean=0.34][A
+Train step of epoch 0:  29%|██▉       | 1898/6434 [4:26:48<10:21:06,  8.22s/it, gpt_loss=0.407, loss_mean=0.347][A
+Train step of epoch 0:  30%|██▉       | 1899/6434 [4:26:48<10:36:51,  8.43s/it, gpt_loss=0.407, loss_mean=0.347][A
+[LID Router Debug] Step: 1900
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [2, 3, 9, 9, 5, 4, 0, 0, 8, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  30%|██▉       | 1899/6434 [4:26:56<10:36:51,  8.43s/it, gpt_loss=0.383, loss_mean=0.35] [A
+Train step of epoch 0:  30%|██▉       | 1900/6434 [4:26:56<10:34:46,  8.40s/it, gpt_loss=0.383, loss_mean=0.35][A
+Train step of epoch 0:  30%|██▉       | 1900/6434 [4:27:05<10:34:46,  8.40s/it, gpt_loss=0.329, loss_mean=0.348][A
+Train step of epoch 0:  30%|██▉       | 1901/6434 [4:27:05<10:37:31,  8.44s/it, gpt_loss=0.329, loss_mean=0.348][A
+Train step of epoch 0:  30%|██▉       | 1901/6434 [4:27:13<10:37:31,  8.44s/it, gpt_loss=0.438, loss_mean=0.357][A
+Train step of epoch 0:  30%|██▉       | 1902/6434 [4:27:13<10:39:01,  8.46s/it, gpt_loss=0.438, loss_mean=0.357][A
+Train step of epoch 0:  30%|██▉       | 1902/6434 [4:27:22<10:39:01,  8.46s/it, gpt_loss=0.329, loss_mean=0.354][A
+Train step of epoch 0:  30%|██▉       | 1903/6434 [4:27:22<10:40:39,  8.48s/it, gpt_loss=0.329, loss_mean=0.354][A
+Train step of epoch 0:  30%|██▉       | 1903/6434 [4:27:32<10:40:39,  8.48s/it, gpt_loss=0.3, loss_mean=0.349]  [A
+Train step of epoch 0:  30%|██▉       | 1904/6434 [4:27:32<11:09:19,  8.87s/it, gpt_loss=0.3, loss_mean=0.349][A
+Train step of epoch 0:  30%|██▉       | 1904/6434 [4:27:40<11:09:19,  8.87s/it, gpt_loss=0.408, loss_mean=0.355][A
+Train step of epoch 0:  30%|██▉       | 1905/6434 [4:27:40<11:04:32,  8.80s/it, gpt_loss=0.408, loss_mean=0.355][A
+Train step of epoch 0:  30%|██▉       | 1905/6434 [4:27:49<11:04:32,  8.80s/it, gpt_loss=0.327, loss_mean=0.352][A
+Train step of epoch 0:  30%|██▉       | 1906/6434 [4:27:49<10:55:32,  8.69s/it, gpt_loss=0.327, loss_mean=0.352][A
+Train step of epoch 0:  30%|██▉       | 1906/6434 [4:27:56<10:55:32,  8.69s/it, gpt_loss=0.283, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1907/6434 [4:27:56<10:33:09,  8.39s/it, gpt_loss=0.283, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1907/6434 [4:28:05<10:33:09,  8.39s/it, gpt_loss=0.297, loss_mean=0.34] [A
+Train step of epoch 0:  30%|██▉       | 1908/6434 [4:28:05<10:34:27,  8.41s/it, gpt_loss=0.297, loss_mean=0.34][A
+Train step of epoch 0:  30%|██▉       | 1908/6434 [4:28:13<10:34:27,  8.41s/it, gpt_loss=0.365, loss_mean=0.343][A
+Train step of epoch 0:  30%|██▉       | 1909/6434 [4:28:13<10:29:38,  8.35s/it, gpt_loss=0.365, loss_mean=0.343][A
+[LID Router Debug] Step: 1910
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 2, 5, 1, 0, 2, 6, 0, 4, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  30%|██▉       | 1909/6434 [4:28:20<10:29:38,  8.35s/it, gpt_loss=0.28, loss_mean=0.337] [A
+Train step of epoch 0:  30%|██▉       | 1910/6434 [4:28:20<10:02:33,  7.99s/it, gpt_loss=0.28, loss_mean=0.337][A
+Train step of epoch 0:  30%|██▉       | 1910/6434 [4:28:27<10:02:33,  7.99s/it, gpt_loss=0.291, loss_mean=0.332][A
+Train step of epoch 0:  30%|██▉       | 1911/6434 [4:28:27<9:42:26,  7.73s/it, gpt_loss=0.291, loss_mean=0.332] [A
+Train step of epoch 0:  30%|██▉       | 1911/6434 [4:28:35<9:42:26,  7.73s/it, gpt_loss=0.407, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1912/6434 [4:28:35<9:46:10,  7.78s/it, gpt_loss=0.407, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1912/6434 [4:28:43<9:46:10,  7.78s/it, gpt_loss=0.276, loss_mean=0.333][A
+Train step of epoch 0:  30%|██▉       | 1913/6434 [4:28:43<9:46:40,  7.79s/it, gpt_loss=0.276, loss_mean=0.333][A
+Train step of epoch 0:  30%|██▉       | 1913/6434 [4:28:52<9:46:40,  7.79s/it, gpt_loss=0.362, loss_mean=0.336][A
+Train step of epoch 0:  30%|██▉       | 1914/6434 [4:28:52<10:16:01,  8.18s/it, gpt_loss=0.362, loss_mean=0.336][A
+Train step of epoch 0:  30%|██▉       | 1914/6434 [4:29:00<10:16:01,  8.18s/it, gpt_loss=0.475, loss_mean=0.35] [A
+Train step of epoch 0:  30%|██▉       | 1915/6434 [4:29:00<10:11:07,  8.11s/it, gpt_loss=0.475, loss_mean=0.35][A
+Train step of epoch 0:  30%|██▉       | 1915/6434 [4:29:09<10:11:07,  8.11s/it, gpt_loss=0.322, loss_mean=0.347][A
+Train step of epoch 0:  30%|██▉       | 1916/6434 [4:29:09<10:22:28,  8.27s/it, gpt_loss=0.322, loss_mean=0.347][A
+Train step of epoch 0:  30%|██▉       | 1916/6434 [4:29:17<10:22:28,  8.27s/it, gpt_loss=0.293, loss_mean=0.342][A
+Train step of epoch 0:  30%|██▉       | 1917/6434 [4:29:17<10:33:45,  8.42s/it, gpt_loss=0.293, loss_mean=0.342][A
+Train step of epoch 0:  30%|██▉       | 1917/6434 [4:29:29<10:33:45,  8.42s/it, gpt_loss=0.374, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1918/6434 [4:29:29<11:33:07,  9.21s/it, gpt_loss=0.374, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1918/6434 [4:29:37<11:33:07,  9.21s/it, gpt_loss=0.356, loss_mean=0.346][A
+Train step of epoch 0:  30%|██▉       | 1919/6434 [4:29:37<11:05:54,  8.85s/it, gpt_loss=0.356, loss_mean=0.346][A
+[LID Router Debug] Step: 1920
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [2, 6, 1, 6, 3, 1, 0, 3, 4, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  30%|██▉       | 1919/6434 [4:29:45<11:05:54,  8.85s/it, gpt_loss=0.293, loss_mean=0.341][A
+Train step of epoch 0:  30%|██▉       | 1920/6434 [4:29:45<10:56:33,  8.73s/it, gpt_loss=0.293, loss_mean=0.341][A
+Train step of epoch 0:  30%|██▉       | 1920/6434 [4:29:53<10:56:33,  8.73s/it, gpt_loss=0.367, loss_mean=0.343][A
+Train step of epoch 0:  30%|██▉       | 1921/6434 [4:29:53<10:44:01,  8.56s/it, gpt_loss=0.367, loss_mean=0.343][A
+Train step of epoch 0:  30%|██▉       | 1921/6434 [4:30:02<10:44:01,  8.56s/it, gpt_loss=0.299, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1922/6434 [4:30:02<10:41:00,  8.52s/it, gpt_loss=0.299, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1922/6434 [4:30:09<10:41:00,  8.52s/it, gpt_loss=0.443, loss_mean=0.349][A
+Train step of epoch 0:  30%|██▉       | 1923/6434 [4:30:09<10:24:22,  8.30s/it, gpt_loss=0.443, loss_mean=0.349][A
+Train step of epoch 0:  30%|██▉       | 1923/6434 [4:30:17<10:24:22,  8.30s/it, gpt_loss=0.403, loss_mean=0.355][A
+Train step of epoch 0:  30%|██▉       | 1924/6434 [4:30:17<10:06:09,  8.06s/it, gpt_loss=0.403, loss_mean=0.355][A
+Train step of epoch 0:  30%|██▉       | 1924/6434 [4:30:25<10:06:09,  8.06s/it, gpt_loss=0.376, loss_mean=0.357][A
+Train step of epoch 0:  30%|██▉       | 1925/6434 [4:30:25<10:17:46,  8.22s/it, gpt_loss=0.376, loss_mean=0.357][A
+Train step of epoch 0:  30%|██▉       | 1925/6434 [4:30:34<10:17:46,  8.22s/it, gpt_loss=0.277, loss_mean=0.349][A
+Train step of epoch 0:  30%|██▉       | 1926/6434 [4:30:34<10:29:10,  8.37s/it, gpt_loss=0.277, loss_mean=0.349][A
+Train step of epoch 0:  30%|██▉       | 1926/6434 [4:30:43<10:29:10,  8.37s/it, gpt_loss=0.321, loss_mean=0.346][A
+Train step of epoch 0:  30%|██▉       | 1927/6434 [4:30:43<10:28:51,  8.37s/it, gpt_loss=0.321, loss_mean=0.346][A
+Train step of epoch 0:  30%|██▉       | 1927/6434 [4:30:50<10:28:51,  8.37s/it, gpt_loss=0.334, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1928/6434 [4:30:50<10:14:52,  8.19s/it, gpt_loss=0.334, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1928/6434 [4:30:59<10:14:52,  8.19s/it, gpt_loss=0.344, loss_mean=0.345][A
+Train step of epoch 0:  30%|██▉       | 1929/6434 [4:30:59<10:25:31,  8.33s/it, gpt_loss=0.344, loss_mean=0.345][A
+[LID Router Debug] Step: 1930
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [4, 3, 5, 0, 1, 5, 2, 2, 4, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  30%|██▉       | 1929/6434 [4:31:08<10:25:31,  8.33s/it, gpt_loss=0.288, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1930/6434 [4:31:08<10:50:49,  8.67s/it, gpt_loss=0.288, loss_mean=0.339][A
+Train step of epoch 0:  30%|██▉       | 1930/6434 [4:31:16<10:50:49,  8.67s/it, gpt_loss=0.31, loss_mean=0.336] [A
+Train step of epoch 0:  30%|███       | 1931/6434 [4:31:16<10:31:16,  8.41s/it, gpt_loss=0.31, loss_mean=0.336][A
+Train step of epoch 0:  30%|███       | 1931/6434 [4:31:23<10:31:16,  8.41s/it, gpt_loss=0.353, loss_mean=0.338][A
+Train step of epoch 0:  30%|███       | 1932/6434 [4:31:23<10:02:20,  8.03s/it, gpt_loss=0.353, loss_mean=0.338][A
+Train step of epoch 0:  30%|███       | 1932/6434 [4:31:32<10:02:20,  8.03s/it, gpt_loss=0.289, loss_mean=0.333][A
+Train step of epoch 0:  30%|███       | 1933/6434 [4:31:32<10:14:59,  8.20s/it, gpt_loss=0.289, loss_mean=0.333][A
+Train step of epoch 0:  30%|███       | 1933/6434 [4:31:40<10:14:59,  8.20s/it, gpt_loss=0.319, loss_mean=0.331][A
+Train step of epoch 0:  30%|███       | 1934/6434 [4:31:40<10:11:28,  8.15s/it, gpt_loss=0.319, loss_mean=0.331][A
+Train step of epoch 0:  30%|███       | 1934/6434 [4:31:49<10:11:28,  8.15s/it, gpt_loss=0.362, loss_mean=0.335][A
+Train step of epoch 0:  30%|███       | 1935/6434 [4:31:49<10:24:37,  8.33s/it, gpt_loss=0.362, loss_mean=0.335][A
+Train step of epoch 0:  30%|███       | 1935/6434 [4:31:56<10:24:37,  8.33s/it, gpt_loss=0.25, loss_mean=0.326] [A
+Train step of epoch 0:  30%|███       | 1936/6434 [4:31:56<10:10:11,  8.14s/it, gpt_loss=0.25, loss_mean=0.326][A
+Train step of epoch 0:  30%|███       | 1936/6434 [4:32:04<10:10:11,  8.14s/it, gpt_loss=0.255, loss_mean=0.319][A
+Train step of epoch 0:  30%|███       | 1937/6434 [4:32:04<10:07:06,  8.10s/it, gpt_loss=0.255, loss_mean=0.319][A
+Train step of epoch 0:  30%|███       | 1937/6434 [4:32:14<10:07:06,  8.10s/it, gpt_loss=0.325, loss_mean=0.32] [A
+Train step of epoch 0:  30%|███       | 1938/6434 [4:32:14<10:32:25,  8.44s/it, gpt_loss=0.325, loss_mean=0.32][A
+Train step of epoch 0:  30%|███       | 1938/6434 [4:32:22<10:32:25,  8.44s/it, gpt_loss=0.323, loss_mean=0.32][A
+Train step of epoch 0:  30%|███       | 1939/6434 [4:32:22<10:31:34,  8.43s/it, gpt_loss=0.323, loss_mean=0.32][A
+[LID Router Debug] Step: 1940
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [6, 2, 4, 4, 4, 9, 9, 5, 2, 2]
+Active Experts in Batch: {2, 4, 5, 6, 9}
+
+Train step of epoch 0:  30%|███       | 1939/6434 [4:32:31<10:31:34,  8.43s/it, gpt_loss=0.417, loss_mean=0.33][A
+Train step of epoch 0:  30%|███       | 1940/6434 [4:32:31<10:30:38,  8.42s/it, gpt_loss=0.417, loss_mean=0.33][A
+Train step of epoch 0:  30%|███       | 1940/6434 [4:32:38<10:30:38,  8.42s/it, gpt_loss=0.455, loss_mean=0.342][A
+Train step of epoch 0:  30%|███       | 1941/6434 [4:32:38<10:17:30,  8.25s/it, gpt_loss=0.455, loss_mean=0.342][A
+Train step of epoch 0:  30%|███       | 1941/6434 [4:32:46<10:17:30,  8.25s/it, gpt_loss=0.364, loss_mean=0.344][A
+Train step of epoch 0:  30%|███       | 1942/6434 [4:32:46<9:53:54,  7.93s/it, gpt_loss=0.364, loss_mean=0.344] [A
+Train step of epoch 0:  30%|███       | 1942/6434 [4:32:54<9:53:54,  7.93s/it, gpt_loss=0.351, loss_mean=0.345][A
+Train step of epoch 0:  30%|███       | 1943/6434 [4:32:54<10:06:27,  8.10s/it, gpt_loss=0.351, loss_mean=0.345][A
+Train step of epoch 0:  30%|███       | 1943/6434 [4:33:02<10:06:27,  8.10s/it, gpt_loss=0.312, loss_mean=0.342][A
+Train step of epoch 0:  30%|███       | 1944/6434 [4:33:02<10:07:38,  8.12s/it, gpt_loss=0.312, loss_mean=0.342][A
+Train step of epoch 0:  30%|███       | 1944/6434 [4:33:10<10:07:38,  8.12s/it, gpt_loss=0.384, loss_mean=0.346][A
+Train step of epoch 0:  30%|███       | 1945/6434 [4:33:10<10:09:11,  8.14s/it, gpt_loss=0.384, loss_mean=0.346][A
+Train step of epoch 0:  30%|███       | 1945/6434 [4:33:19<10:09:11,  8.14s/it, gpt_loss=0.357, loss_mean=0.347][A
+Train step of epoch 0:  30%|███       | 1946/6434 [4:33:19<10:19:21,  8.28s/it, gpt_loss=0.357, loss_mean=0.347][A
+Train step of epoch 0:  30%|███       | 1946/6434 [4:33:28<10:19:21,  8.28s/it, gpt_loss=0.436, loss_mean=0.356][A
+Train step of epoch 0:  30%|███       | 1947/6434 [4:33:28<10:24:38,  8.35s/it, gpt_loss=0.436, loss_mean=0.356][A
+Train step of epoch 0:  30%|███       | 1947/6434 [4:33:36<10:24:38,  8.35s/it, gpt_loss=0.275, loss_mean=0.348][A
+Train step of epoch 0:  30%|███       | 1948/6434 [4:33:36<10:20:31,  8.30s/it, gpt_loss=0.275, loss_mean=0.348][A
+Train step of epoch 0:  30%|███       | 1948/6434 [4:33:45<10:20:31,  8.30s/it, gpt_loss=0.329, loss_mean=0.346][A
+Train step of epoch 0:  30%|███       | 1949/6434 [4:33:45<10:35:53,  8.51s/it, gpt_loss=0.329, loss_mean=0.346][A
+[LID Router Debug] Step: 1950
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [6, 6, 9, 9, 2, 0, 9, 1, 4, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  30%|███       | 1949/6434 [4:33:54<10:35:53,  8.51s/it, gpt_loss=0.314, loss_mean=0.343][A
+Train step of epoch 0:  30%|███       | 1950/6434 [4:33:54<10:43:02,  8.60s/it, gpt_loss=0.314, loss_mean=0.343][A
+Train step of epoch 0:  30%|███       | 1950/6434 [4:34:03<10:43:02,  8.60s/it, gpt_loss=0.407, loss_mean=0.349][A
+Train step of epoch 0:  30%|███       | 1951/6434 [4:34:03<11:06:28,  8.92s/it, gpt_loss=0.407, loss_mean=0.349][A
+Train step of epoch 0:  30%|███       | 1951/6434 [4:34:12<11:06:28,  8.92s/it, gpt_loss=0.33, loss_mean=0.347] [A
+Train step of epoch 0:  30%|███       | 1952/6434 [4:34:12<11:02:39,  8.87s/it, gpt_loss=0.33, loss_mean=0.347][A
+Train step of epoch 0:  30%|███       | 1952/6434 [4:34:21<11:02:39,  8.87s/it, gpt_loss=0.388, loss_mean=0.351][A
+Train step of epoch 0:  30%|███       | 1953/6434 [4:34:21<11:06:39,  8.93s/it, gpt_loss=0.388, loss_mean=0.351][A
+Train step of epoch 0:  30%|███       | 1953/6434 [4:34:29<11:06:39,  8.93s/it, gpt_loss=0.309, loss_mean=0.347][A
+Train step of epoch 0:  30%|███       | 1954/6434 [4:34:29<10:42:52,  8.61s/it, gpt_loss=0.309, loss_mean=0.347][A
+Train step of epoch 0:  30%|███       | 1954/6434 [4:34:37<10:42:52,  8.61s/it, gpt_loss=0.353, loss_mean=0.348][A
+Train step of epoch 0:  30%|███       | 1955/6434 [4:34:37<10:29:07,  8.43s/it, gpt_loss=0.353, loss_mean=0.348][A
+Train step of epoch 0:  30%|███       | 1955/6434 [4:34:46<10:29:07,  8.43s/it, gpt_loss=0.368, loss_mean=0.35] [A
+Train step of epoch 0:  30%|███       | 1956/6434 [4:34:46<10:50:58,  8.72s/it, gpt_loss=0.368, loss_mean=0.35][A
+Train step of epoch 0:  30%|███       | 1956/6434 [4:34:54<10:50:58,  8.72s/it, gpt_loss=0.396, loss_mean=0.354][A
+Train step of epoch 0:  30%|███       | 1957/6434 [4:34:54<10:30:12,  8.45s/it, gpt_loss=0.396, loss_mean=0.354][A
+Train step of epoch 0:  30%|███       | 1957/6434 [4:35:02<10:30:12,  8.45s/it, gpt_loss=0.387, loss_mean=0.358][A
+Train step of epoch 0:  30%|███       | 1958/6434 [4:35:02<10:28:07,  8.42s/it, gpt_loss=0.387, loss_mean=0.358][A
+Train step of epoch 0:  30%|███       | 1958/6434 [4:35:11<10:28:07,  8.42s/it, gpt_loss=0.308, loss_mean=0.353][A
+Train step of epoch 0:  30%|███       | 1959/6434 [4:35:11<10:30:52,  8.46s/it, gpt_loss=0.308, loss_mean=0.353][A
+[LID Router Debug] Step: 1960
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [2, 4, 2, 9, 1, 2, 5, 0, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  30%|███       | 1959/6434 [4:35:19<10:30:52,  8.46s/it, gpt_loss=0.386, loss_mean=0.356][A
+Train step of epoch 0:  30%|███       | 1960/6434 [4:35:19<10:22:57,  8.35s/it, gpt_loss=0.386, loss_mean=0.356][A
+Train step of epoch 0:  30%|███       | 1960/6434 [4:35:27<10:22:57,  8.35s/it, gpt_loss=0.455, loss_mean=0.366][A
+Train step of epoch 0:  30%|███       | 1961/6434 [4:35:27<10:12:33,  8.22s/it, gpt_loss=0.455, loss_mean=0.366][A
+Train step of epoch 0:  30%|███       | 1961/6434 [4:35:36<10:12:33,  8.22s/it, gpt_loss=0.352, loss_mean=0.364][A
+Train step of epoch 0:  30%|███       | 1962/6434 [4:35:36<10:25:32,  8.39s/it, gpt_loss=0.352, loss_mean=0.364][A
+Train step of epoch 0:  30%|███       | 1962/6434 [4:35:45<10:25:32,  8.39s/it, gpt_loss=0.291, loss_mean=0.357][A
+Train step of epoch 0:  31%|███       | 1963/6434 [4:35:45<10:36:50,  8.55s/it, gpt_loss=0.291, loss_mean=0.357][A
+Train step of epoch 0:  31%|███       | 1963/6434 [4:35:54<10:36:50,  8.55s/it, gpt_loss=0.297, loss_mean=0.351][A
+Train step of epoch 0:  31%|███       | 1964/6434 [4:35:54<10:45:39,  8.67s/it, gpt_loss=0.297, loss_mean=0.351][A
+Train step of epoch 0:  31%|███       | 1964/6434 [4:36:03<10:45:39,  8.67s/it, gpt_loss=0.37, loss_mean=0.353] [A
+Train step of epoch 0:  31%|███       | 1965/6434 [4:36:03<10:50:14,  8.73s/it, gpt_loss=0.37, loss_mean=0.353][A
+Train step of epoch 0:  31%|███       | 1965/6434 [4:36:12<10:50:14,  8.73s/it, gpt_loss=0.497, loss_mean=0.367][A
+Train step of epoch 0:  31%|███       | 1966/6434 [4:36:12<11:10:00,  9.00s/it, gpt_loss=0.497, loss_mean=0.367][A
+Train step of epoch 0:  31%|███       | 1966/6434 [4:36:20<11:10:00,  9.00s/it, gpt_loss=0.357, loss_mean=0.366][A
+Train step of epoch 0:  31%|███       | 1967/6434 [4:36:20<10:46:36,  8.69s/it, gpt_loss=0.357, loss_mean=0.366][A
+Train step of epoch 0:  31%|███       | 1967/6434 [4:36:28<10:46:36,  8.69s/it, gpt_loss=0.281, loss_mean=0.358][A
+Train step of epoch 0:  31%|███       | 1968/6434 [4:36:28<10:34:56,  8.53s/it, gpt_loss=0.281, loss_mean=0.358][A
+Train step of epoch 0:  31%|███       | 1968/6434 [4:36:36<10:34:56,  8.53s/it, gpt_loss=0.336, loss_mean=0.356][A
+Train step of epoch 0:  31%|███       | 1969/6434 [4:36:36<10:24:36,  8.39s/it, gpt_loss=0.336, loss_mean=0.356][A
+[LID Router Debug] Step: 1970
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [2, 2, 6, 1, 1, 2, 9, 9, 2, 1]
+Active Experts in Batch: {1, 2, 6, 9}
+
+Train step of epoch 0:  31%|███       | 1969/6434 [4:36:45<10:24:36,  8.39s/it, gpt_loss=0.362, loss_mean=0.356][A
+Train step of epoch 0:  31%|███       | 1970/6434 [4:36:45<10:37:15,  8.57s/it, gpt_loss=0.362, loss_mean=0.356][A
+Train step of epoch 0:  31%|███       | 1970/6434 [4:36:53<10:37:15,  8.57s/it, gpt_loss=0.291, loss_mean=0.35] [A
+Train step of epoch 0:  31%|███       | 1971/6434 [4:36:53<10:28:05,  8.44s/it, gpt_loss=0.291, loss_mean=0.35][A
+Train step of epoch 0:  31%|███       | 1971/6434 [4:37:02<10:28:05,  8.44s/it, gpt_loss=0.328, loss_mean=0.348][A
+Train step of epoch 0:  31%|███       | 1972/6434 [4:37:02<10:39:23,  8.60s/it, gpt_loss=0.328, loss_mean=0.348][A
+Train step of epoch 0:  31%|███       | 1972/6434 [4:37:11<10:39:23,  8.60s/it, gpt_loss=0.249, loss_mean=0.338][A
+Train step of epoch 0:  31%|███       | 1973/6434 [4:37:11<10:39:22,  8.60s/it, gpt_loss=0.249, loss_mean=0.338][A
+Train step of epoch 0:  31%|███       | 1973/6434 [4:37:20<10:39:22,  8.60s/it, gpt_loss=0.391, loss_mean=0.343][A
+Train step of epoch 0:  31%|███       | 1974/6434 [4:37:20<10:39:43,  8.61s/it, gpt_loss=0.391, loss_mean=0.343][A
+Train step of epoch 0:  31%|███       | 1974/6434 [4:37:28<10:39:43,  8.61s/it, gpt_loss=0.299, loss_mean=0.339][A
+Train step of epoch 0:  31%|███       | 1975/6434 [4:37:28<10:32:11,  8.51s/it, gpt_loss=0.299, loss_mean=0.339][A
+Train step of epoch 0:  31%|███       | 1975/6434 [4:37:36<10:32:11,  8.51s/it, gpt_loss=0.382, loss_mean=0.343][A
+Train step of epoch 0:  31%|███       | 1976/6434 [4:37:36<10:31:16,  8.50s/it, gpt_loss=0.382, loss_mean=0.343][A
+Train step of epoch 0:  31%|███       | 1976/6434 [4:37:44<10:31:16,  8.50s/it, gpt_loss=0.249, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 1977/6434 [4:37:44<10:18:03,  8.32s/it, gpt_loss=0.249, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 1977/6434 [4:37:52<10:18:03,  8.32s/it, gpt_loss=0.34, loss_mean=0.334] [A
+Train step of epoch 0:  31%|███       | 1978/6434 [4:37:52<10:12:00,  8.24s/it, gpt_loss=0.34, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1978/6434 [4:38:02<10:12:00,  8.24s/it, gpt_loss=0.33, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1979/6434 [4:38:02<10:36:37,  8.57s/it, gpt_loss=0.33, loss_mean=0.334][A
+[LID Router Debug] Step: 1980
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [5, 4, 4, 1, 1, 6, 9, 4, 2, 9]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  31%|███       | 1979/6434 [4:38:11<10:36:37,  8.57s/it, gpt_loss=0.339, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1980/6434 [4:38:11<10:51:18,  8.77s/it, gpt_loss=0.339, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1980/6434 [4:38:19<10:51:18,  8.77s/it, gpt_loss=0.331, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1981/6434 [4:38:19<10:36:18,  8.57s/it, gpt_loss=0.331, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1981/6434 [4:38:28<10:36:18,  8.57s/it, gpt_loss=0.312, loss_mean=0.332][A
+Train step of epoch 0:  31%|███       | 1982/6434 [4:38:28<10:34:21,  8.55s/it, gpt_loss=0.312, loss_mean=0.332][A
+Train step of epoch 0:  31%|███       | 1982/6434 [4:38:35<10:34:21,  8.55s/it, gpt_loss=0.279, loss_mean=0.327][A
+Train step of epoch 0:  31%|███       | 1983/6434 [4:38:35<10:16:10,  8.31s/it, gpt_loss=0.279, loss_mean=0.327][A
+Train step of epoch 0:  31%|███       | 1983/6434 [4:38:44<10:16:10,  8.31s/it, gpt_loss=0.349, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 1984/6434 [4:38:44<10:23:20,  8.40s/it, gpt_loss=0.349, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 1984/6434 [4:38:52<10:23:20,  8.40s/it, gpt_loss=0.349, loss_mean=0.331][A
+Train step of epoch 0:  31%|███       | 1985/6434 [4:38:52<10:18:12,  8.34s/it, gpt_loss=0.349, loss_mean=0.331][A
+Train step of epoch 0:  31%|███       | 1985/6434 [4:39:01<10:18:12,  8.34s/it, gpt_loss=0.397, loss_mean=0.337][A
+Train step of epoch 0:  31%|███       | 1986/6434 [4:39:01<10:21:42,  8.39s/it, gpt_loss=0.397, loss_mean=0.337][A
+Train step of epoch 0:  31%|███       | 1986/6434 [4:39:10<10:21:42,  8.39s/it, gpt_loss=0.258, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 1987/6434 [4:39:10<10:36:12,  8.58s/it, gpt_loss=0.258, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 1987/6434 [4:39:18<10:36:12,  8.58s/it, gpt_loss=0.306, loss_mean=0.327][A
+Train step of epoch 0:  31%|███       | 1988/6434 [4:39:18<10:37:14,  8.60s/it, gpt_loss=0.306, loss_mean=0.327][A
+Train step of epoch 0:  31%|███       | 1988/6434 [4:39:27<10:37:14,  8.60s/it, gpt_loss=0.384, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 1989/6434 [4:39:27<10:37:21,  8.60s/it, gpt_loss=0.384, loss_mean=0.333][A
+[LID Router Debug] Step: 1990
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [3, 1, 5, 5, 3, 9, 1, 4, 9, 11]
+Active Experts in Batch: {1, 3, 4, 5, 9, 11}
+
+Train step of epoch 0:  31%|███       | 1989/6434 [4:39:36<10:37:21,  8.60s/it, gpt_loss=0.361, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 1990/6434 [4:39:36<10:47:02,  8.74s/it, gpt_loss=0.361, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 1990/6434 [4:39:46<10:47:02,  8.74s/it, gpt_loss=0.348, loss_mean=0.337][A
+Train step of epoch 0:  31%|███       | 1991/6434 [4:39:46<11:04:47,  8.98s/it, gpt_loss=0.348, loss_mean=0.337][A
+Train step of epoch 0:  31%|███       | 1991/6434 [4:39:55<11:04:47,  8.98s/it, gpt_loss=0.314, loss_mean=0.335][A
+Train step of epoch 0:  31%|███       | 1992/6434 [4:39:55<11:07:51,  9.02s/it, gpt_loss=0.314, loss_mean=0.335][A
+Train step of epoch 0:  31%|███       | 1992/6434 [4:40:03<11:07:51,  9.02s/it, gpt_loss=0.242, loss_mean=0.325][A
+Train step of epoch 0:  31%|███       | 1993/6434 [4:40:03<10:54:27,  8.84s/it, gpt_loss=0.242, loss_mean=0.325][A
+Train step of epoch 0:  31%|███       | 1993/6434 [4:40:12<10:54:27,  8.84s/it, gpt_loss=0.307, loss_mean=0.323][A
+Train step of epoch 0:  31%|███       | 1994/6434 [4:40:12<10:59:43,  8.92s/it, gpt_loss=0.307, loss_mean=0.323][A
+Train step of epoch 0:  31%|███       | 1994/6434 [4:40:21<10:59:43,  8.92s/it, gpt_loss=0.33, loss_mean=0.324] [A
+Train step of epoch 0:  31%|███       | 1995/6434 [4:40:21<10:55:04,  8.85s/it, gpt_loss=0.33, loss_mean=0.324][A
+Train step of epoch 0:  31%|███       | 1995/6434 [4:40:30<10:55:04,  8.85s/it, gpt_loss=0.326, loss_mean=0.324][A
+Train step of epoch 0:  31%|███       | 1996/6434 [4:40:30<11:10:01,  9.06s/it, gpt_loss=0.326, loss_mean=0.324][A
+Train step of epoch 0:  31%|███       | 1996/6434 [4:40:39<11:10:01,  9.06s/it, gpt_loss=0.411, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 1997/6434 [4:40:39<11:02:28,  8.96s/it, gpt_loss=0.411, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 1997/6434 [4:40:47<11:02:28,  8.96s/it, gpt_loss=0.344, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1998/6434 [4:40:47<10:41:37,  8.68s/it, gpt_loss=0.344, loss_mean=0.334][A
+Train step of epoch 0:  31%|███       | 1998/6434 [4:40:55<10:41:37,  8.68s/it, gpt_loss=0.381, loss_mean=0.339][A
+Train step of epoch 0:  31%|███       | 1999/6434 [4:40:55<10:22:50,  8.43s/it, gpt_loss=0.381, loss_mean=0.339][A
+[LID Router Debug] Step: 2000
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [1, 6, 9, 0, 1, 9, 4, 6, 5, 0]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+[2026-02-06 20:37:08,215] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[1.9562395712791304e-05, 1.9562395712791304e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 20:37:08,215] [INFO] [timer.py:260:stop] epoch=0/micro_step=2000/global_step=1000, RunningAvgSamplesPerSec=4.7559824985822585, CurrSamplesPerSec=4.827019869043489, MemAllocated=12.57GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  31%|███       | 1999/6434 [4:41:04<10:22:50,  8.43s/it, gpt_loss=0.331, loss_mean=0.338][A
+Train step of epoch 0:  31%|███       | 2000/6434 [4:41:04<10:30:12,  8.53s/it, gpt_loss=0.331, loss_mean=0.338][A
+Train step of epoch 0:  31%|███       | 2000/6434 [4:41:13<10:30:12,  8.53s/it, gpt_loss=0.361, loss_mean=0.34] [A
+Train step of epoch 0:  31%|███       | 2001/6434 [4:41:13<10:38:50,  8.65s/it, gpt_loss=0.361, loss_mean=0.34][A
+Train step of epoch 0:  31%|███       | 2001/6434 [4:41:21<10:38:50,  8.65s/it, gpt_loss=0.293, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 2002/6434 [4:41:21<10:25:08,  8.46s/it, gpt_loss=0.293, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 2002/6434 [4:41:28<10:25:08,  8.46s/it, gpt_loss=0.377, loss_mean=0.34] [A
+Train step of epoch 0:  31%|███       | 2003/6434 [4:41:28<10:09:22,  8.25s/it, gpt_loss=0.377, loss_mean=0.34][A
+Train step of epoch 0:  31%|███       | 2003/6434 [4:41:36<10:09:22,  8.25s/it, gpt_loss=0.29, loss_mean=0.335][A
+Train step of epoch 0:  31%|███       | 2004/6434 [4:41:36<9:58:35,  8.11s/it, gpt_loss=0.29, loss_mean=0.335] [A
+Train step of epoch 0:  31%|███       | 2004/6434 [4:41:44<9:58:35,  8.11s/it, gpt_loss=0.387, loss_mean=0.34][A
+Train step of epoch 0:  31%|███       | 2005/6434 [4:41:44<9:55:30,  8.07s/it, gpt_loss=0.387, loss_mean=0.34][A
+Train step of epoch 0:  31%|███       | 2005/6434 [4:41:52<9:55:30,  8.07s/it, gpt_loss=0.257, loss_mean=0.332][A
+Train step of epoch 0:  31%|███       | 2006/6434 [4:41:52<9:55:18,  8.07s/it, gpt_loss=0.257, loss_mean=0.332][A
+Train step of epoch 0:  31%|███       | 2006/6434 [4:42:01<9:55:18,  8.07s/it, gpt_loss=0.303, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 2007/6434 [4:42:01<10:14:49,  8.33s/it, gpt_loss=0.303, loss_mean=0.329][A
+Train step of epoch 0:  31%|███       | 2007/6434 [4:42:09<10:14:49,  8.33s/it, gpt_loss=0.398, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 2008/6434 [4:42:09<10:10:45,  8.28s/it, gpt_loss=0.398, loss_mean=0.336][A
+Train step of epoch 0:  31%|███       | 2008/6434 [4:42:17<10:10:45,  8.28s/it, gpt_loss=0.347, loss_mean=0.337][A
+Train step of epoch 0:  31%|███       | 2009/6434 [4:42:17<9:48:34,  7.98s/it, gpt_loss=0.347, loss_mean=0.337] [A
+[LID Router Debug] Step: 2010
+Batch Size: 10
+Audio Batch Size: 137
+LID Assignments: [1, 0, 2, 0, 0, 1, 9, 2, 2, 2]
+Active Experts in Batch: {0, 1, 2, 9}
+
+Train step of epoch 0:  31%|███       | 2009/6434 [4:42:26<9:48:34,  7.98s/it, gpt_loss=0.298, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 2010/6434 [4:42:26<10:12:24,  8.31s/it, gpt_loss=0.298, loss_mean=0.333][A
+Train step of epoch 0:  31%|███       | 2010/6434 [4:42:35<10:12:24,  8.31s/it, gpt_loss=0.346, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2011/6434 [4:42:35<10:35:45,  8.62s/it, gpt_loss=0.346, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2011/6434 [4:42:42<10:35:45,  8.62s/it, gpt_loss=0.344, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2012/6434 [4:42:42<10:08:06,  8.25s/it, gpt_loss=0.344, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2012/6434 [4:42:51<10:08:06,  8.25s/it, gpt_loss=0.318, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2013/6434 [4:42:51<10:06:56,  8.24s/it, gpt_loss=0.318, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2013/6434 [4:42:59<10:06:56,  8.24s/it, gpt_loss=0.428, loss_mean=0.343][A
+Train step of epoch 0:  31%|███▏      | 2014/6434 [4:42:59<10:09:42,  8.28s/it, gpt_loss=0.428, loss_mean=0.343][A
+Train step of epoch 0:  31%|███▏      | 2014/6434 [4:43:07<10:09:42,  8.28s/it, gpt_loss=0.289, loss_mean=0.338][A
+Train step of epoch 0:  31%|███▏      | 2015/6434 [4:43:07<10:01:48,  8.17s/it, gpt_loss=0.289, loss_mean=0.338][A
+Train step of epoch 0:  31%|███▏      | 2015/6434 [4:43:17<10:01:48,  8.17s/it, gpt_loss=0.306, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2016/6434 [4:43:17<10:39:12,  8.68s/it, gpt_loss=0.306, loss_mean=0.334][A
+Train step of epoch 0:  31%|███▏      | 2016/6434 [4:43:26<10:39:12,  8.68s/it, gpt_loss=0.336, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2017/6434 [4:43:26<10:43:28,  8.74s/it, gpt_loss=0.336, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2017/6434 [4:43:34<10:43:28,  8.74s/it, gpt_loss=0.339, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2018/6434 [4:43:34<10:34:42,  8.62s/it, gpt_loss=0.339, loss_mean=0.335][A
+Train step of epoch 0:  31%|███▏      | 2018/6434 [4:43:43<10:34:42,  8.62s/it, gpt_loss=0.404, loss_mean=0.342][A
+Train step of epoch 0:  31%|███▏      | 2019/6434 [4:43:43<10:30:49,  8.57s/it, gpt_loss=0.404, loss_mean=0.342][A
+[LID Router Debug] Step: 2020
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [5, 5, 1, 3, 1, 0, 8, 5, 5, 5]
+Active Experts in Batch: {0, 1, 3, 5, 8}
+
+Train step of epoch 0:  31%|███▏      | 2019/6434 [4:43:51<10:30:49,  8.57s/it, gpt_loss=0.337, loss_mean=0.341][A
+Train step of epoch 0:  31%|███▏      | 2020/6434 [4:43:51<10:24:22,  8.49s/it, gpt_loss=0.337, loss_mean=0.341][A
+Train step of epoch 0:  31%|███▏      | 2020/6434 [4:44:00<10:24:22,  8.49s/it, gpt_loss=0.306, loss_mean=0.338][A
+Train step of epoch 0:  31%|███▏      | 2021/6434 [4:44:00<10:37:09,  8.66s/it, gpt_loss=0.306, loss_mean=0.338][A
+Train step of epoch 0:  31%|███▏      | 2021/6434 [4:44:07<10:37:09,  8.66s/it, gpt_loss=0.355, loss_mean=0.34] [A
+Train step of epoch 0:  31%|███▏      | 2022/6434 [4:44:07<10:01:47,  8.18s/it, gpt_loss=0.355, loss_mean=0.34][A
+Train step of epoch 0:  31%|███▏      | 2022/6434 [4:44:16<10:01:47,  8.18s/it, gpt_loss=0.41, loss_mean=0.347][A
+Train step of epoch 0:  31%|███▏      | 2023/6434 [4:44:16<10:27:29,  8.54s/it, gpt_loss=0.41, loss_mean=0.347][A
+Train step of epoch 0:  31%|███▏      | 2023/6434 [4:44:26<10:27:29,  8.54s/it, gpt_loss=0.308, loss_mean=0.343][A
+Train step of epoch 0:  31%|███▏      | 2024/6434 [4:44:26<10:46:54,  8.80s/it, gpt_loss=0.308, loss_mean=0.343][A
+Train step of epoch 0:  31%|███▏      | 2024/6434 [4:44:34<10:46:54,  8.80s/it, gpt_loss=0.286, loss_mean=0.337][A
+Train step of epoch 0:  31%|███▏      | 2025/6434 [4:44:34<10:34:01,  8.63s/it, gpt_loss=0.286, loss_mean=0.337][A
+Train step of epoch 0:  31%|███▏      | 2025/6434 [4:44:42<10:34:01,  8.63s/it, gpt_loss=0.381, loss_mean=0.341][A
+Train step of epoch 0:  31%|███▏      | 2026/6434 [4:44:42<10:18:08,  8.41s/it, gpt_loss=0.381, loss_mean=0.341][A
+Train step of epoch 0:  31%|███▏      | 2026/6434 [4:44:50<10:18:08,  8.41s/it, gpt_loss=0.344, loss_mean=0.342][A
+Train step of epoch 0:  32%|███▏      | 2027/6434 [4:44:50<10:05:51,  8.25s/it, gpt_loss=0.344, loss_mean=0.342][A
+Train step of epoch 0:  32%|███▏      | 2027/6434 [4:44:58<10:05:51,  8.25s/it, gpt_loss=0.238, loss_mean=0.331][A
+Train step of epoch 0:  32%|███▏      | 2028/6434 [4:44:58<10:10:51,  8.32s/it, gpt_loss=0.238, loss_mean=0.331][A
+Train step of epoch 0:  32%|███▏      | 2028/6434 [4:45:05<10:10:51,  8.32s/it, gpt_loss=0.303, loss_mean=0.328][A
+Train step of epoch 0:  32%|███▏      | 2029/6434 [4:45:05<9:41:52,  7.93s/it, gpt_loss=0.303, loss_mean=0.328] [A
+[LID Router Debug] Step: 2030
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [4, 4, 4, 1, 2, 6, 1, 2, 3, 9]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  32%|███▏      | 2029/6434 [4:45:13<9:41:52,  7.93s/it, gpt_loss=0.28, loss_mean=0.324] [A
+Train step of epoch 0:  32%|███▏      | 2030/6434 [4:45:13<9:46:14,  7.99s/it, gpt_loss=0.28, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2030/6434 [4:45:22<9:46:14,  7.99s/it, gpt_loss=0.304, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2031/6434 [4:45:22<9:53:11,  8.08s/it, gpt_loss=0.304, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2031/6434 [4:45:31<9:53:11,  8.08s/it, gpt_loss=0.26, loss_mean=0.315] [A
+Train step of epoch 0:  32%|███▏      | 2032/6434 [4:45:31<10:10:58,  8.33s/it, gpt_loss=0.26, loss_mean=0.315][A
+Train step of epoch 0:  32%|███▏      | 2032/6434 [4:45:39<10:10:58,  8.33s/it, gpt_loss=0.295, loss_mean=0.313][A
+Train step of epoch 0:  32%|███▏      | 2033/6434 [4:45:39<10:12:04,  8.34s/it, gpt_loss=0.295, loss_mean=0.313][A
+Train step of epoch 0:  32%|███▏      | 2033/6434 [4:45:48<10:12:04,  8.34s/it, gpt_loss=0.295, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2034/6434 [4:45:48<10:27:39,  8.56s/it, gpt_loss=0.295, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2034/6434 [4:45:57<10:27:39,  8.56s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2035/6434 [4:45:57<10:44:56,  8.80s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2035/6434 [4:46:06<10:44:56,  8.80s/it, gpt_loss=0.315, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2036/6434 [4:46:06<10:33:01,  8.64s/it, gpt_loss=0.315, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2036/6434 [4:46:14<10:33:01,  8.64s/it, gpt_loss=0.245, loss_mean=0.309][A
+Train step of epoch 0:  32%|███▏      | 2037/6434 [4:46:14<10:27:19,  8.56s/it, gpt_loss=0.245, loss_mean=0.309][A
+Train step of epoch 0:  32%|███▏      | 2037/6434 [4:46:22<10:27:19,  8.56s/it, gpt_loss=0.419, loss_mean=0.32] [A
+Train step of epoch 0:  32%|███▏      | 2038/6434 [4:46:22<10:17:34,  8.43s/it, gpt_loss=0.419, loss_mean=0.32][A
+Train step of epoch 0:  32%|███▏      | 2038/6434 [4:46:30<10:17:34,  8.43s/it, gpt_loss=0.333, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2039/6434 [4:46:30<10:14:40,  8.39s/it, gpt_loss=0.333, loss_mean=0.322][A
+[LID Router Debug] Step: 2040
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [6, 4, 4, 5, 3, 2, 1, 4, 0, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  32%|███▏      | 2039/6434 [4:46:38<10:14:40,  8.39s/it, gpt_loss=0.313, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2040/6434 [4:46:38<10:03:16,  8.24s/it, gpt_loss=0.313, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2040/6434 [4:46:46<10:03:16,  8.24s/it, gpt_loss=0.34, loss_mean=0.323] [A
+Train step of epoch 0:  32%|███▏      | 2041/6434 [4:46:46<9:42:35,  7.96s/it, gpt_loss=0.34, loss_mean=0.323] [A
+Train step of epoch 0:  32%|███▏      | 2041/6434 [4:46:55<9:42:35,  7.96s/it, gpt_loss=0.286, loss_mean=0.319][A
+Train step of epoch 0:  32%|███▏      | 2042/6434 [4:46:55<10:04:44,  8.26s/it, gpt_loss=0.286, loss_mean=0.319][A
+Train step of epoch 0:  32%|███▏      | 2042/6434 [4:47:03<10:04:44,  8.26s/it, gpt_loss=0.31, loss_mean=0.318] [A
+Train step of epoch 0:  32%|███▏      | 2043/6434 [4:47:03<10:09:42,  8.33s/it, gpt_loss=0.31, loss_mean=0.318][A
+Train step of epoch 0:  32%|███▏      | 2043/6434 [4:47:12<10:09:42,  8.33s/it, gpt_loss=0.323, loss_mean=0.319][A
+Train step of epoch 0:  32%|███▏      | 2044/6434 [4:47:12<10:16:54,  8.43s/it, gpt_loss=0.323, loss_mean=0.319][A
+Train step of epoch 0:  32%|███▏      | 2044/6434 [4:47:19<10:16:54,  8.43s/it, gpt_loss=0.485, loss_mean=0.335][A
+Train step of epoch 0:  32%|███▏      | 2045/6434 [4:47:19<9:50:22,  8.07s/it, gpt_loss=0.485, loss_mean=0.335] [A
+Train step of epoch 0:  32%|███▏      | 2045/6434 [4:47:26<9:50:22,  8.07s/it, gpt_loss=0.301, loss_mean=0.332][A
+Train step of epoch 0:  32%|███▏      | 2046/6434 [4:47:26<9:33:25,  7.84s/it, gpt_loss=0.301, loss_mean=0.332][A
+Train step of epoch 0:  32%|███▏      | 2046/6434 [4:47:36<9:33:25,  7.84s/it, gpt_loss=0.381, loss_mean=0.337][A
+Train step of epoch 0:  32%|███▏      | 2047/6434 [4:47:36<10:09:41,  8.34s/it, gpt_loss=0.381, loss_mean=0.337][A
+Train step of epoch 0:  32%|███▏      | 2047/6434 [4:47:44<10:09:41,  8.34s/it, gpt_loss=0.275, loss_mean=0.331][A
+Train step of epoch 0:  32%|███▏      | 2048/6434 [4:47:44<10:13:51,  8.40s/it, gpt_loss=0.275, loss_mean=0.331][A
+Train step of epoch 0:  32%|███▏      | 2048/6434 [4:47:54<10:13:51,  8.40s/it, gpt_loss=0.36, loss_mean=0.333] [A
+Train step of epoch 0:  32%|███▏      | 2049/6434 [4:47:54<10:33:56,  8.67s/it, gpt_loss=0.36, loss_mean=0.333][A
+[LID Router Debug] Step: 2050
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [1, 4, 0, 11, 9, 6, 1, 2, 9, 0]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9, 11}
+
+Train step of epoch 0:  32%|███▏      | 2049/6434 [4:48:01<10:33:56,  8.67s/it, gpt_loss=0.326, loss_mean=0.333][A
+Train step of epoch 0:  32%|███▏      | 2050/6434 [4:48:01<10:14:14,  8.41s/it, gpt_loss=0.326, loss_mean=0.333][A
+Train step of epoch 0:  32%|███▏      | 2050/6434 [4:48:10<10:14:14,  8.41s/it, gpt_loss=0.368, loss_mean=0.336][A
+Train step of epoch 0:  32%|███▏      | 2051/6434 [4:48:10<10:13:54,  8.40s/it, gpt_loss=0.368, loss_mean=0.336][A
+Train step of epoch 0:  32%|███▏      | 2051/6434 [4:48:18<10:13:54,  8.40s/it, gpt_loss=0.248, loss_mean=0.327][A
+Train step of epoch 0:  32%|███▏      | 2052/6434 [4:48:18<10:03:42,  8.27s/it, gpt_loss=0.248, loss_mean=0.327][A
+Train step of epoch 0:  32%|███▏      | 2052/6434 [4:48:27<10:03:42,  8.27s/it, gpt_loss=0.297, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2053/6434 [4:48:27<10:18:46,  8.47s/it, gpt_loss=0.297, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2053/6434 [4:48:36<10:18:46,  8.47s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2054/6434 [4:48:36<10:43:27,  8.81s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2054/6434 [4:48:44<10:43:27,  8.81s/it, gpt_loss=0.281, loss_mean=0.318][A
+Train step of epoch 0:  32%|███▏      | 2055/6434 [4:48:44<10:17:11,  8.46s/it, gpt_loss=0.281, loss_mean=0.318][A
+Train step of epoch 0:  32%|███▏      | 2055/6434 [4:48:53<10:17:11,  8.46s/it, gpt_loss=0.282, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2056/6434 [4:48:53<10:35:47,  8.71s/it, gpt_loss=0.282, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2056/6434 [4:49:02<10:35:47,  8.71s/it, gpt_loss=0.333, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2057/6434 [4:49:02<10:43:38,  8.82s/it, gpt_loss=0.333, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2057/6434 [4:49:12<10:43:38,  8.82s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  32%|███▏      | 2058/6434 [4:49:12<10:52:27,  8.95s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  32%|███▏      | 2058/6434 [4:49:19<10:52:27,  8.95s/it, gpt_loss=0.293, loss_mean=0.313][A
+Train step of epoch 0:  32%|███▏      | 2059/6434 [4:49:19<10:28:22,  8.62s/it, gpt_loss=0.293, loss_mean=0.313][A
+[LID Router Debug] Step: 2060
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [5, 5, 5, 0, 3, 2, 4, 2, 2, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5}
+
+Train step of epoch 0:  32%|███▏      | 2059/6434 [4:49:27<10:28:22,  8.62s/it, gpt_loss=0.297, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2060/6434 [4:49:27<10:06:37,  8.32s/it, gpt_loss=0.297, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2060/6434 [4:49:36<10:06:37,  8.32s/it, gpt_loss=0.366, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2061/6434 [4:49:36<10:19:56,  8.51s/it, gpt_loss=0.366, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2061/6434 [4:49:45<10:19:56,  8.51s/it, gpt_loss=0.311, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2062/6434 [4:49:45<10:28:30,  8.63s/it, gpt_loss=0.311, loss_mean=0.316][A
+Train step of epoch 0:  32%|███▏      | 2062/6434 [4:49:53<10:28:30,  8.63s/it, gpt_loss=0.361, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2063/6434 [4:49:53<10:10:11,  8.38s/it, gpt_loss=0.361, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2063/6434 [4:50:00<10:10:11,  8.38s/it, gpt_loss=0.332, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2064/6434 [4:50:00<9:57:14,  8.20s/it, gpt_loss=0.332, loss_mean=0.322] [A
+Train step of epoch 0:  32%|███▏      | 2064/6434 [4:50:10<9:57:14,  8.20s/it, gpt_loss=0.329, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2065/6434 [4:50:10<10:22:00,  8.54s/it, gpt_loss=0.329, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2065/6434 [4:50:18<10:22:00,  8.54s/it, gpt_loss=0.29, loss_mean=0.319] [A
+Train step of epoch 0:  32%|███▏      | 2066/6434 [4:50:18<10:11:16,  8.40s/it, gpt_loss=0.29, loss_mean=0.319][A
+Train step of epoch 0:  32%|███▏      | 2066/6434 [4:50:26<10:11:16,  8.40s/it, gpt_loss=0.34, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2067/6434 [4:50:26<10:06:35,  8.33s/it, gpt_loss=0.34, loss_mean=0.321][A
+Train step of epoch 0:  32%|███▏      | 2067/6434 [4:50:35<10:06:35,  8.33s/it, gpt_loss=0.347, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2068/6434 [4:50:35<10:09:58,  8.38s/it, gpt_loss=0.347, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2068/6434 [4:50:42<10:09:58,  8.38s/it, gpt_loss=0.349, loss_mean=0.326][A
+Train step of epoch 0:  32%|███▏      | 2069/6434 [4:50:42<9:45:16,  8.04s/it, gpt_loss=0.349, loss_mean=0.326] [A
+[LID Router Debug] Step: 2070
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [6, 9, 1, 1, 5, 2, 6, 2, 5, 2]
+Active Experts in Batch: {1, 2, 5, 6, 9}
+
+Train step of epoch 0:  32%|███▏      | 2069/6434 [4:50:50<9:45:16,  8.04s/it, gpt_loss=0.291, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2070/6434 [4:50:50<9:48:17,  8.09s/it, gpt_loss=0.291, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2070/6434 [4:50:58<9:48:17,  8.09s/it, gpt_loss=0.361, loss_mean=0.327][A
+Train step of epoch 0:  32%|███▏      | 2071/6434 [4:50:58<9:56:19,  8.20s/it, gpt_loss=0.361, loss_mean=0.327][A
+Train step of epoch 0:  32%|███▏      | 2071/6434 [4:51:06<9:56:19,  8.20s/it, gpt_loss=0.285, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2072/6434 [4:51:06<9:48:55,  8.10s/it, gpt_loss=0.285, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2072/6434 [4:51:14<9:48:55,  8.10s/it, gpt_loss=0.378, loss_mean=0.328][A
+Train step of epoch 0:  32%|███▏      | 2073/6434 [4:51:14<9:40:30,  7.99s/it, gpt_loss=0.378, loss_mean=0.328][A
+Train step of epoch 0:  32%|███▏      | 2073/6434 [4:51:23<9:40:30,  7.99s/it, gpt_loss=0.371, loss_mean=0.332][A
+Train step of epoch 0:  32%|███▏      | 2074/6434 [4:51:23<9:52:55,  8.16s/it, gpt_loss=0.371, loss_mean=0.332][A
+Train step of epoch 0:  32%|███▏      | 2074/6434 [4:51:33<9:52:55,  8.16s/it, gpt_loss=0.267, loss_mean=0.326][A
+Train step of epoch 0:  32%|███▏      | 2075/6434 [4:51:33<10:38:56,  8.79s/it, gpt_loss=0.267, loss_mean=0.326][A
+Train step of epoch 0:  32%|███▏      | 2075/6434 [4:51:42<10:38:56,  8.79s/it, gpt_loss=0.294, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2076/6434 [4:51:42<10:36:24,  8.76s/it, gpt_loss=0.294, loss_mean=0.323][A
+Train step of epoch 0:  32%|███▏      | 2076/6434 [4:51:50<10:36:24,  8.76s/it, gpt_loss=0.333, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2077/6434 [4:51:50<10:24:58,  8.61s/it, gpt_loss=0.333, loss_mean=0.324][A
+Train step of epoch 0:  32%|███▏      | 2077/6434 [4:52:00<10:24:58,  8.61s/it, gpt_loss=0.228, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2078/6434 [4:52:00<11:06:14,  9.18s/it, gpt_loss=0.228, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2078/6434 [4:52:09<11:06:14,  9.18s/it, gpt_loss=0.292, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2079/6434 [4:52:09<11:03:46,  9.14s/it, gpt_loss=0.292, loss_mean=0.312][A
+[LID Router Debug] Step: 2080
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [0, 2, 4, 5, 2, 3, 3, 9, 0, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  32%|███▏      | 2079/6434 [4:52:18<11:03:46,  9.14s/it, gpt_loss=0.325, loss_mean=0.313][A
+Train step of epoch 0:  32%|███▏      | 2080/6434 [4:52:18<10:55:16,  9.03s/it, gpt_loss=0.325, loss_mean=0.313][A
+Train step of epoch 0:  32%|███▏      | 2080/6434 [4:52:27<10:55:16,  9.03s/it, gpt_loss=0.347, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2081/6434 [4:52:27<10:41:17,  8.84s/it, gpt_loss=0.347, loss_mean=0.317][A
+Train step of epoch 0:  32%|███▏      | 2081/6434 [4:52:35<10:41:17,  8.84s/it, gpt_loss=0.351, loss_mean=0.32] [A
+Train step of epoch 0:  32%|███▏      | 2082/6434 [4:52:35<10:41:59,  8.85s/it, gpt_loss=0.351, loss_mean=0.32][A
+Train step of epoch 0:  32%|███▏      | 2082/6434 [4:52:43<10:41:59,  8.85s/it, gpt_loss=0.244, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2083/6434 [4:52:43<10:24:03,  8.61s/it, gpt_loss=0.244, loss_mean=0.312][A
+Train step of epoch 0:  32%|███▏      | 2083/6434 [4:52:52<10:24:03,  8.61s/it, gpt_loss=0.297, loss_mean=0.311][A
+Train step of epoch 0:  32%|███▏      | 2084/6434 [4:52:52<10:17:56,  8.52s/it, gpt_loss=0.297, loss_mean=0.311][A
+Train step of epoch 0:  32%|███▏      | 2084/6434 [4:52:59<10:17:56,  8.52s/it, gpt_loss=0.351, loss_mean=0.315][A
+Train step of epoch 0:  32%|███▏      | 2085/6434 [4:52:59<9:58:16,  8.25s/it, gpt_loss=0.351, loss_mean=0.315] [A
+Train step of epoch 0:  32%|███▏      | 2085/6434 [4:53:08<9:58:16,  8.25s/it, gpt_loss=0.302, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2086/6434 [4:53:08<10:13:52,  8.47s/it, gpt_loss=0.302, loss_mean=0.314][A
+Train step of epoch 0:  32%|███▏      | 2086/6434 [4:53:16<10:13:52,  8.47s/it, gpt_loss=0.402, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2087/6434 [4:53:16<9:51:30,  8.16s/it, gpt_loss=0.402, loss_mean=0.322] [A
+Train step of epoch 0:  32%|███▏      | 2087/6434 [4:53:24<9:51:30,  8.16s/it, gpt_loss=0.321, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2088/6434 [4:53:24<9:50:10,  8.15s/it, gpt_loss=0.321, loss_mean=0.322][A
+Train step of epoch 0:  32%|███▏      | 2088/6434 [4:53:33<9:50:10,  8.15s/it, gpt_loss=0.354, loss_mean=0.325][A
+Train step of epoch 0:  32%|███▏      | 2089/6434 [4:53:33<10:00:23,  8.29s/it, gpt_loss=0.354, loss_mean=0.325][A
+[LID Router Debug] Step: 2090
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [2, 2, 3, 1, 5, 4, 2, 2, 10, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 10}
+
+Train step of epoch 0:  32%|███▏      | 2089/6434 [4:53:41<10:00:23,  8.29s/it, gpt_loss=0.45, loss_mean=0.338] [A
+Train step of epoch 0:  32%|███▏      | 2090/6434 [4:53:41<9:53:52,  8.20s/it, gpt_loss=0.45, loss_mean=0.338] [A
+Train step of epoch 0:  32%|███▏      | 2090/6434 [4:53:50<9:53:52,  8.20s/it, gpt_loss=0.297, loss_mean=0.334][A
+Train step of epoch 0:  32%|███▏      | 2091/6434 [4:53:50<10:15:41,  8.51s/it, gpt_loss=0.297, loss_mean=0.334][A
+Train step of epoch 0:  32%|███▏      | 2091/6434 [4:53:59<10:15:41,  8.51s/it, gpt_loss=0.303, loss_mean=0.331][A
+Train step of epoch 0:  33%|███▎      | 2092/6434 [4:53:59<10:20:14,  8.57s/it, gpt_loss=0.303, loss_mean=0.331][A
+Train step of epoch 0:  33%|███▎      | 2092/6434 [4:54:07<10:20:14,  8.57s/it, gpt_loss=0.299, loss_mean=0.328][A
+Train step of epoch 0:  33%|███▎      | 2093/6434 [4:54:07<10:18:46,  8.55s/it, gpt_loss=0.299, loss_mean=0.328][A
+Train step of epoch 0:  33%|███▎      | 2093/6434 [4:54:17<10:18:46,  8.55s/it, gpt_loss=0.302, loss_mean=0.325][A
+Train step of epoch 0:  33%|███▎      | 2094/6434 [4:54:17<10:39:38,  8.84s/it, gpt_loss=0.302, loss_mean=0.325][A
+Train step of epoch 0:  33%|███▎      | 2094/6434 [4:54:25<10:39:38,  8.84s/it, gpt_loss=0.298, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2095/6434 [4:54:25<10:20:39,  8.58s/it, gpt_loss=0.298, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2095/6434 [4:54:32<10:20:39,  8.58s/it, gpt_loss=0.355, loss_mean=0.326][A
+Train step of epoch 0:  33%|███▎      | 2096/6434 [4:54:32<9:58:56,  8.28s/it, gpt_loss=0.355, loss_mean=0.326] [A
+Train step of epoch 0:  33%|███▎      | 2096/6434 [4:54:40<9:58:56,  8.28s/it, gpt_loss=0.286, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2097/6434 [4:54:40<9:58:34,  8.28s/it, gpt_loss=0.286, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2097/6434 [4:54:49<9:58:34,  8.28s/it, gpt_loss=0.33, loss_mean=0.323] [A
+Train step of epoch 0:  33%|███▎      | 2098/6434 [4:54:49<10:09:55,  8.44s/it, gpt_loss=0.33, loss_mean=0.323][A
+Train step of epoch 0:  33%|███▎      | 2098/6434 [4:54:58<10:09:55,  8.44s/it, gpt_loss=0.28, loss_mean=0.318][A
+Train step of epoch 0:  33%|███▎      | 2099/6434 [4:54:58<10:12:39,  8.48s/it, gpt_loss=0.28, loss_mean=0.318][A
+[LID Router Debug] Step: 2100
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 5, 4, 9, 3, 9, 0, 1, 9, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  33%|███▎      | 2099/6434 [4:55:07<10:12:39,  8.48s/it, gpt_loss=0.304, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2100/6434 [4:55:07<10:24:50,  8.65s/it, gpt_loss=0.304, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2100/6434 [4:55:16<10:24:50,  8.65s/it, gpt_loss=0.259, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2101/6434 [4:55:16<10:41:49,  8.89s/it, gpt_loss=0.259, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2101/6434 [4:55:25<10:41:49,  8.89s/it, gpt_loss=0.302, loss_mean=0.31] [A
+Train step of epoch 0:  33%|███▎      | 2102/6434 [4:55:25<10:35:17,  8.80s/it, gpt_loss=0.302, loss_mean=0.31][A
+Train step of epoch 0:  33%|███▎      | 2102/6434 [4:55:33<10:35:17,  8.80s/it, gpt_loss=0.333, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2103/6434 [4:55:33<10:26:37,  8.68s/it, gpt_loss=0.333, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2103/6434 [4:55:43<10:26:37,  8.68s/it, gpt_loss=0.393, loss_mean=0.321][A
+Train step of epoch 0:  33%|███▎      | 2104/6434 [4:55:43<10:40:44,  8.88s/it, gpt_loss=0.393, loss_mean=0.321][A
+Train step of epoch 0:  33%|███▎      | 2104/6434 [4:55:51<10:40:44,  8.88s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  33%|███▎      | 2105/6434 [4:55:51<10:36:33,  8.82s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  33%|███▎      | 2105/6434 [4:55:59<10:36:33,  8.82s/it, gpt_loss=0.305, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2106/6434 [4:55:59<10:17:29,  8.56s/it, gpt_loss=0.305, loss_mean=0.322][A
+Train step of epoch 0:  33%|███▎      | 2106/6434 [4:56:06<10:17:29,  8.56s/it, gpt_loss=0.309, loss_mean=0.321][A
+Train step of epoch 0:  33%|███▎      | 2107/6434 [4:56:06<9:47:16,  8.14s/it, gpt_loss=0.309, loss_mean=0.321] [A
+Train step of epoch 0:  33%|███▎      | 2107/6434 [4:56:15<9:47:16,  8.14s/it, gpt_loss=0.3, loss_mean=0.319]  [A
+Train step of epoch 0:  33%|███▎      | 2108/6434 [4:56:15<9:53:18,  8.23s/it, gpt_loss=0.3, loss_mean=0.319][A
+Train step of epoch 0:  33%|███▎      | 2108/6434 [4:56:24<9:53:18,  8.23s/it, gpt_loss=0.368, loss_mean=0.324][A
+Train step of epoch 0:  33%|███▎      | 2109/6434 [4:56:24<10:18:14,  8.58s/it, gpt_loss=0.368, loss_mean=0.324][A
+[LID Router Debug] Step: 2110
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [2, 9, 6, 9, 6, 1, 5, 9, 0, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  33%|███▎      | 2109/6434 [4:56:32<10:18:14,  8.58s/it, gpt_loss=0.32, loss_mean=0.323] [A
+Train step of epoch 0:  33%|███▎      | 2110/6434 [4:56:32<10:11:19,  8.48s/it, gpt_loss=0.32, loss_mean=0.323][A
+Train step of epoch 0:  33%|███▎      | 2110/6434 [4:56:40<10:11:19,  8.48s/it, gpt_loss=0.318, loss_mean=0.323][A
+Train step of epoch 0:  33%|███▎      | 2111/6434 [4:56:40<9:59:11,  8.32s/it, gpt_loss=0.318, loss_mean=0.323] [A
+Train step of epoch 0:  33%|███▎      | 2111/6434 [4:56:49<9:59:11,  8.32s/it, gpt_loss=0.265, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2112/6434 [4:56:49<9:55:18,  8.26s/it, gpt_loss=0.265, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2112/6434 [4:56:57<9:55:18,  8.26s/it, gpt_loss=0.32, loss_mean=0.317] [A
+Train step of epoch 0:  33%|███▎      | 2113/6434 [4:56:57<9:51:22,  8.21s/it, gpt_loss=0.32, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2113/6434 [4:57:04<9:51:22,  8.21s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2114/6434 [4:57:04<9:41:52,  8.08s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2114/6434 [4:57:13<9:41:52,  8.08s/it, gpt_loss=0.312, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2115/6434 [4:57:13<9:54:52,  8.26s/it, gpt_loss=0.312, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2115/6434 [4:57:22<9:54:52,  8.26s/it, gpt_loss=0.317, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2116/6434 [4:57:22<10:06:12,  8.42s/it, gpt_loss=0.317, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2116/6434 [4:57:30<10:06:12,  8.42s/it, gpt_loss=0.375, loss_mean=0.323][A
+Train step of epoch 0:  33%|███▎      | 2117/6434 [4:57:30<9:51:50,  8.23s/it, gpt_loss=0.375, loss_mean=0.323] [A
+Train step of epoch 0:  33%|███▎      | 2117/6434 [4:57:38<9:51:50,  8.23s/it, gpt_loss=0.203, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2118/6434 [4:57:38<10:03:22,  8.39s/it, gpt_loss=0.203, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2118/6434 [4:57:46<10:03:22,  8.39s/it, gpt_loss=0.29, loss_mean=0.309] [A
+Train step of epoch 0:  33%|███▎      | 2119/6434 [4:57:46<9:55:20,  8.28s/it, gpt_loss=0.29, loss_mean=0.309] [A
+[LID Router Debug] Step: 2120
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [3, 0, 3, 9, 0, 9, 4, 3, 4, 2]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+Train step of epoch 0:  33%|███▎      | 2119/6434 [4:57:55<9:55:20,  8.28s/it, gpt_loss=0.255, loss_mean=0.303][A
+Train step of epoch 0:  33%|███▎      | 2120/6434 [4:57:55<9:53:23,  8.25s/it, gpt_loss=0.255, loss_mean=0.303][A
+Train step of epoch 0:  33%|███▎      | 2120/6434 [4:58:03<9:53:23,  8.25s/it, gpt_loss=0.217, loss_mean=0.295][A
+Train step of epoch 0:  33%|███▎      | 2121/6434 [4:58:03<9:57:03,  8.31s/it, gpt_loss=0.217, loss_mean=0.295][A
+Train step of epoch 0:  33%|███▎      | 2121/6434 [4:58:12<9:57:03,  8.31s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  33%|███▎      | 2122/6434 [4:58:12<10:03:10,  8.39s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  33%|███▎      | 2122/6434 [4:58:19<10:03:10,  8.39s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  33%|███▎      | 2123/6434 [4:58:19<9:46:29,  8.16s/it, gpt_loss=0.312, loss_mean=0.295] [A
+Train step of epoch 0:  33%|███▎      | 2123/6434 [4:58:29<9:46:29,  8.16s/it, gpt_loss=0.254, loss_mean=0.291][A
+Train step of epoch 0:  33%|███▎      | 2124/6434 [4:58:29<10:10:43,  8.50s/it, gpt_loss=0.254, loss_mean=0.291][A
+Train step of epoch 0:  33%|███▎      | 2124/6434 [4:58:37<10:10:43,  8.50s/it, gpt_loss=0.274, loss_mean=0.289][A
+Train step of epoch 0:  33%|███▎      | 2125/6434 [4:58:37<10:10:53,  8.51s/it, gpt_loss=0.274, loss_mean=0.289][A
+Train step of epoch 0:  33%|███▎      | 2125/6434 [4:58:46<10:10:53,  8.51s/it, gpt_loss=0.331, loss_mean=0.293][A
+Train step of epoch 0:  33%|███▎      | 2126/6434 [4:58:46<10:25:14,  8.71s/it, gpt_loss=0.331, loss_mean=0.293][A
+Train step of epoch 0:  33%|███▎      | 2126/6434 [4:58:55<10:25:14,  8.71s/it, gpt_loss=0.324, loss_mean=0.296][A
+Train step of epoch 0:  33%|███▎      | 2127/6434 [4:58:55<10:33:41,  8.83s/it, gpt_loss=0.324, loss_mean=0.296][A
+Train step of epoch 0:  33%|███▎      | 2127/6434 [4:59:04<10:33:41,  8.83s/it, gpt_loss=0.331, loss_mean=0.3]  [A
+Train step of epoch 0:  33%|███▎      | 2128/6434 [4:59:04<10:21:02,  8.65s/it, gpt_loss=0.331, loss_mean=0.3][A
+Train step of epoch 0:  33%|███▎      | 2128/6434 [4:59:11<10:21:02,  8.65s/it, gpt_loss=0.312, loss_mean=0.301][A
+Train step of epoch 0:  33%|███▎      | 2129/6434 [4:59:11<10:03:15,  8.41s/it, gpt_loss=0.312, loss_mean=0.301][A
+[LID Router Debug] Step: 2130
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 0, 0, 1, 0, 4, 6, 2, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  33%|███▎      | 2129/6434 [4:59:20<10:03:15,  8.41s/it, gpt_loss=0.396, loss_mean=0.31] [A
+Train step of epoch 0:  33%|███▎      | 2130/6434 [4:59:20<9:57:37,  8.33s/it, gpt_loss=0.396, loss_mean=0.31] [A
+Train step of epoch 0:  33%|███▎      | 2130/6434 [4:59:28<9:57:37,  8.33s/it, gpt_loss=0.279, loss_mean=0.307][A
+Train step of epoch 0:  33%|███▎      | 2131/6434 [4:59:28<10:05:39,  8.45s/it, gpt_loss=0.279, loss_mean=0.307][A
+Train step of epoch 0:  33%|███▎      | 2131/6434 [4:59:37<10:05:39,  8.45s/it, gpt_loss=0.341, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2132/6434 [4:59:37<10:13:39,  8.56s/it, gpt_loss=0.341, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2132/6434 [4:59:45<10:13:39,  8.56s/it, gpt_loss=0.315, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2133/6434 [4:59:45<10:05:53,  8.45s/it, gpt_loss=0.315, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2133/6434 [4:59:53<10:05:53,  8.45s/it, gpt_loss=0.334, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2134/6434 [4:59:53<9:52:44,  8.27s/it, gpt_loss=0.334, loss_mean=0.313] [A
+Train step of epoch 0:  33%|███▎      | 2134/6434 [5:00:02<9:52:44,  8.27s/it, gpt_loss=0.323, loss_mean=0.314][A
+Train step of epoch 0:  33%|███▎      | 2135/6434 [5:00:02<10:04:04,  8.43s/it, gpt_loss=0.323, loss_mean=0.314][A
+Train step of epoch 0:  33%|███▎      | 2135/6434 [5:00:10<10:04:04,  8.43s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2136/6434 [5:00:10<9:51:01,  8.25s/it, gpt_loss=0.282, loss_mean=0.311] [A
+Train step of epoch 0:  33%|███▎      | 2136/6434 [5:00:19<9:51:01,  8.25s/it, gpt_loss=0.315, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2137/6434 [5:00:19<10:05:13,  8.45s/it, gpt_loss=0.315, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2137/6434 [5:00:27<10:05:13,  8.45s/it, gpt_loss=0.391, loss_mean=0.319][A
+Train step of epoch 0:  33%|███▎      | 2138/6434 [5:00:27<9:51:44,  8.26s/it, gpt_loss=0.391, loss_mean=0.319] [A
+Train step of epoch 0:  33%|███▎      | 2138/6434 [5:00:35<9:51:44,  8.26s/it, gpt_loss=0.254, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2139/6434 [5:00:35<10:03:53,  8.44s/it, gpt_loss=0.254, loss_mean=0.313][A
+[LID Router Debug] Step: 2140
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [6, 3, 9, 6, 2, 2, 0, 4, 0, 4]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  33%|███▎      | 2139/6434 [5:00:43<10:03:53,  8.44s/it, gpt_loss=0.312, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2140/6434 [5:00:43<9:53:00,  8.29s/it, gpt_loss=0.312, loss_mean=0.313] [A
+Train step of epoch 0:  33%|███▎      | 2140/6434 [5:00:52<9:53:00,  8.29s/it, gpt_loss=0.306, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2141/6434 [5:00:52<9:51:14,  8.26s/it, gpt_loss=0.306, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2141/6434 [5:01:00<9:51:14,  8.26s/it, gpt_loss=0.287, loss_mean=0.31] [A
+Train step of epoch 0:  33%|███▎      | 2142/6434 [5:01:00<9:54:36,  8.31s/it, gpt_loss=0.287, loss_mean=0.31][A
+Train step of epoch 0:  33%|███▎      | 2142/6434 [5:01:08<9:54:36,  8.31s/it, gpt_loss=0.344, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2143/6434 [5:01:08<9:55:36,  8.33s/it, gpt_loss=0.344, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2143/6434 [5:01:16<9:55:36,  8.33s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2144/6434 [5:01:16<9:49:53,  8.25s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2144/6434 [5:01:25<9:49:53,  8.25s/it, gpt_loss=0.355, loss_mean=0.315][A
+Train step of epoch 0:  33%|███▎      | 2145/6434 [5:01:25<9:53:44,  8.31s/it, gpt_loss=0.355, loss_mean=0.315][A
+Train step of epoch 0:  33%|███▎      | 2145/6434 [5:01:34<9:53:44,  8.31s/it, gpt_loss=0.3, loss_mean=0.314]  [A
+Train step of epoch 0:  33%|███▎      | 2146/6434 [5:01:34<10:13:53,  8.59s/it, gpt_loss=0.3, loss_mean=0.314][A
+Train step of epoch 0:  33%|███▎      | 2146/6434 [5:01:43<10:13:53,  8.59s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2147/6434 [5:01:43<10:11:36,  8.56s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2147/6434 [5:01:51<10:11:36,  8.56s/it, gpt_loss=0.345, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2148/6434 [5:01:51<10:02:02,  8.43s/it, gpt_loss=0.345, loss_mean=0.317][A
+Train step of epoch 0:  33%|███▎      | 2148/6434 [5:01:58<10:02:02,  8.43s/it, gpt_loss=0.298, loss_mean=0.315][A
+Train step of epoch 0:  33%|███▎      | 2149/6434 [5:01:58<9:43:56,  8.18s/it, gpt_loss=0.298, loss_mean=0.315] [A
+[LID Router Debug] Step: 2150
+Batch Size: 10
+Audio Batch Size: 142
+LID Assignments: [3, 1, 9, 0, 9, 4, 0, 3, 10, 4]
+Active Experts in Batch: {0, 1, 3, 4, 9, 10}
+
+Train step of epoch 0:  33%|███▎      | 2149/6434 [5:02:07<9:43:56,  8.18s/it, gpt_loss=0.343, loss_mean=0.318][A
+Train step of epoch 0:  33%|███▎      | 2150/6434 [5:02:07<10:01:18,  8.42s/it, gpt_loss=0.343, loss_mean=0.318][A
+Train step of epoch 0:  33%|███▎      | 2150/6434 [5:02:16<10:01:18,  8.42s/it, gpt_loss=0.271, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2151/6434 [5:02:16<10:07:37,  8.51s/it, gpt_loss=0.271, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2151/6434 [5:02:25<10:07:37,  8.51s/it, gpt_loss=0.307, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2152/6434 [5:02:25<10:10:24,  8.55s/it, gpt_loss=0.307, loss_mean=0.312][A
+Train step of epoch 0:  33%|███▎      | 2152/6434 [5:02:32<10:10:24,  8.55s/it, gpt_loss=0.297, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2153/6434 [5:02:32<9:46:31,  8.22s/it, gpt_loss=0.297, loss_mean=0.311] [A
+Train step of epoch 0:  33%|███▎      | 2153/6434 [5:02:40<9:46:31,  8.22s/it, gpt_loss=0.338, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2154/6434 [5:02:40<9:48:44,  8.25s/it, gpt_loss=0.338, loss_mean=0.313][A
+Train step of epoch 0:  33%|███▎      | 2154/6434 [5:02:49<9:48:44,  8.25s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2155/6434 [5:02:49<9:50:00,  8.27s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  33%|███▎      | 2155/6434 [5:02:58<9:50:00,  8.27s/it, gpt_loss=0.25, loss_mean=0.305] [A
+Train step of epoch 0:  34%|███▎      | 2156/6434 [5:02:58<10:19:41,  8.69s/it, gpt_loss=0.25, loss_mean=0.305][A
+Train step of epoch 0:  34%|███▎      | 2156/6434 [5:03:06<10:19:41,  8.69s/it, gpt_loss=0.376, loss_mean=0.312][A
+Train step of epoch 0:  34%|███▎      | 2157/6434 [5:03:06<9:58:19,  8.39s/it, gpt_loss=0.376, loss_mean=0.312] [A
+Train step of epoch 0:  34%|███▎      | 2157/6434 [5:03:15<9:58:19,  8.39s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  34%|███▎      | 2158/6434 [5:03:15<10:15:18,  8.63s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  34%|███▎      | 2158/6434 [5:03:24<10:15:18,  8.63s/it, gpt_loss=0.351, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▎      | 2159/6434 [5:03:24<10:14:57,  8.63s/it, gpt_loss=0.351, loss_mean=0.316][A
+[LID Router Debug] Step: 2160
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [2, 4, 3, 9, 6, 9, 9, 6, 9, 2]
+Active Experts in Batch: {2, 3, 4, 6, 9}
+
+Train step of epoch 0:  34%|███▎      | 2159/6434 [5:03:32<10:14:57,  8.63s/it, gpt_loss=0.37, loss_mean=0.322] [A
+Train step of epoch 0:  34%|███▎      | 2160/6434 [5:03:32<9:57:16,  8.38s/it, gpt_loss=0.37, loss_mean=0.322] [A
+Train step of epoch 0:  34%|███▎      | 2160/6434 [5:03:43<9:57:16,  8.38s/it, gpt_loss=0.255, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▎      | 2161/6434 [5:03:43<11:07:04,  9.37s/it, gpt_loss=0.255, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▎      | 2161/6434 [5:03:52<11:07:04,  9.37s/it, gpt_loss=0.311, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▎      | 2162/6434 [5:03:52<10:45:57,  9.07s/it, gpt_loss=0.311, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▎      | 2162/6434 [5:04:00<10:45:57,  9.07s/it, gpt_loss=0.239, loss_mean=0.307][A
+Train step of epoch 0:  34%|███▎      | 2163/6434 [5:04:00<10:34:57,  8.92s/it, gpt_loss=0.239, loss_mean=0.307][A
+Train step of epoch 0:  34%|███▎      | 2163/6434 [5:04:09<10:34:57,  8.92s/it, gpt_loss=0.395, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▎      | 2164/6434 [5:04:09<10:27:10,  8.81s/it, gpt_loss=0.395, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▎      | 2164/6434 [5:04:17<10:27:10,  8.81s/it, gpt_loss=0.264, loss_mean=0.311][A
+Train step of epoch 0:  34%|███▎      | 2165/6434 [5:04:17<10:14:02,  8.63s/it, gpt_loss=0.264, loss_mean=0.311][A
+Train step of epoch 0:  34%|███▎      | 2165/6434 [5:04:26<10:14:02,  8.63s/it, gpt_loss=0.386, loss_mean=0.318][A
+Train step of epoch 0:  34%|███▎      | 2166/6434 [5:04:26<10:17:00,  8.67s/it, gpt_loss=0.386, loss_mean=0.318][A
+Train step of epoch 0:  34%|███▎      | 2166/6434 [5:04:34<10:17:00,  8.67s/it, gpt_loss=0.294, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▎      | 2167/6434 [5:04:34<9:54:54,  8.37s/it, gpt_loss=0.294, loss_mean=0.316] [A
+Train step of epoch 0:  34%|███▎      | 2167/6434 [5:04:42<9:54:54,  8.37s/it, gpt_loss=0.417, loss_mean=0.326][A
+Train step of epoch 0:  34%|███▎      | 2168/6434 [5:04:42<9:52:49,  8.34s/it, gpt_loss=0.417, loss_mean=0.326][A
+Train step of epoch 0:  34%|███▎      | 2168/6434 [5:04:51<9:52:49,  8.34s/it, gpt_loss=0.322, loss_mean=0.326][A
+Train step of epoch 0:  34%|███▎      | 2169/6434 [5:04:51<10:08:54,  8.57s/it, gpt_loss=0.322, loss_mean=0.326][A
+[LID Router Debug] Step: 2170
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [0, 9, 4, 2, 0, 4, 0, 4, 0, 2]
+Active Experts in Batch: {0, 9, 2, 4}
+
+Train step of epoch 0:  34%|███▎      | 2169/6434 [5:05:00<10:08:54,  8.57s/it, gpt_loss=0.35, loss_mean=0.328] [A
+Train step of epoch 0:  34%|███▎      | 2170/6434 [5:05:00<10:11:11,  8.60s/it, gpt_loss=0.35, loss_mean=0.328][A
+Train step of epoch 0:  34%|███▎      | 2170/6434 [5:05:08<10:11:11,  8.60s/it, gpt_loss=0.353, loss_mean=0.33][A
+Train step of epoch 0:  34%|███▎      | 2171/6434 [5:05:08<10:08:45,  8.57s/it, gpt_loss=0.353, loss_mean=0.33][A
+Train step of epoch 0:  34%|███▎      | 2171/6434 [5:05:16<10:08:45,  8.57s/it, gpt_loss=0.274, loss_mean=0.325][A
+Train step of epoch 0:  34%|███▍      | 2172/6434 [5:05:16<9:45:59,  8.25s/it, gpt_loss=0.274, loss_mean=0.325] [A
+Train step of epoch 0:  34%|███▍      | 2172/6434 [5:05:23<9:45:59,  8.25s/it, gpt_loss=0.391, loss_mean=0.331][A
+Train step of epoch 0:  34%|███▍      | 2173/6434 [5:05:23<9:34:29,  8.09s/it, gpt_loss=0.391, loss_mean=0.331][A
+Train step of epoch 0:  34%|███▍      | 2173/6434 [5:05:33<9:34:29,  8.09s/it, gpt_loss=0.301, loss_mean=0.328][A
+Train step of epoch 0:  34%|███▍      | 2174/6434 [5:05:33<10:16:14,  8.68s/it, gpt_loss=0.301, loss_mean=0.328][A
+Train step of epoch 0:  34%|███▍      | 2174/6434 [5:05:41<10:16:14,  8.68s/it, gpt_loss=0.363, loss_mean=0.332][A
+Train step of epoch 0:  34%|███▍      | 2175/6434 [5:05:41<10:03:04,  8.50s/it, gpt_loss=0.363, loss_mean=0.332][A
+Train step of epoch 0:  34%|███▍      | 2175/6434 [5:05:50<10:03:04,  8.50s/it, gpt_loss=0.263, loss_mean=0.325][A
+Train step of epoch 0:  34%|███▍      | 2176/6434 [5:05:50<9:54:58,  8.38s/it, gpt_loss=0.263, loss_mean=0.325] [A
+Train step of epoch 0:  34%|███▍      | 2176/6434 [5:05:58<9:54:58,  8.38s/it, gpt_loss=0.278, loss_mean=0.32] [A
+Train step of epoch 0:  34%|███▍      | 2177/6434 [5:05:58<10:01:59,  8.48s/it, gpt_loss=0.278, loss_mean=0.32][A
+Train step of epoch 0:  34%|███▍      | 2177/6434 [5:06:07<10:01:59,  8.48s/it, gpt_loss=0.324, loss_mean=0.321][A
+Train step of epoch 0:  34%|███▍      | 2178/6434 [5:06:07<9:55:19,  8.39s/it, gpt_loss=0.324, loss_mean=0.321] [A
+Train step of epoch 0:  34%|███▍      | 2178/6434 [5:06:14<9:55:19,  8.39s/it, gpt_loss=0.312, loss_mean=0.32] [A
+Train step of epoch 0:  34%|███▍      | 2179/6434 [5:06:14<9:43:34,  8.23s/it, gpt_loss=0.312, loss_mean=0.32][A
+[LID Router Debug] Step: 2180
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [5, 9, 3, 2, 0, 2, 3, 1, 5, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  34%|███▍      | 2179/6434 [5:06:23<9:43:34,  8.23s/it, gpt_loss=0.285, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▍      | 2180/6434 [5:06:23<10:01:11,  8.48s/it, gpt_loss=0.285, loss_mean=0.316][A
+Train step of epoch 0:  34%|███▍      | 2180/6434 [5:06:31<10:01:11,  8.48s/it, gpt_loss=0.282, loss_mean=0.313][A
+Train step of epoch 0:  34%|███▍      | 2181/6434 [5:06:31<9:40:01,  8.18s/it, gpt_loss=0.282, loss_mean=0.313] [A
+Train step of epoch 0:  34%|███▍      | 2181/6434 [5:06:39<9:40:01,  8.18s/it, gpt_loss=0.279, loss_mean=0.309][A
+Train step of epoch 0:  34%|███▍      | 2182/6434 [5:06:39<9:41:52,  8.21s/it, gpt_loss=0.279, loss_mean=0.309][A
+Train step of epoch 0:  34%|███▍      | 2182/6434 [5:06:47<9:41:52,  8.21s/it, gpt_loss=0.317, loss_mean=0.31] [A
+Train step of epoch 0:  34%|███▍      | 2183/6434 [5:06:47<9:33:51,  8.10s/it, gpt_loss=0.317, loss_mean=0.31][A
+Train step of epoch 0:  34%|███▍      | 2183/6434 [5:06:55<9:33:51,  8.10s/it, gpt_loss=0.348, loss_mean=0.314][A
+Train step of epoch 0:  34%|███▍      | 2184/6434 [5:06:55<9:40:52,  8.20s/it, gpt_loss=0.348, loss_mean=0.314][A
+Train step of epoch 0:  34%|███▍      | 2184/6434 [5:07:05<9:40:52,  8.20s/it, gpt_loss=0.25, loss_mean=0.308] [A
+Train step of epoch 0:  34%|███▍      | 2185/6434 [5:07:05<10:03:26,  8.52s/it, gpt_loss=0.25, loss_mean=0.308][A
+Train step of epoch 0:  34%|███▍      | 2185/6434 [5:07:12<10:03:26,  8.52s/it, gpt_loss=0.431, loss_mean=0.32][A
+Train step of epoch 0:  34%|███▍      | 2186/6434 [5:07:12<9:45:34,  8.27s/it, gpt_loss=0.431, loss_mean=0.32] [A
+Train step of epoch 0:  34%|███▍      | 2186/6434 [5:07:20<9:45:34,  8.27s/it, gpt_loss=0.27, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▍      | 2187/6434 [5:07:20<9:29:19,  8.04s/it, gpt_loss=0.27, loss_mean=0.315][A
+Train step of epoch 0:  34%|███▍      | 2187/6434 [5:07:28<9:29:19,  8.04s/it, gpt_loss=0.385, loss_mean=0.322][A
+Train step of epoch 0:  34%|███▍      | 2188/6434 [5:07:28<9:28:13,  8.03s/it, gpt_loss=0.385, loss_mean=0.322][A
+Train step of epoch 0:  34%|███▍      | 2188/6434 [5:07:38<9:28:13,  8.03s/it, gpt_loss=0.339, loss_mean=0.324][A
+Train step of epoch 0:  34%|███▍      | 2189/6434 [5:07:38<10:03:32,  8.53s/it, gpt_loss=0.339, loss_mean=0.324][A
+[LID Router Debug] Step: 2190
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [3, 9, 3, 1, 2, 4, 9, 0, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  34%|███▍      | 2189/6434 [5:07:45<10:03:32,  8.53s/it, gpt_loss=0.323, loss_mean=0.324][A
+Train step of epoch 0:  34%|███▍      | 2190/6434 [5:07:45<9:48:28,  8.32s/it, gpt_loss=0.323, loss_mean=0.324] [A
+Train step of epoch 0:  34%|███▍      | 2190/6434 [5:07:54<9:48:28,  8.32s/it, gpt_loss=0.37, loss_mean=0.328] [A
+Train step of epoch 0:  34%|███▍      | 2191/6434 [5:07:54<9:51:34,  8.37s/it, gpt_loss=0.37, loss_mean=0.328][A
+Train step of epoch 0:  34%|███▍      | 2191/6434 [5:08:02<9:51:34,  8.37s/it, gpt_loss=0.317, loss_mean=0.327][A
+Train step of epoch 0:  34%|███▍      | 2192/6434 [5:08:02<9:52:55,  8.39s/it, gpt_loss=0.317, loss_mean=0.327][A
+Train step of epoch 0:  34%|███▍      | 2192/6434 [5:08:10<9:52:55,  8.39s/it, gpt_loss=0.418, loss_mean=0.336][A
+Train step of epoch 0:  34%|███▍      | 2193/6434 [5:08:10<9:36:38,  8.16s/it, gpt_loss=0.418, loss_mean=0.336][A
+Train step of epoch 0:  34%|███▍      | 2193/6434 [5:08:19<9:36:38,  8.16s/it, gpt_loss=0.319, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2194/6434 [5:08:19<9:51:11,  8.37s/it, gpt_loss=0.319, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2194/6434 [5:08:26<9:51:11,  8.37s/it, gpt_loss=0.33, loss_mean=0.334] [A
+Train step of epoch 0:  34%|███▍      | 2195/6434 [5:08:26<9:24:14,  7.99s/it, gpt_loss=0.33, loss_mean=0.334][A
+Train step of epoch 0:  34%|███▍      | 2195/6434 [5:08:35<9:24:14,  7.99s/it, gpt_loss=0.347, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2196/6434 [5:08:35<9:50:19,  8.36s/it, gpt_loss=0.347, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2196/6434 [5:08:42<9:50:19,  8.36s/it, gpt_loss=0.312, loss_mean=0.333][A
+Train step of epoch 0:  34%|███▍      | 2197/6434 [5:08:42<9:22:49,  7.97s/it, gpt_loss=0.312, loss_mean=0.333][A
+Train step of epoch 0:  34%|███▍      | 2197/6434 [5:08:51<9:22:49,  7.97s/it, gpt_loss=0.356, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2198/6434 [5:08:51<9:37:02,  8.17s/it, gpt_loss=0.356, loss_mean=0.335][A
+Train step of epoch 0:  34%|███▍      | 2198/6434 [5:08:59<9:37:02,  8.17s/it, gpt_loss=0.312, loss_mean=0.333][A
+Train step of epoch 0:  34%|███▍      | 2199/6434 [5:08:59<9:44:44,  8.28s/it, gpt_loss=0.312, loss_mean=0.333][A
+[LID Router Debug] Step: 2200
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [1, 1, 9, 1, 4, 2, 3, 2, 3, 4]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+[2026-02-06 21:05:12,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=0, lr=[1.946103464405964e-05, 1.946103464405964e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 21:05:12,547] [INFO] [timer.py:260:stop] epoch=0/micro_step=2200/global_step=1100, RunningAvgSamplesPerSec=4.756237268265744, CurrSamplesPerSec=4.657519137758478, MemAllocated=12.79GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  34%|███▍      | 2199/6434 [5:09:08<9:44:44,  8.28s/it, gpt_loss=0.322, loss_mean=0.332][A
+Train step of epoch 0:  34%|███▍      | 2200/6434 [5:09:08<9:52:39,  8.40s/it, gpt_loss=0.322, loss_mean=0.332][A
+Train step of epoch 0:  34%|███▍      | 2200/6434 [5:09:17<9:52:39,  8.40s/it, gpt_loss=0.278, loss_mean=0.326][A
+Train step of epoch 0:  34%|███▍      | 2201/6434 [5:09:17<9:59:18,  8.49s/it, gpt_loss=0.278, loss_mean=0.326][A
+Train step of epoch 0:  34%|███▍      | 2201/6434 [5:09:26<9:59:18,  8.49s/it, gpt_loss=0.351, loss_mean=0.329][A
+Train step of epoch 0:  34%|███▍      | 2202/6434 [5:09:26<10:16:18,  8.74s/it, gpt_loss=0.351, loss_mean=0.329][A
+Train step of epoch 0:  34%|███▍      | 2202/6434 [5:09:33<10:16:18,  8.74s/it, gpt_loss=0.271, loss_mean=0.323][A
+Train step of epoch 0:  34%|███▍      | 2203/6434 [5:09:33<9:46:19,  8.31s/it, gpt_loss=0.271, loss_mean=0.323] [A
+Train step of epoch 0:  34%|███▍      | 2203/6434 [5:09:43<9:46:19,  8.31s/it, gpt_loss=0.288, loss_mean=0.32] [A
+Train step of epoch 0:  34%|███▍      | 2204/6434 [5:09:43<10:03:19,  8.56s/it, gpt_loss=0.288, loss_mean=0.32][A
+Train step of epoch 0:  34%|███▍      | 2204/6434 [5:09:51<10:03:19,  8.56s/it, gpt_loss=0.334, loss_mean=0.321][A
+Train step of epoch 0:  34%|███▍      | 2205/6434 [5:09:51<9:58:21,  8.49s/it, gpt_loss=0.334, loss_mean=0.321] [A
+Train step of epoch 0:  34%|███▍      | 2205/6434 [5:09:59<9:58:21,  8.49s/it, gpt_loss=0.358, loss_mean=0.325][A
+Train step of epoch 0:  34%|███▍      | 2206/6434 [5:09:59<9:53:51,  8.43s/it, gpt_loss=0.358, loss_mean=0.325][A
+Train step of epoch 0:  34%|███▍      | 2206/6434 [5:10:09<9:53:51,  8.43s/it, gpt_loss=0.303, loss_mean=0.323][A
+Train step of epoch 0:  34%|███▍      | 2207/6434 [5:10:09<10:19:14,  8.79s/it, gpt_loss=0.303, loss_mean=0.323][A
+Train step of epoch 0:  34%|███▍      | 2207/6434 [5:10:17<10:19:14,  8.79s/it, gpt_loss=0.332, loss_mean=0.324][A
+Train step of epoch 0:  34%|███▍      | 2208/6434 [5:10:17<10:08:40,  8.64s/it, gpt_loss=0.332, loss_mean=0.324][A
+Train step of epoch 0:  34%|███▍      | 2208/6434 [5:10:26<10:08:40,  8.64s/it, gpt_loss=0.503, loss_mean=0.341][A
+Train step of epoch 0:  34%|███▍      | 2209/6434 [5:10:26<10:04:15,  8.58s/it, gpt_loss=0.503, loss_mean=0.341][A
+[LID Router Debug] Step: 2210
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [5, 5, 1, 9, 2, 5, 0, 2, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  34%|███▍      | 2209/6434 [5:10:33<10:04:15,  8.58s/it, gpt_loss=0.34, loss_mean=0.341] [A
+Train step of epoch 0:  34%|███▍      | 2210/6434 [5:10:33<9:37:26,  8.20s/it, gpt_loss=0.34, loss_mean=0.341] [A
+Train step of epoch 0:  34%|███▍      | 2210/6434 [5:10:40<9:37:26,  8.20s/it, gpt_loss=0.333, loss_mean=0.341][A
+Train step of epoch 0:  34%|███▍      | 2211/6434 [5:10:40<9:14:09,  7.87s/it, gpt_loss=0.333, loss_mean=0.341][A
+Train step of epoch 0:  34%|███▍      | 2211/6434 [5:10:48<9:14:09,  7.87s/it, gpt_loss=0.319, loss_mean=0.338][A
+Train step of epoch 0:  34%|███▍      | 2212/6434 [5:10:48<9:07:16,  7.78s/it, gpt_loss=0.319, loss_mean=0.338][A
+Train step of epoch 0:  34%|███▍      | 2212/6434 [5:10:57<9:07:16,  7.78s/it, gpt_loss=0.347, loss_mean=0.339][A
+Train step of epoch 0:  34%|███▍      | 2213/6434 [5:10:57<9:33:35,  8.15s/it, gpt_loss=0.347, loss_mean=0.339][A
+Train step of epoch 0:  34%|███▍      | 2213/6434 [5:11:05<9:33:35,  8.15s/it, gpt_loss=0.347, loss_mean=0.34] [A
+Train step of epoch 0:  34%|███▍      | 2214/6434 [5:11:05<9:30:13,  8.11s/it, gpt_loss=0.347, loss_mean=0.34][A
+Train step of epoch 0:  34%|███▍      | 2214/6434 [5:11:13<9:30:13,  8.11s/it, gpt_loss=0.333, loss_mean=0.339][A
+Train step of epoch 0:  34%|███▍      | 2215/6434 [5:11:13<9:28:24,  8.08s/it, gpt_loss=0.333, loss_mean=0.339][A
+Train step of epoch 0:  34%|███▍      | 2215/6434 [5:11:21<9:28:24,  8.08s/it, gpt_loss=0.357, loss_mean=0.341][A
+Train step of epoch 0:  34%|███▍      | 2216/6434 [5:11:21<9:44:37,  8.32s/it, gpt_loss=0.357, loss_mean=0.341][A
+Train step of epoch 0:  34%|███▍      | 2216/6434 [5:11:30<9:44:37,  8.32s/it, gpt_loss=0.274, loss_mean=0.334][A
+Train step of epoch 0:  34%|███▍      | 2217/6434 [5:11:30<9:48:46,  8.38s/it, gpt_loss=0.274, loss_mean=0.334][A
+Train step of epoch 0:  34%|███▍      | 2217/6434 [5:11:38<9:48:46,  8.38s/it, gpt_loss=0.48, loss_mean=0.349] [A
+Train step of epoch 0:  34%|███▍      | 2218/6434 [5:11:38<9:40:30,  8.26s/it, gpt_loss=0.48, loss_mean=0.349][A
+Train step of epoch 0:  34%|███▍      | 2218/6434 [5:11:45<9:40:30,  8.26s/it, gpt_loss=0.41, loss_mean=0.355][A
+Train step of epoch 0:  34%|███▍      | 2219/6434 [5:11:45<9:24:14,  8.03s/it, gpt_loss=0.41, loss_mean=0.355][A
+[LID Router Debug] Step: 2220
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [9, 9, 6, 9, 9, 9, 5, 3, 3, 5]
+Active Experts in Batch: {9, 3, 5, 6}
+
+Train step of epoch 0:  34%|███▍      | 2219/6434 [5:11:54<9:24:14,  8.03s/it, gpt_loss=0.367, loss_mean=0.356][A
+Train step of epoch 0:  35%|███▍      | 2220/6434 [5:11:54<9:32:32,  8.15s/it, gpt_loss=0.367, loss_mean=0.356][A
+Train step of epoch 0:  35%|███▍      | 2220/6434 [5:12:03<9:32:32,  8.15s/it, gpt_loss=0.284, loss_mean=0.349][A
+Train step of epoch 0:  35%|███▍      | 2221/6434 [5:12:03<9:44:14,  8.32s/it, gpt_loss=0.284, loss_mean=0.349][A
+Train step of epoch 0:  35%|███▍      | 2221/6434 [5:12:11<9:44:14,  8.32s/it, gpt_loss=0.298, loss_mean=0.344][A
+Train step of epoch 0:  35%|███▍      | 2222/6434 [5:12:11<9:45:33,  8.34s/it, gpt_loss=0.298, loss_mean=0.344][A
+Train step of epoch 0:  35%|███▍      | 2222/6434 [5:12:20<9:45:33,  8.34s/it, gpt_loss=0.26, loss_mean=0.336] [A
+Train step of epoch 0:  35%|███▍      | 2223/6434 [5:12:20<9:49:33,  8.40s/it, gpt_loss=0.26, loss_mean=0.336][A
+Train step of epoch 0:  35%|███▍      | 2223/6434 [5:12:28<9:49:33,  8.40s/it, gpt_loss=0.346, loss_mean=0.337][A
+Train step of epoch 0:  35%|███▍      | 2224/6434 [5:12:28<9:58:38,  8.53s/it, gpt_loss=0.346, loss_mean=0.337][A
+Train step of epoch 0:  35%|███▍      | 2224/6434 [5:12:38<9:58:38,  8.53s/it, gpt_loss=0.271, loss_mean=0.33] [A
+Train step of epoch 0:  35%|███▍      | 2225/6434 [5:12:38<10:30:18,  8.99s/it, gpt_loss=0.271, loss_mean=0.33][A
+Train step of epoch 0:  35%|███▍      | 2225/6434 [5:12:47<10:30:18,  8.99s/it, gpt_loss=0.461, loss_mean=0.343][A
+Train step of epoch 0:  35%|███▍      | 2226/6434 [5:12:47<10:30:35,  8.99s/it, gpt_loss=0.461, loss_mean=0.343][A
+Train step of epoch 0:  35%|███▍      | 2226/6434 [5:12:55<10:30:35,  8.99s/it, gpt_loss=0.351, loss_mean=0.344][A
+Train step of epoch 0:  35%|███▍      | 2227/6434 [5:12:55<10:10:04,  8.70s/it, gpt_loss=0.351, loss_mean=0.344][A
+Train step of epoch 0:  35%|███▍      | 2227/6434 [5:13:04<10:10:04,  8.70s/it, gpt_loss=0.292, loss_mean=0.339][A
+Train step of epoch 0:  35%|███▍      | 2228/6434 [5:13:04<10:05:07,  8.63s/it, gpt_loss=0.292, loss_mean=0.339][A
+Train step of epoch 0:  35%|███▍      | 2228/6434 [5:13:11<10:05:07,  8.63s/it, gpt_loss=0.359, loss_mean=0.341][A
+Train step of epoch 0:  35%|███▍      | 2229/6434 [5:13:11<9:40:18,  8.28s/it, gpt_loss=0.359, loss_mean=0.341] [A
+[LID Router Debug] Step: 2230
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [6, 0, 5, 0, 9, 0, 3, 4, 5, 3]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  35%|███▍      | 2229/6434 [5:13:20<9:40:18,  8.28s/it, gpt_loss=0.293, loss_mean=0.336][A
+Train step of epoch 0:  35%|███▍      | 2230/6434 [5:13:20<9:46:49,  8.38s/it, gpt_loss=0.293, loss_mean=0.336][A
+Train step of epoch 0:  35%|███▍      | 2230/6434 [5:13:28<9:46:49,  8.38s/it, gpt_loss=0.31, loss_mean=0.333] [A
+Train step of epoch 0:  35%|███▍      | 2231/6434 [5:13:28<9:31:08,  8.15s/it, gpt_loss=0.31, loss_mean=0.333][A
+Train step of epoch 0:  35%|███▍      | 2231/6434 [5:13:36<9:31:08,  8.15s/it, gpt_loss=0.308, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▍      | 2232/6434 [5:13:36<9:36:59,  8.24s/it, gpt_loss=0.308, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▍      | 2232/6434 [5:13:45<9:36:59,  8.24s/it, gpt_loss=0.273, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▍      | 2233/6434 [5:13:45<9:56:19,  8.52s/it, gpt_loss=0.273, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▍      | 2233/6434 [5:13:54<9:56:19,  8.52s/it, gpt_loss=0.207, loss_mean=0.313][A
+Train step of epoch 0:  35%|███▍      | 2234/6434 [5:13:54<10:11:15,  8.73s/it, gpt_loss=0.207, loss_mean=0.313][A
+Train step of epoch 0:  35%|███▍      | 2234/6434 [5:14:04<10:11:15,  8.73s/it, gpt_loss=0.246, loss_mean=0.306][A
+Train step of epoch 0:  35%|███▍      | 2235/6434 [5:14:04<10:28:08,  8.98s/it, gpt_loss=0.246, loss_mean=0.306][A
+Train step of epoch 0:  35%|███▍      | 2235/6434 [5:14:13<10:28:08,  8.98s/it, gpt_loss=0.341, loss_mean=0.31] [A
+Train step of epoch 0:  35%|███▍      | 2236/6434 [5:14:13<10:36:07,  9.09s/it, gpt_loss=0.341, loss_mean=0.31][A
+Train step of epoch 0:  35%|███▍      | 2236/6434 [5:14:22<10:36:07,  9.09s/it, gpt_loss=0.227, loss_mean=0.302][A
+Train step of epoch 0:  35%|███▍      | 2237/6434 [5:14:22<10:22:18,  8.90s/it, gpt_loss=0.227, loss_mean=0.302][A
+Train step of epoch 0:  35%|███▍      | 2237/6434 [5:14:29<10:22:18,  8.90s/it, gpt_loss=0.345, loss_mean=0.306][A
+Train step of epoch 0:  35%|███▍      | 2238/6434 [5:14:29<9:50:44,  8.45s/it, gpt_loss=0.345, loss_mean=0.306] [A
+Train step of epoch 0:  35%|███▍      | 2238/6434 [5:14:37<9:50:44,  8.45s/it, gpt_loss=0.273, loss_mean=0.303][A
+Train step of epoch 0:  35%|███▍      | 2239/6434 [5:14:37<9:42:11,  8.33s/it, gpt_loss=0.273, loss_mean=0.303][A
+[LID Router Debug] Step: 2240
+Batch Size: 10
+Audio Batch Size: 72
+LID Assignments: [5, 1, 6, 1, 0, 9, 2, 4, 2, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  35%|███▍      | 2239/6434 [5:14:46<9:42:11,  8.33s/it, gpt_loss=0.392, loss_mean=0.312][A
+Train step of epoch 0:  35%|███▍      | 2240/6434 [5:14:46<9:44:22,  8.36s/it, gpt_loss=0.392, loss_mean=0.312][A
+Train step of epoch 0:  35%|███▍      | 2240/6434 [5:14:55<9:44:22,  8.36s/it, gpt_loss=0.415, loss_mean=0.322][A
+Train step of epoch 0:  35%|███▍      | 2241/6434 [5:14:55<10:06:41,  8.68s/it, gpt_loss=0.415, loss_mean=0.322][A
+Train step of epoch 0:  35%|███▍      | 2241/6434 [5:15:03<10:06:41,  8.68s/it, gpt_loss=0.307, loss_mean=0.32] [A
+Train step of epoch 0:  35%|███▍      | 2242/6434 [5:15:03<9:52:14,  8.48s/it, gpt_loss=0.307, loss_mean=0.32] [A
+Train step of epoch 0:  35%|███▍      | 2242/6434 [5:15:11<9:52:14,  8.48s/it, gpt_loss=0.434, loss_mean=0.332][A
+Train step of epoch 0:  35%|███▍      | 2243/6434 [5:15:11<9:30:37,  8.17s/it, gpt_loss=0.434, loss_mean=0.332][A
+Train step of epoch 0:  35%|███▍      | 2243/6434 [5:15:19<9:30:37,  8.17s/it, gpt_loss=0.284, loss_mean=0.327][A
+Train step of epoch 0:  35%|███▍      | 2244/6434 [5:15:19<9:29:19,  8.15s/it, gpt_loss=0.284, loss_mean=0.327][A
+Train step of epoch 0:  35%|███▍      | 2244/6434 [5:15:26<9:29:19,  8.15s/it, gpt_loss=0.341, loss_mean=0.328][A
+Train step of epoch 0:  35%|███▍      | 2245/6434 [5:15:26<9:20:16,  8.02s/it, gpt_loss=0.341, loss_mean=0.328][A
+Train step of epoch 0:  35%|███▍      | 2245/6434 [5:15:35<9:20:16,  8.02s/it, gpt_loss=0.296, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▍      | 2246/6434 [5:15:35<9:32:23,  8.20s/it, gpt_loss=0.296, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▍      | 2246/6434 [5:15:43<9:32:23,  8.20s/it, gpt_loss=0.265, loss_mean=0.319][A
+Train step of epoch 0:  35%|███▍      | 2247/6434 [5:15:43<9:32:32,  8.20s/it, gpt_loss=0.265, loss_mean=0.319][A
+Train step of epoch 0:  35%|███▍      | 2247/6434 [5:15:52<9:32:32,  8.20s/it, gpt_loss=0.353, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▍      | 2248/6434 [5:15:52<9:54:48,  8.53s/it, gpt_loss=0.353, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▍      | 2248/6434 [5:16:01<9:54:48,  8.53s/it, gpt_loss=0.395, loss_mean=0.33] [A
+Train step of epoch 0:  35%|███▍      | 2249/6434 [5:16:01<9:46:14,  8.40s/it, gpt_loss=0.395, loss_mean=0.33][A
+[LID Router Debug] Step: 2250
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [4, 6, 2, 1, 2, 2, 9, 9, 2, 2]
+Active Experts in Batch: {1, 2, 4, 6, 9}
+
+Train step of epoch 0:  35%|███▍      | 2249/6434 [5:16:10<9:46:14,  8.40s/it, gpt_loss=0.322, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▍      | 2250/6434 [5:16:10<10:11:54,  8.78s/it, gpt_loss=0.322, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▍      | 2250/6434 [5:16:19<10:11:54,  8.78s/it, gpt_loss=0.354, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▍      | 2251/6434 [5:16:19<10:12:47,  8.79s/it, gpt_loss=0.354, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▍      | 2251/6434 [5:16:28<10:12:47,  8.79s/it, gpt_loss=0.269, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▌      | 2252/6434 [5:16:28<10:09:13,  8.74s/it, gpt_loss=0.269, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▌      | 2252/6434 [5:16:35<10:09:13,  8.74s/it, gpt_loss=0.278, loss_mean=0.321][A
+Train step of epoch 0:  35%|███▌      | 2253/6434 [5:16:35<9:39:05,  8.31s/it, gpt_loss=0.278, loss_mean=0.321] [A
+Train step of epoch 0:  35%|███▌      | 2253/6434 [5:16:44<9:39:05,  8.31s/it, gpt_loss=0.296, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2254/6434 [5:16:44<9:44:08,  8.38s/it, gpt_loss=0.296, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2254/6434 [5:16:53<9:44:08,  8.38s/it, gpt_loss=0.292, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2255/6434 [5:16:53<10:04:28,  8.68s/it, gpt_loss=0.292, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2255/6434 [5:17:03<10:04:28,  8.68s/it, gpt_loss=0.354, loss_mean=0.319][A
+Train step of epoch 0:  35%|███▌      | 2256/6434 [5:17:03<10:24:29,  8.97s/it, gpt_loss=0.354, loss_mean=0.319][A
+Train step of epoch 0:  35%|███▌      | 2256/6434 [5:17:11<10:24:29,  8.97s/it, gpt_loss=0.407, loss_mean=0.328][A
+Train step of epoch 0:  35%|███▌      | 2257/6434 [5:17:11<10:09:19,  8.75s/it, gpt_loss=0.407, loss_mean=0.328][A
+Train step of epoch 0:  35%|███▌      | 2257/6434 [5:17:19<10:09:19,  8.75s/it, gpt_loss=0.345, loss_mean=0.33] [A
+Train step of epoch 0:  35%|███▌      | 2258/6434 [5:17:19<9:59:41,  8.62s/it, gpt_loss=0.345, loss_mean=0.33] [A
+Train step of epoch 0:  35%|███▌      | 2258/6434 [5:17:27<9:59:41,  8.62s/it, gpt_loss=0.375, loss_mean=0.334][A
+Train step of epoch 0:  35%|███▌      | 2259/6434 [5:17:27<9:43:45,  8.39s/it, gpt_loss=0.375, loss_mean=0.334][A
+[LID Router Debug] Step: 2260
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [2, 0, 4, 0, 9, 1, 3, 3, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  35%|███▌      | 2259/6434 [5:17:35<9:43:45,  8.39s/it, gpt_loss=0.307, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▌      | 2260/6434 [5:17:35<9:45:41,  8.42s/it, gpt_loss=0.307, loss_mean=0.331][A
+Train step of epoch 0:  35%|███▌      | 2260/6434 [5:17:45<9:45:41,  8.42s/it, gpt_loss=0.291, loss_mean=0.327][A
+Train step of epoch 0:  35%|███▌      | 2261/6434 [5:17:45<10:19:06,  8.90s/it, gpt_loss=0.291, loss_mean=0.327][A
+Train step of epoch 0:  35%|███▌      | 2261/6434 [5:17:54<10:19:06,  8.90s/it, gpt_loss=0.289, loss_mean=0.324][A
+Train step of epoch 0:  35%|███▌      | 2262/6434 [5:17:54<10:03:26,  8.68s/it, gpt_loss=0.289, loss_mean=0.324][A
+Train step of epoch 0:  35%|███▌      | 2262/6434 [5:18:03<10:03:26,  8.68s/it, gpt_loss=0.286, loss_mean=0.32] [A
+Train step of epoch 0:  35%|███▌      | 2263/6434 [5:18:03<10:13:46,  8.83s/it, gpt_loss=0.286, loss_mean=0.32][A
+Train step of epoch 0:  35%|███▌      | 2263/6434 [5:18:11<10:13:46,  8.83s/it, gpt_loss=0.302, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2264/6434 [5:18:11<10:09:36,  8.77s/it, gpt_loss=0.302, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2264/6434 [5:18:20<10:09:36,  8.77s/it, gpt_loss=0.296, loss_mean=0.316][A
+Train step of epoch 0:  35%|███▌      | 2265/6434 [5:18:20<10:11:59,  8.81s/it, gpt_loss=0.296, loss_mean=0.316][A
+Train step of epoch 0:  35%|███▌      | 2265/6434 [5:18:29<10:11:59,  8.81s/it, gpt_loss=0.374, loss_mean=0.322][A
+Train step of epoch 0:  35%|███▌      | 2266/6434 [5:18:29<10:14:09,  8.84s/it, gpt_loss=0.374, loss_mean=0.322][A
+Train step of epoch 0:  35%|███▌      | 2266/6434 [5:18:38<10:14:09,  8.84s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▌      | 2267/6434 [5:18:38<10:07:01,  8.74s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▌      | 2267/6434 [5:18:47<10:07:01,  8.74s/it, gpt_loss=0.325, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▌      | 2268/6434 [5:18:47<10:15:08,  8.86s/it, gpt_loss=0.325, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▌      | 2268/6434 [5:18:56<10:15:08,  8.86s/it, gpt_loss=0.356, loss_mean=0.327][A
+Train step of epoch 0:  35%|███▌      | 2269/6434 [5:18:56<10:25:09,  9.01s/it, gpt_loss=0.356, loss_mean=0.327][A
+[LID Router Debug] Step: 2270
+Batch Size: 10
+Audio Batch Size: 72
+LID Assignments: [4, 5, 1, 4, 4, 2, 5, 6, 5, 5]
+Active Experts in Batch: {1, 2, 4, 5, 6}
+
+Train step of epoch 0:  35%|███▌      | 2269/6434 [5:19:04<10:25:09,  9.01s/it, gpt_loss=0.349, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▌      | 2270/6434 [5:19:04<10:03:22,  8.69s/it, gpt_loss=0.349, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▌      | 2270/6434 [5:19:13<10:03:22,  8.69s/it, gpt_loss=0.292, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▌      | 2271/6434 [5:19:13<10:12:17,  8.82s/it, gpt_loss=0.292, loss_mean=0.325][A
+Train step of epoch 0:  35%|███▌      | 2271/6434 [5:19:23<10:12:17,  8.82s/it, gpt_loss=0.399, loss_mean=0.333][A
+Train step of epoch 0:  35%|███▌      | 2272/6434 [5:19:23<10:24:35,  9.00s/it, gpt_loss=0.399, loss_mean=0.333][A
+Train step of epoch 0:  35%|███▌      | 2272/6434 [5:19:31<10:24:35,  9.00s/it, gpt_loss=0.294, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▌      | 2273/6434 [5:19:31<10:09:11,  8.78s/it, gpt_loss=0.294, loss_mean=0.329][A
+Train step of epoch 0:  35%|███▌      | 2273/6434 [5:19:41<10:09:11,  8.78s/it, gpt_loss=0.27, loss_mean=0.323] [A
+Train step of epoch 0:  35%|███▌      | 2274/6434 [5:19:41<10:34:27,  9.15s/it, gpt_loss=0.27, loss_mean=0.323][A
+Train step of epoch 0:  35%|███▌      | 2274/6434 [5:19:49<10:34:27,  9.15s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2275/6434 [5:19:49<10:12:36,  8.84s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  35%|███▌      | 2275/6434 [5:19:57<10:12:36,  8.84s/it, gpt_loss=0.293, loss_mean=0.316][A
+Train step of epoch 0:  35%|███▌      | 2276/6434 [5:19:57<9:56:16,  8.60s/it, gpt_loss=0.293, loss_mean=0.316] [A
+Train step of epoch 0:  35%|███▌      | 2276/6434 [5:20:06<9:56:16,  8.60s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2277/6434 [5:20:06<9:50:34,  8.52s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2277/6434 [5:20:15<9:50:34,  8.52s/it, gpt_loss=0.314, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2278/6434 [5:20:15<10:16:46,  8.90s/it, gpt_loss=0.314, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2278/6434 [5:20:24<10:16:46,  8.90s/it, gpt_loss=0.313, loss_mean=0.315][A
+Train step of epoch 0:  35%|███▌      | 2279/6434 [5:20:24<10:04:17,  8.73s/it, gpt_loss=0.313, loss_mean=0.315][A
+[LID Router Debug] Step: 2280
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [2, 9, 4, 2, 3, 1, 10, 9, 9, 2]
+Active Experts in Batch: {1, 2, 3, 4, 9, 10}
+
+Train step of epoch 0:  35%|███▌      | 2279/6434 [5:20:32<10:04:17,  8.73s/it, gpt_loss=0.301, loss_mean=0.314][A
+Train step of epoch 0:  35%|███▌      | 2280/6434 [5:20:32<10:04:41,  8.73s/it, gpt_loss=0.301, loss_mean=0.314][A
+Train step of epoch 0:  35%|███▌      | 2280/6434 [5:20:41<10:04:41,  8.73s/it, gpt_loss=0.289, loss_mean=0.311][A
+Train step of epoch 0:  35%|███▌      | 2281/6434 [5:20:41<10:01:37,  8.69s/it, gpt_loss=0.289, loss_mean=0.311][A
+Train step of epoch 0:  35%|███▌      | 2281/6434 [5:20:50<10:01:37,  8.69s/it, gpt_loss=0.324, loss_mean=0.312][A
+Train step of epoch 0:  35%|███▌      | 2282/6434 [5:20:50<9:57:20,  8.63s/it, gpt_loss=0.324, loss_mean=0.312] [A
+Train step of epoch 0:  35%|███▌      | 2282/6434 [5:20:59<9:57:20,  8.63s/it, gpt_loss=0.304, loss_mean=0.312][A
+Train step of epoch 0:  35%|███▌      | 2283/6434 [5:20:59<10:09:21,  8.81s/it, gpt_loss=0.304, loss_mean=0.312][A
+Train step of epoch 0:  35%|███▌      | 2283/6434 [5:21:07<10:09:21,  8.81s/it, gpt_loss=0.405, loss_mean=0.321][A
+Train step of epoch 0:  35%|███▌      | 2284/6434 [5:21:07<9:58:18,  8.65s/it, gpt_loss=0.405, loss_mean=0.321] [A
+Train step of epoch 0:  35%|███▌      | 2284/6434 [5:21:14<9:58:18,  8.65s/it, gpt_loss=0.3, loss_mean=0.319]  [A
+Train step of epoch 0:  36%|███▌      | 2285/6434 [5:21:14<9:20:02,  8.10s/it, gpt_loss=0.3, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▌      | 2285/6434 [5:21:22<9:20:02,  8.10s/it, gpt_loss=0.338, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2286/6434 [5:21:22<9:25:54,  8.19s/it, gpt_loss=0.338, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2286/6434 [5:21:30<9:25:54,  8.19s/it, gpt_loss=0.355, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▌      | 2287/6434 [5:21:30<9:19:17,  8.09s/it, gpt_loss=0.355, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▌      | 2287/6434 [5:21:39<9:19:17,  8.09s/it, gpt_loss=0.249, loss_mean=0.317][A
+Train step of epoch 0:  36%|███▌      | 2288/6434 [5:21:39<9:32:03,  8.28s/it, gpt_loss=0.249, loss_mean=0.317][A
+Train step of epoch 0:  36%|███▌      | 2288/6434 [5:21:47<9:32:03,  8.28s/it, gpt_loss=0.355, loss_mean=0.32] [A
+Train step of epoch 0:  36%|███▌      | 2289/6434 [5:21:47<9:23:44,  8.16s/it, gpt_loss=0.355, loss_mean=0.32][A
+[LID Router Debug] Step: 2290
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [6, 5, 6, 0, 0, 4, 4, 2, 9, 2]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  36%|███▌      | 2289/6434 [5:21:55<9:23:44,  8.16s/it, gpt_loss=0.254, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▌      | 2290/6434 [5:21:55<9:30:34,  8.26s/it, gpt_loss=0.254, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▌      | 2290/6434 [5:22:04<9:30:34,  8.26s/it, gpt_loss=0.312, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▌      | 2291/6434 [5:22:04<9:34:20,  8.32s/it, gpt_loss=0.312, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▌      | 2291/6434 [5:22:12<9:34:20,  8.32s/it, gpt_loss=0.329, loss_mean=0.315][A
+Train step of epoch 0:  36%|███▌      | 2292/6434 [5:22:12<9:44:51,  8.47s/it, gpt_loss=0.329, loss_mean=0.315][A
+Train step of epoch 0:  36%|███▌      | 2292/6434 [5:22:22<9:44:51,  8.47s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  36%|███▌      | 2293/6434 [5:22:22<10:00:32,  8.70s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  36%|███▌      | 2293/6434 [5:22:30<10:00:32,  8.70s/it, gpt_loss=0.312, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▌      | 2294/6434 [5:22:30<9:57:18,  8.66s/it, gpt_loss=0.312, loss_mean=0.314] [A
+Train step of epoch 0:  36%|███▌      | 2294/6434 [5:22:39<9:57:18,  8.66s/it, gpt_loss=0.271, loss_mean=0.31] [A
+Train step of epoch 0:  36%|███▌      | 2295/6434 [5:22:39<9:58:46,  8.68s/it, gpt_loss=0.271, loss_mean=0.31][A
+Train step of epoch 0:  36%|███▌      | 2295/6434 [5:22:48<9:58:46,  8.68s/it, gpt_loss=0.283, loss_mean=0.307][A
+Train step of epoch 0:  36%|███▌      | 2296/6434 [5:22:48<10:02:33,  8.74s/it, gpt_loss=0.283, loss_mean=0.307][A
+Train step of epoch 0:  36%|███▌      | 2296/6434 [5:22:56<10:02:33,  8.74s/it, gpt_loss=0.38, loss_mean=0.315] [A
+Train step of epoch 0:  36%|███▌      | 2297/6434 [5:22:56<9:59:24,  8.69s/it, gpt_loss=0.38, loss_mean=0.315] [A
+Train step of epoch 0:  36%|███▌      | 2297/6434 [5:23:04<9:59:24,  8.69s/it, gpt_loss=0.394, loss_mean=0.323][A
+Train step of epoch 0:  36%|███▌      | 2298/6434 [5:23:04<9:29:14,  8.26s/it, gpt_loss=0.394, loss_mean=0.323][A
+Train step of epoch 0:  36%|███▌      | 2298/6434 [5:23:13<9:29:14,  8.26s/it, gpt_loss=0.316, loss_mean=0.322][A
+Train step of epoch 0:  36%|███▌      | 2299/6434 [5:23:13<9:51:37,  8.58s/it, gpt_loss=0.316, loss_mean=0.322][A
+[LID Router Debug] Step: 2300
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 2, 3, 9, 5, 0, 9, 0, 5, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  36%|███▌      | 2299/6434 [5:23:21<9:51:37,  8.58s/it, gpt_loss=0.29, loss_mean=0.319] [A
+Train step of epoch 0:  36%|███▌      | 2300/6434 [5:23:21<9:40:15,  8.42s/it, gpt_loss=0.29, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▌      | 2300/6434 [5:23:29<9:40:15,  8.42s/it, gpt_loss=0.312, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2301/6434 [5:23:29<9:20:16,  8.13s/it, gpt_loss=0.312, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2301/6434 [5:23:39<9:20:16,  8.13s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2302/6434 [5:23:39<10:08:33,  8.84s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2302/6434 [5:23:47<10:08:33,  8.84s/it, gpt_loss=0.363, loss_mean=0.322][A
+Train step of epoch 0:  36%|███▌      | 2303/6434 [5:23:47<9:51:34,  8.59s/it, gpt_loss=0.363, loss_mean=0.322] [A
+Train step of epoch 0:  36%|███▌      | 2303/6434 [5:23:56<9:51:34,  8.59s/it, gpt_loss=0.273, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2304/6434 [5:23:56<9:53:54,  8.63s/it, gpt_loss=0.273, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2304/6434 [5:24:05<9:53:54,  8.63s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2305/6434 [5:24:05<10:02:28,  8.75s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  36%|███▌      | 2305/6434 [5:24:13<10:02:28,  8.75s/it, gpt_loss=0.329, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▌      | 2306/6434 [5:24:13<9:56:54,  8.68s/it, gpt_loss=0.329, loss_mean=0.319] [A
+Train step of epoch 0:  36%|███▌      | 2306/6434 [5:24:22<9:56:54,  8.68s/it, gpt_loss=0.346, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2307/6434 [5:24:22<9:52:36,  8.62s/it, gpt_loss=0.346, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2307/6434 [5:24:30<9:52:36,  8.62s/it, gpt_loss=0.282, loss_mean=0.317][A
+Train step of epoch 0:  36%|███▌      | 2308/6434 [5:24:30<9:53:35,  8.63s/it, gpt_loss=0.282, loss_mean=0.317][A
+Train step of epoch 0:  36%|███▌      | 2308/6434 [5:24:38<9:53:35,  8.63s/it, gpt_loss=0.361, loss_mean=0.322][A
+Train step of epoch 0:  36%|███▌      | 2309/6434 [5:24:38<9:41:02,  8.45s/it, gpt_loss=0.361, loss_mean=0.322][A
+[LID Router Debug] Step: 2310
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [5, 9, 5, 0, 1, 2, 4, 4, 1, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  36%|███▌      | 2309/6434 [5:24:46<9:41:02,  8.45s/it, gpt_loss=0.311, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2310/6434 [5:24:46<9:16:27,  8.10s/it, gpt_loss=0.311, loss_mean=0.321][A
+Train step of epoch 0:  36%|███▌      | 2310/6434 [5:24:54<9:16:27,  8.10s/it, gpt_loss=0.454, loss_mean=0.334][A
+Train step of epoch 0:  36%|███▌      | 2311/6434 [5:24:54<9:12:37,  8.04s/it, gpt_loss=0.454, loss_mean=0.334][A
+Train step of epoch 0:  36%|███▌      | 2311/6434 [5:25:02<9:12:37,  8.04s/it, gpt_loss=0.293, loss_mean=0.33] [A
+Train step of epoch 0:  36%|███▌      | 2312/6434 [5:25:02<9:11:29,  8.03s/it, gpt_loss=0.293, loss_mean=0.33][A
+Train step of epoch 0:  36%|███▌      | 2312/6434 [5:25:10<9:11:29,  8.03s/it, gpt_loss=0.273, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▌      | 2313/6434 [5:25:10<9:18:40,  8.13s/it, gpt_loss=0.273, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▌      | 2313/6434 [5:25:18<9:18:40,  8.13s/it, gpt_loss=0.367, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2314/6434 [5:25:18<9:25:35,  8.24s/it, gpt_loss=0.367, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2314/6434 [5:25:26<9:25:35,  8.24s/it, gpt_loss=0.339, loss_mean=0.33] [A
+Train step of epoch 0:  36%|███▌      | 2315/6434 [5:25:26<9:06:56,  7.97s/it, gpt_loss=0.339, loss_mean=0.33][A
+Train step of epoch 0:  36%|███▌      | 2315/6434 [5:25:35<9:06:56,  7.97s/it, gpt_loss=0.368, loss_mean=0.333][A
+Train step of epoch 0:  36%|███▌      | 2316/6434 [5:25:35<9:25:19,  8.24s/it, gpt_loss=0.368, loss_mean=0.333][A
+Train step of epoch 0:  36%|███▌      | 2316/6434 [5:25:44<9:25:19,  8.24s/it, gpt_loss=0.247, loss_mean=0.325][A
+Train step of epoch 0:  36%|███▌      | 2317/6434 [5:25:44<9:37:51,  8.42s/it, gpt_loss=0.247, loss_mean=0.325][A
+Train step of epoch 0:  36%|███▌      | 2317/6434 [5:25:52<9:37:51,  8.42s/it, gpt_loss=0.432, loss_mean=0.335][A
+Train step of epoch 0:  36%|███▌      | 2318/6434 [5:25:52<9:40:46,  8.47s/it, gpt_loss=0.432, loss_mean=0.335][A
+Train step of epoch 0:  36%|███▌      | 2318/6434 [5:26:01<9:40:46,  8.47s/it, gpt_loss=0.279, loss_mean=0.33] [A
+Train step of epoch 0:  36%|███▌      | 2319/6434 [5:26:01<9:38:53,  8.44s/it, gpt_loss=0.279, loss_mean=0.33][A
+[LID Router Debug] Step: 2320
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [9, 9, 2, 9, 4, 3, 2, 0, 4, 9]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+Train step of epoch 0:  36%|███▌      | 2319/6434 [5:26:09<9:38:53,  8.44s/it, gpt_loss=0.388, loss_mean=0.336][A
+Train step of epoch 0:  36%|███▌      | 2320/6434 [5:26:09<9:47:33,  8.57s/it, gpt_loss=0.388, loss_mean=0.336][A
+Train step of epoch 0:  36%|███▌      | 2320/6434 [5:26:18<9:47:33,  8.57s/it, gpt_loss=0.305, loss_mean=0.333][A
+Train step of epoch 0:  36%|███▌      | 2321/6434 [5:26:18<9:48:09,  8.58s/it, gpt_loss=0.305, loss_mean=0.333][A
+Train step of epoch 0:  36%|███▌      | 2321/6434 [5:26:26<9:48:09,  8.58s/it, gpt_loss=0.293, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2322/6434 [5:26:26<9:29:19,  8.31s/it, gpt_loss=0.293, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2322/6434 [5:26:36<9:29:19,  8.31s/it, gpt_loss=0.409, loss_mean=0.337][A
+Train step of epoch 0:  36%|███▌      | 2323/6434 [5:26:36<10:10:02,  8.90s/it, gpt_loss=0.409, loss_mean=0.337][A
+Train step of epoch 0:  36%|███▌      | 2323/6434 [5:26:45<10:10:02,  8.90s/it, gpt_loss=0.326, loss_mean=0.336][A
+Train step of epoch 0:  36%|███▌      | 2324/6434 [5:26:45<10:07:56,  8.88s/it, gpt_loss=0.326, loss_mean=0.336][A
+Train step of epoch 0:  36%|███▌      | 2324/6434 [5:26:54<10:07:56,  8.88s/it, gpt_loss=0.268, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2325/6434 [5:26:54<10:10:25,  8.91s/it, gpt_loss=0.268, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▌      | 2325/6434 [5:27:02<10:10:25,  8.91s/it, gpt_loss=0.391, loss_mean=0.335][A
+Train step of epoch 0:  36%|███▌      | 2326/6434 [5:27:02<9:51:03,  8.63s/it, gpt_loss=0.391, loss_mean=0.335] [A
+Train step of epoch 0:  36%|███▌      | 2326/6434 [5:27:10<9:51:03,  8.63s/it, gpt_loss=0.495, loss_mean=0.351][A
+Train step of epoch 0:  36%|███▌      | 2327/6434 [5:27:10<9:49:56,  8.62s/it, gpt_loss=0.495, loss_mean=0.351][A
+Train step of epoch 0:  36%|███▌      | 2327/6434 [5:27:19<9:49:56,  8.62s/it, gpt_loss=0.338, loss_mean=0.35] [A
+Train step of epoch 0:  36%|███▌      | 2328/6434 [5:27:19<9:47:13,  8.58s/it, gpt_loss=0.338, loss_mean=0.35][A
+Train step of epoch 0:  36%|███▌      | 2328/6434 [5:27:27<9:47:13,  8.58s/it, gpt_loss=0.263, loss_mean=0.341][A
+Train step of epoch 0:  36%|███▌      | 2329/6434 [5:27:27<9:46:49,  8.58s/it, gpt_loss=0.263, loss_mean=0.341][A
+[LID Router Debug] Step: 2330
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [2, 9, 1, 9, 9, 1, 4, 1, 4, 3]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+
+Train step of epoch 0:  36%|███▌      | 2329/6434 [5:27:35<9:46:49,  8.58s/it, gpt_loss=0.336, loss_mean=0.341][A
+Train step of epoch 0:  36%|███▌      | 2330/6434 [5:27:35<9:35:32,  8.41s/it, gpt_loss=0.336, loss_mean=0.341][A
+Train step of epoch 0:  36%|███▌      | 2330/6434 [5:27:44<9:35:32,  8.41s/it, gpt_loss=0.306, loss_mean=0.337][A
+Train step of epoch 0:  36%|███▌      | 2331/6434 [5:27:44<9:43:09,  8.53s/it, gpt_loss=0.306, loss_mean=0.337][A
+Train step of epoch 0:  36%|███▌      | 2331/6434 [5:27:52<9:43:09,  8.53s/it, gpt_loss=0.281, loss_mean=0.331][A
+Train step of epoch 0:  36%|███▌      | 2332/6434 [5:27:52<9:31:51,  8.36s/it, gpt_loss=0.281, loss_mean=0.331][A
+Train step of epoch 0:  36%|███▌      | 2332/6434 [5:28:01<9:31:51,  8.36s/it, gpt_loss=0.262, loss_mean=0.325][A
+Train step of epoch 0:  36%|███▋      | 2333/6434 [5:28:01<9:50:52,  8.64s/it, gpt_loss=0.262, loss_mean=0.325][A
+Train step of epoch 0:  36%|███▋      | 2333/6434 [5:28:10<9:50:52,  8.64s/it, gpt_loss=0.324, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▋      | 2334/6434 [5:28:10<9:42:32,  8.52s/it, gpt_loss=0.324, loss_mean=0.324][A
+Train step of epoch 0:  36%|███▋      | 2334/6434 [5:28:18<9:42:32,  8.52s/it, gpt_loss=0.275, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▋      | 2335/6434 [5:28:18<9:39:04,  8.48s/it, gpt_loss=0.275, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▋      | 2335/6434 [5:28:26<9:39:04,  8.48s/it, gpt_loss=0.215, loss_mean=0.309][A
+Train step of epoch 0:  36%|███▋      | 2336/6434 [5:28:26<9:33:59,  8.40s/it, gpt_loss=0.215, loss_mean=0.309][A
+Train step of epoch 0:  36%|███▋      | 2336/6434 [5:28:36<9:33:59,  8.40s/it, gpt_loss=0.343, loss_mean=0.312][A
+Train step of epoch 0:  36%|███▋      | 2337/6434 [5:28:36<9:54:13,  8.70s/it, gpt_loss=0.343, loss_mean=0.312][A
+Train step of epoch 0:  36%|███▋      | 2337/6434 [5:28:44<9:54:13,  8.70s/it, gpt_loss=0.325, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▋      | 2338/6434 [5:28:44<9:49:36,  8.64s/it, gpt_loss=0.325, loss_mean=0.314][A
+Train step of epoch 0:  36%|███▋      | 2338/6434 [5:28:52<9:49:36,  8.64s/it, gpt_loss=0.331, loss_mean=0.315][A
+Train step of epoch 0:  36%|███▋      | 2339/6434 [5:28:52<9:39:18,  8.49s/it, gpt_loss=0.331, loss_mean=0.315][A
+[LID Router Debug] Step: 2340
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [0, 1, 0, 9, 0, 0, 3, 2, 9, 6]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+Train step of epoch 0:  36%|███▋      | 2339/6434 [5:29:02<9:39:18,  8.49s/it, gpt_loss=0.255, loss_mean=0.309][A
+Train step of epoch 0:  36%|███▋      | 2340/6434 [5:29:02<10:02:43,  8.83s/it, gpt_loss=0.255, loss_mean=0.309][A
+Train step of epoch 0:  36%|███▋      | 2340/6434 [5:29:10<10:02:43,  8.83s/it, gpt_loss=0.334, loss_mean=0.312][A
+Train step of epoch 0:  36%|███▋      | 2341/6434 [5:29:10<9:53:47,  8.70s/it, gpt_loss=0.334, loss_mean=0.312] [A
+Train step of epoch 0:  36%|███▋      | 2341/6434 [5:29:19<9:53:47,  8.70s/it, gpt_loss=0.297, loss_mean=0.31] [A
+Train step of epoch 0:  36%|███▋      | 2342/6434 [5:29:19<10:00:32,  8.81s/it, gpt_loss=0.297, loss_mean=0.31][A
+Train step of epoch 0:  36%|███▋      | 2342/6434 [5:29:29<10:00:32,  8.81s/it, gpt_loss=0.329, loss_mean=0.312][A
+Train step of epoch 0:  36%|███▋      | 2343/6434 [5:29:29<10:20:26,  9.10s/it, gpt_loss=0.329, loss_mean=0.312][A
+Train step of epoch 0:  36%|███▋      | 2343/6434 [5:29:37<10:20:26,  9.10s/it, gpt_loss=0.379, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▋      | 2344/6434 [5:29:37<10:02:38,  8.84s/it, gpt_loss=0.379, loss_mean=0.319][A
+Train step of epoch 0:  36%|███▋      | 2344/6434 [5:29:47<10:02:38,  8.84s/it, gpt_loss=0.351, loss_mean=0.322][A
+Train step of epoch 0:  36%|███▋      | 2345/6434 [5:29:47<10:08:50,  8.93s/it, gpt_loss=0.351, loss_mean=0.322][A
+Train step of epoch 0:  36%|███▋      | 2345/6434 [5:29:57<10:08:50,  8.93s/it, gpt_loss=0.378, loss_mean=0.328][A
+Train step of epoch 0:  36%|███▋      | 2346/6434 [5:29:57<10:34:27,  9.31s/it, gpt_loss=0.378, loss_mean=0.328][A
+Train step of epoch 0:  36%|███▋      | 2346/6434 [5:30:06<10:34:27,  9.31s/it, gpt_loss=0.251, loss_mean=0.32] [A
+Train step of epoch 0:  36%|███▋      | 2347/6434 [5:30:06<10:41:19,  9.42s/it, gpt_loss=0.251, loss_mean=0.32][A
+Train step of epoch 0:  36%|███▋      | 2347/6434 [5:30:14<10:41:19,  9.42s/it, gpt_loss=0.411, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▋      | 2348/6434 [5:30:14<10:02:44,  8.85s/it, gpt_loss=0.411, loss_mean=0.329][A
+Train step of epoch 0:  36%|███▋      | 2348/6434 [5:30:24<10:02:44,  8.85s/it, gpt_loss=0.329, loss_mean=0.329][A
+Train step of epoch 0:  37%|███▋      | 2349/6434 [5:30:24<10:19:53,  9.10s/it, gpt_loss=0.329, loss_mean=0.329][A
+[LID Router Debug] Step: 2350
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [1, 1, 9, 4, 4, 1, 4, 3, 4, 4]
+Active Experts in Batch: {1, 3, 4, 9}
+
+Train step of epoch 0:  37%|███▋      | 2349/6434 [5:30:31<10:19:53,  9.10s/it, gpt_loss=0.26, loss_mean=0.322] [A
+Train step of epoch 0:  37%|███▋      | 2350/6434 [5:30:31<9:49:53,  8.67s/it, gpt_loss=0.26, loss_mean=0.322] [A
+Train step of epoch 0:  37%|███▋      | 2350/6434 [5:30:40<9:49:53,  8.67s/it, gpt_loss=0.311, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2351/6434 [5:30:40<9:59:16,  8.81s/it, gpt_loss=0.311, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2351/6434 [5:30:48<9:59:16,  8.81s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2352/6434 [5:30:48<9:35:42,  8.46s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2352/6434 [5:30:56<9:35:42,  8.46s/it, gpt_loss=0.345, loss_mean=0.325][A
+Train step of epoch 0:  37%|███▋      | 2353/6434 [5:30:56<9:18:00,  8.20s/it, gpt_loss=0.345, loss_mean=0.325][A
+Train step of epoch 0:  37%|███▋      | 2353/6434 [5:31:04<9:18:00,  8.20s/it, gpt_loss=0.261, loss_mean=0.319][A
+Train step of epoch 0:  37%|███▋      | 2354/6434 [5:31:04<9:11:17,  8.11s/it, gpt_loss=0.261, loss_mean=0.319][A
+Train step of epoch 0:  37%|███▋      | 2354/6434 [5:31:11<9:11:17,  8.11s/it, gpt_loss=0.263, loss_mean=0.313][A
+Train step of epoch 0:  37%|███▋      | 2355/6434 [5:31:11<9:00:45,  7.95s/it, gpt_loss=0.263, loss_mean=0.313][A
+Train step of epoch 0:  37%|███▋      | 2355/6434 [5:31:20<9:00:45,  7.95s/it, gpt_loss=0.333, loss_mean=0.315][A
+Train step of epoch 0:  37%|███▋      | 2356/6434 [5:31:20<9:13:42,  8.15s/it, gpt_loss=0.333, loss_mean=0.315][A
+Train step of epoch 0:  37%|███▋      | 2356/6434 [5:31:28<9:13:42,  8.15s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  37%|███▋      | 2357/6434 [5:31:28<9:18:23,  8.22s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  37%|███▋      | 2357/6434 [5:31:36<9:18:23,  8.22s/it, gpt_loss=0.314, loss_mean=0.314][A
+Train step of epoch 0:  37%|███▋      | 2358/6434 [5:31:36<9:16:37,  8.19s/it, gpt_loss=0.314, loss_mean=0.314][A
+Train step of epoch 0:  37%|███▋      | 2358/6434 [5:31:44<9:16:37,  8.19s/it, gpt_loss=0.37, loss_mean=0.32]  [A
+Train step of epoch 0:  37%|███▋      | 2359/6434 [5:31:44<9:00:31,  7.96s/it, gpt_loss=0.37, loss_mean=0.32][A
+[LID Router Debug] Step: 2360
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [5, 3, 8, 2, 2, 2, 1, 4, 2, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  37%|███▋      | 2359/6434 [5:31:52<9:00:31,  7.96s/it, gpt_loss=0.457, loss_mean=0.333][A
+Train step of epoch 0:  37%|███▋      | 2360/6434 [5:31:52<9:13:51,  8.16s/it, gpt_loss=0.457, loss_mean=0.333][A
+Train step of epoch 0:  37%|███▋      | 2360/6434 [5:32:00<9:13:51,  8.16s/it, gpt_loss=0.262, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2361/6434 [5:32:00<8:53:30,  7.86s/it, gpt_loss=0.262, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2361/6434 [5:32:07<8:53:30,  7.86s/it, gpt_loss=0.302, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2362/6434 [5:32:07<8:55:29,  7.89s/it, gpt_loss=0.302, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2362/6434 [5:32:17<8:55:29,  7.89s/it, gpt_loss=0.333, loss_mean=0.325][A
+Train step of epoch 0:  37%|███▋      | 2363/6434 [5:32:17<9:24:47,  8.32s/it, gpt_loss=0.333, loss_mean=0.325][A
+Train step of epoch 0:  37%|███▋      | 2363/6434 [5:32:26<9:24:47,  8.32s/it, gpt_loss=0.289, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2364/6434 [5:32:26<9:40:44,  8.56s/it, gpt_loss=0.289, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2364/6434 [5:32:34<9:40:44,  8.56s/it, gpt_loss=0.318, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2365/6434 [5:32:34<9:34:23,  8.47s/it, gpt_loss=0.318, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2365/6434 [5:32:42<9:34:23,  8.47s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2366/6434 [5:32:42<9:30:28,  8.41s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2366/6434 [5:32:51<9:30:28,  8.41s/it, gpt_loss=0.437, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2367/6434 [5:32:51<9:30:02,  8.41s/it, gpt_loss=0.437, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2367/6434 [5:33:00<9:30:02,  8.41s/it, gpt_loss=0.33, loss_mean=0.329] [A
+Train step of epoch 0:  37%|███▋      | 2368/6434 [5:33:00<9:40:31,  8.57s/it, gpt_loss=0.33, loss_mean=0.329][A
+Train step of epoch 0:  37%|███▋      | 2368/6434 [5:33:08<9:40:31,  8.57s/it, gpt_loss=0.298, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2369/6434 [5:33:08<9:37:30,  8.52s/it, gpt_loss=0.298, loss_mean=0.326][A
+[LID Router Debug] Step: 2370
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [5, 4, 9, 3, 9, 3, 6, 1, 9, 9]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  37%|███▋      | 2369/6434 [5:33:17<9:37:30,  8.52s/it, gpt_loss=0.288, loss_mean=0.322][A
+Train step of epoch 0:  37%|███▋      | 2370/6434 [5:33:17<9:42:26,  8.60s/it, gpt_loss=0.288, loss_mean=0.322][A
+Train step of epoch 0:  37%|███▋      | 2370/6434 [5:33:26<9:42:26,  8.60s/it, gpt_loss=0.269, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2371/6434 [5:33:26<9:45:48,  8.65s/it, gpt_loss=0.269, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2371/6434 [5:33:34<9:45:48,  8.65s/it, gpt_loss=0.396, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2372/6434 [5:33:34<9:39:49,  8.56s/it, gpt_loss=0.396, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2372/6434 [5:33:43<9:39:49,  8.56s/it, gpt_loss=0.311, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2373/6434 [5:33:43<9:35:35,  8.50s/it, gpt_loss=0.311, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2373/6434 [5:33:52<9:35:35,  8.50s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  37%|███▋      | 2374/6434 [5:33:52<10:05:16,  8.94s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  37%|███▋      | 2374/6434 [5:34:00<10:05:16,  8.94s/it, gpt_loss=0.285, loss_mean=0.315][A
+Train step of epoch 0:  37%|███▋      | 2375/6434 [5:34:00<9:45:13,  8.65s/it, gpt_loss=0.285, loss_mean=0.315] [A
+Train step of epoch 0:  37%|███▋      | 2375/6434 [5:34:09<9:45:13,  8.65s/it, gpt_loss=0.297, loss_mean=0.313][A
+Train step of epoch 0:  37%|███▋      | 2376/6434 [5:34:09<9:34:56,  8.50s/it, gpt_loss=0.297, loss_mean=0.313][A
+Train step of epoch 0:  37%|███▋      | 2376/6434 [5:34:18<9:34:56,  8.50s/it, gpt_loss=0.256, loss_mean=0.308][A
+Train step of epoch 0:  37%|███▋      | 2377/6434 [5:34:18<9:44:01,  8.64s/it, gpt_loss=0.256, loss_mean=0.308][A
+Train step of epoch 0:  37%|███▋      | 2377/6434 [5:34:26<9:44:01,  8.64s/it, gpt_loss=0.268, loss_mean=0.304][A
+Train step of epoch 0:  37%|███▋      | 2378/6434 [5:34:26<9:40:35,  8.59s/it, gpt_loss=0.268, loss_mean=0.304][A
+Train step of epoch 0:  37%|███▋      | 2378/6434 [5:34:35<9:40:35,  8.59s/it, gpt_loss=0.26, loss_mean=0.299] [A
+Train step of epoch 0:  37%|███▋      | 2379/6434 [5:34:35<9:46:58,  8.69s/it, gpt_loss=0.26, loss_mean=0.299][A
+[LID Router Debug] Step: 2380
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [0, 2, 1, 5, 3, 0, 0, 4, 1, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  37%|███▋      | 2379/6434 [5:34:43<9:46:58,  8.69s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  37%|███▋      | 2380/6434 [5:34:43<9:25:48,  8.37s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  37%|███▋      | 2380/6434 [5:34:52<9:25:48,  8.37s/it, gpt_loss=0.338, loss_mean=0.301][A
+Train step of epoch 0:  37%|███▋      | 2381/6434 [5:34:52<9:47:51,  8.70s/it, gpt_loss=0.338, loss_mean=0.301][A
+Train step of epoch 0:  37%|███▋      | 2381/6434 [5:35:01<9:47:51,  8.70s/it, gpt_loss=0.292, loss_mean=0.3]  [A
+Train step of epoch 0:  37%|███▋      | 2382/6434 [5:35:01<9:43:22,  8.64s/it, gpt_loss=0.292, loss_mean=0.3][A
+Train step of epoch 0:  37%|███▋      | 2382/6434 [5:35:09<9:43:22,  8.64s/it, gpt_loss=0.341, loss_mean=0.304][A
+Train step of epoch 0:  37%|███▋      | 2383/6434 [5:35:09<9:41:44,  8.62s/it, gpt_loss=0.341, loss_mean=0.304][A
+Train step of epoch 0:  37%|███▋      | 2383/6434 [5:35:18<9:41:44,  8.62s/it, gpt_loss=0.33, loss_mean=0.307] [A
+Train step of epoch 0:  37%|███▋      | 2384/6434 [5:35:18<9:39:20,  8.58s/it, gpt_loss=0.33, loss_mean=0.307][A
+Train step of epoch 0:  37%|███▋      | 2384/6434 [5:35:26<9:39:20,  8.58s/it, gpt_loss=0.353, loss_mean=0.312][A
+Train step of epoch 0:  37%|███▋      | 2385/6434 [5:35:26<9:42:20,  8.63s/it, gpt_loss=0.353, loss_mean=0.312][A
+Train step of epoch 0:  37%|███▋      | 2385/6434 [5:35:35<9:42:20,  8.63s/it, gpt_loss=0.359, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2386/6434 [5:35:35<9:50:04,  8.75s/it, gpt_loss=0.359, loss_mean=0.316][A
+Train step of epoch 0:  37%|███▋      | 2386/6434 [5:35:44<9:50:04,  8.75s/it, gpt_loss=0.326, loss_mean=0.317][A
+Train step of epoch 0:  37%|███▋      | 2387/6434 [5:35:44<9:51:05,  8.76s/it, gpt_loss=0.326, loss_mean=0.317][A
+Train step of epoch 0:  37%|███▋      | 2387/6434 [5:35:52<9:51:05,  8.76s/it, gpt_loss=0.333, loss_mean=0.319][A
+Train step of epoch 0:  37%|███▋      | 2388/6434 [5:35:52<9:32:45,  8.49s/it, gpt_loss=0.333, loss_mean=0.319][A
+Train step of epoch 0:  37%|███▋      | 2388/6434 [5:36:01<9:32:45,  8.49s/it, gpt_loss=0.338, loss_mean=0.321][A
+Train step of epoch 0:  37%|███▋      | 2389/6434 [5:36:01<9:42:09,  8.64s/it, gpt_loss=0.338, loss_mean=0.321][A
+[LID Router Debug] Step: 2390
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [5, 1, 6, 4, 0, 1, 10, 4, 0, 4]
+Active Experts in Batch: {0, 1, 4, 5, 6, 10}
+
+Train step of epoch 0:  37%|███▋      | 2389/6434 [5:36:09<9:42:09,  8.64s/it, gpt_loss=0.393, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2390/6434 [5:36:09<9:19:41,  8.30s/it, gpt_loss=0.393, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2390/6434 [5:36:18<9:19:41,  8.30s/it, gpt_loss=0.313, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2391/6434 [5:36:18<9:39:00,  8.59s/it, gpt_loss=0.313, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2391/6434 [5:36:25<9:39:00,  8.59s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2392/6434 [5:36:25<9:18:51,  8.30s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2392/6434 [5:36:33<9:18:51,  8.30s/it, gpt_loss=0.331, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2393/6434 [5:36:33<9:09:40,  8.16s/it, gpt_loss=0.331, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2393/6434 [5:36:42<9:09:40,  8.16s/it, gpt_loss=0.424, loss_mean=0.334][A
+Train step of epoch 0:  37%|███▋      | 2394/6434 [5:36:42<9:19:15,  8.31s/it, gpt_loss=0.424, loss_mean=0.334][A
+Train step of epoch 0:  37%|███▋      | 2394/6434 [5:36:50<9:19:15,  8.31s/it, gpt_loss=0.265, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2395/6434 [5:36:50<9:19:27,  8.31s/it, gpt_loss=0.265, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2395/6434 [5:36:58<9:19:27,  8.31s/it, gpt_loss=0.299, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2396/6434 [5:36:58<9:14:20,  8.24s/it, gpt_loss=0.299, loss_mean=0.324][A
+Train step of epoch 0:  37%|███▋      | 2396/6434 [5:37:07<9:14:20,  8.24s/it, gpt_loss=0.34, loss_mean=0.326] [A
+Train step of epoch 0:  37%|███▋      | 2397/6434 [5:37:07<9:23:41,  8.38s/it, gpt_loss=0.34, loss_mean=0.326][A
+Train step of epoch 0:  37%|███▋      | 2397/6434 [5:37:15<9:23:41,  8.38s/it, gpt_loss=0.335, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2398/6434 [5:37:15<9:14:29,  8.24s/it, gpt_loss=0.335, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2398/6434 [5:37:24<9:14:29,  8.24s/it, gpt_loss=0.418, loss_mean=0.336][A
+Train step of epoch 0:  37%|███▋      | 2399/6434 [5:37:24<9:28:54,  8.46s/it, gpt_loss=0.418, loss_mean=0.336][A
+[LID Router Debug] Step: 2400
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [4, 1, 5, 9, 2, 2, 2, 0, 1, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+[2026-02-06 21:33:37,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=0, lr=[1.9349444679119665e-05, 1.9349444679119665e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 21:33:37,560] [INFO] [timer.py:260:stop] epoch=0/micro_step=2400/global_step=1200, RunningAvgSamplesPerSec=4.751518994412861, CurrSamplesPerSec=4.409371016197916, MemAllocated=12.44GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  37%|███▋      | 2399/6434 [5:37:33<9:28:54,  8.46s/it, gpt_loss=0.292, loss_mean=0.331][A
+Train step of epoch 0:  37%|███▋      | 2400/6434 [5:37:33<9:43:50,  8.68s/it, gpt_loss=0.292, loss_mean=0.331][A
+Train step of epoch 0:  37%|███▋      | 2400/6434 [5:37:41<9:43:50,  8.68s/it, gpt_loss=0.295, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2401/6434 [5:37:41<9:28:29,  8.46s/it, gpt_loss=0.295, loss_mean=0.328][A
+Train step of epoch 0:  37%|███▋      | 2401/6434 [5:37:50<9:28:29,  8.46s/it, gpt_loss=0.27, loss_mean=0.322] [A
+Train step of epoch 0:  37%|███▋      | 2402/6434 [5:37:50<9:35:36,  8.57s/it, gpt_loss=0.27, loss_mean=0.322][A
+Train step of epoch 0:  37%|███▋      | 2402/6434 [5:37:58<9:35:36,  8.57s/it, gpt_loss=0.329, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2403/6434 [5:37:58<9:27:06,  8.44s/it, gpt_loss=0.329, loss_mean=0.323][A
+Train step of epoch 0:  37%|███▋      | 2403/6434 [5:38:08<9:27:06,  8.44s/it, gpt_loss=0.393, loss_mean=0.33] [A
+Train step of epoch 0:  37%|███▋      | 2404/6434 [5:38:08<9:53:34,  8.84s/it, gpt_loss=0.393, loss_mean=0.33][A
+Train step of epoch 0:  37%|███▋      | 2404/6434 [5:38:16<9:53:34,  8.84s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2405/6434 [5:38:16<9:47:31,  8.75s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  37%|███▋      | 2405/6434 [5:38:25<9:47:31,  8.75s/it, gpt_loss=0.404, loss_mean=0.335][A
+Train step of epoch 0:  37%|███▋      | 2406/6434 [5:38:25<9:43:21,  8.69s/it, gpt_loss=0.404, loss_mean=0.335][A
+Train step of epoch 0:  37%|███▋      | 2406/6434 [5:38:34<9:43:21,  8.69s/it, gpt_loss=0.328, loss_mean=0.334][A
+Train step of epoch 0:  37%|███▋      | 2407/6434 [5:38:34<10:01:48,  8.97s/it, gpt_loss=0.328, loss_mean=0.334][A
+Train step of epoch 0:  37%|███▋      | 2407/6434 [5:38:43<10:01:48,  8.97s/it, gpt_loss=0.328, loss_mean=0.333][A
+Train step of epoch 0:  37%|███▋      | 2408/6434 [5:38:43<9:44:47,  8.72s/it, gpt_loss=0.328, loss_mean=0.333] [A
+Train step of epoch 0:  37%|███▋      | 2408/6434 [5:38:53<9:44:47,  8.72s/it, gpt_loss=0.335, loss_mean=0.334][A
+Train step of epoch 0:  37%|███▋      | 2409/6434 [5:38:53<10:10:47,  9.10s/it, gpt_loss=0.335, loss_mean=0.334][A
+[LID Router Debug] Step: 2410
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 9, 3, 9, 5, 1, 4, 2, 5, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  37%|███▋      | 2409/6434 [5:39:01<10:10:47,  9.10s/it, gpt_loss=0.363, loss_mean=0.337][A
+Train step of epoch 0:  37%|███▋      | 2410/6434 [5:39:01<9:56:55,  8.90s/it, gpt_loss=0.363, loss_mean=0.337] [A
+Train step of epoch 0:  37%|███▋      | 2410/6434 [5:39:10<9:56:55,  8.90s/it, gpt_loss=0.338, loss_mean=0.337][A
+Train step of epoch 0:  37%|███▋      | 2411/6434 [5:39:10<9:50:30,  8.81s/it, gpt_loss=0.338, loss_mean=0.337][A
+Train step of epoch 0:  37%|███▋      | 2411/6434 [5:39:18<9:50:30,  8.81s/it, gpt_loss=0.265, loss_mean=0.33] [A
+Train step of epoch 0:  37%|███▋      | 2412/6434 [5:39:18<9:44:53,  8.73s/it, gpt_loss=0.265, loss_mean=0.33][A
+Train step of epoch 0:  37%|███▋      | 2412/6434 [5:39:26<9:44:53,  8.73s/it, gpt_loss=0.324, loss_mean=0.329][A
+Train step of epoch 0:  38%|███▊      | 2413/6434 [5:39:26<9:35:39,  8.59s/it, gpt_loss=0.324, loss_mean=0.329][A
+Train step of epoch 0:  38%|███▊      | 2413/6434 [5:39:35<9:35:39,  8.59s/it, gpt_loss=0.297, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2414/6434 [5:39:35<9:39:35,  8.65s/it, gpt_loss=0.297, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2414/6434 [5:39:43<9:39:35,  8.65s/it, gpt_loss=0.321, loss_mean=0.325][A
+Train step of epoch 0:  38%|███▊      | 2415/6434 [5:39:43<9:19:35,  8.35s/it, gpt_loss=0.321, loss_mean=0.325][A
+Train step of epoch 0:  38%|███▊      | 2415/6434 [5:39:51<9:19:35,  8.35s/it, gpt_loss=0.331, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2416/6434 [5:39:51<9:22:34,  8.40s/it, gpt_loss=0.331, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2416/6434 [5:40:00<9:22:34,  8.40s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2417/6434 [5:40:00<9:32:28,  8.55s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2417/6434 [5:40:09<9:32:28,  8.55s/it, gpt_loss=0.375, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2418/6434 [5:40:09<9:32:52,  8.56s/it, gpt_loss=0.375, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2418/6434 [5:40:17<9:32:52,  8.56s/it, gpt_loss=0.337, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2419/6434 [5:40:17<9:21:57,  8.40s/it, gpt_loss=0.337, loss_mean=0.332][A
+[LID Router Debug] Step: 2420
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 2, 5, 2, 2, 2, 9, 9, 1, 1]
+Active Experts in Batch: {1, 2, 4, 5, 9}
+
+Train step of epoch 0:  38%|███▊      | 2419/6434 [5:40:25<9:21:57,  8.40s/it, gpt_loss=0.424, loss_mean=0.341][A
+Train step of epoch 0:  38%|███▊      | 2420/6434 [5:40:25<9:13:25,  8.27s/it, gpt_loss=0.424, loss_mean=0.341][A
+Train step of epoch 0:  38%|███▊      | 2420/6434 [5:40:34<9:13:25,  8.27s/it, gpt_loss=0.32, loss_mean=0.339] [A
+Train step of epoch 0:  38%|███▊      | 2421/6434 [5:40:34<9:30:46,  8.53s/it, gpt_loss=0.32, loss_mean=0.339][A
+Train step of epoch 0:  38%|███▊      | 2421/6434 [5:40:44<9:30:46,  8.53s/it, gpt_loss=0.35, loss_mean=0.34] [A
+Train step of epoch 0:  38%|███▊      | 2422/6434 [5:40:44<9:59:47,  8.97s/it, gpt_loss=0.35, loss_mean=0.34][A
+Train step of epoch 0:  38%|███▊      | 2422/6434 [5:40:53<9:59:47,  8.97s/it, gpt_loss=0.328, loss_mean=0.339][A
+Train step of epoch 0:  38%|███▊      | 2423/6434 [5:40:53<10:06:31,  9.07s/it, gpt_loss=0.328, loss_mean=0.339][A
+Train step of epoch 0:  38%|███▊      | 2423/6434 [5:41:02<10:06:31,  9.07s/it, gpt_loss=0.393, loss_mean=0.344][A
+Train step of epoch 0:  38%|███▊      | 2424/6434 [5:41:02<9:52:14,  8.86s/it, gpt_loss=0.393, loss_mean=0.344] [A
+Train step of epoch 0:  38%|███▊      | 2424/6434 [5:41:11<9:52:14,  8.86s/it, gpt_loss=0.295, loss_mean=0.339][A
+Train step of epoch 0:  38%|███▊      | 2425/6434 [5:41:11<9:52:08,  8.86s/it, gpt_loss=0.295, loss_mean=0.339][A
+Train step of epoch 0:  38%|███▊      | 2425/6434 [5:41:19<9:52:08,  8.86s/it, gpt_loss=0.292, loss_mean=0.335][A
+Train step of epoch 0:  38%|███▊      | 2426/6434 [5:41:19<9:45:35,  8.77s/it, gpt_loss=0.292, loss_mean=0.335][A
+Train step of epoch 0:  38%|███▊      | 2426/6434 [5:41:27<9:45:35,  8.77s/it, gpt_loss=0.35, loss_mean=0.336] [A
+Train step of epoch 0:  38%|███▊      | 2427/6434 [5:41:27<9:34:51,  8.61s/it, gpt_loss=0.35, loss_mean=0.336][A
+Train step of epoch 0:  38%|███▊      | 2427/6434 [5:41:36<9:34:51,  8.61s/it, gpt_loss=0.352, loss_mean=0.338][A
+Train step of epoch 0:  38%|███▊      | 2428/6434 [5:41:36<9:35:41,  8.62s/it, gpt_loss=0.352, loss_mean=0.338][A
+Train step of epoch 0:  38%|███▊      | 2428/6434 [5:41:45<9:35:41,  8.62s/it, gpt_loss=0.272, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2429/6434 [5:41:45<9:36:55,  8.64s/it, gpt_loss=0.272, loss_mean=0.331][A
+[LID Router Debug] Step: 2430
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [0, 0, 3, 1, 0, 3, 6, 2, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+Train step of epoch 0:  38%|███▊      | 2429/6434 [5:41:52<9:36:55,  8.64s/it, gpt_loss=0.303, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2430/6434 [5:41:52<9:18:21,  8.37s/it, gpt_loss=0.303, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2430/6434 [5:42:01<9:18:21,  8.37s/it, gpt_loss=0.3, loss_mean=0.326]  [A
+Train step of epoch 0:  38%|███▊      | 2431/6434 [5:42:01<9:30:22,  8.55s/it, gpt_loss=0.3, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2431/6434 [5:42:09<9:30:22,  8.55s/it, gpt_loss=0.351, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2432/6434 [5:42:09<9:12:56,  8.29s/it, gpt_loss=0.351, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2432/6434 [5:42:17<9:12:56,  8.29s/it, gpt_loss=0.331, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2433/6434 [5:42:17<9:01:03,  8.11s/it, gpt_loss=0.331, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2433/6434 [5:42:26<9:01:03,  8.11s/it, gpt_loss=0.336, loss_mean=0.329][A
+Train step of epoch 0:  38%|███▊      | 2434/6434 [5:42:26<9:29:31,  8.54s/it, gpt_loss=0.336, loss_mean=0.329][A
+Train step of epoch 0:  38%|███▊      | 2434/6434 [5:42:35<9:29:31,  8.54s/it, gpt_loss=0.403, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2435/6434 [5:42:35<9:38:36,  8.68s/it, gpt_loss=0.403, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2435/6434 [5:42:43<9:38:36,  8.68s/it, gpt_loss=0.378, loss_mean=0.341][A
+Train step of epoch 0:  38%|███▊      | 2436/6434 [5:42:43<9:27:14,  8.51s/it, gpt_loss=0.378, loss_mean=0.341][A
+Train step of epoch 0:  38%|███▊      | 2436/6434 [5:42:52<9:27:14,  8.51s/it, gpt_loss=0.301, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2437/6434 [5:42:52<9:31:38,  8.58s/it, gpt_loss=0.301, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2437/6434 [5:43:01<9:31:38,  8.58s/it, gpt_loss=0.368, loss_mean=0.34] [A
+Train step of epoch 0:  38%|███▊      | 2438/6434 [5:43:01<9:39:54,  8.71s/it, gpt_loss=0.368, loss_mean=0.34][A
+Train step of epoch 0:  38%|███▊      | 2438/6434 [5:43:10<9:39:54,  8.71s/it, gpt_loss=0.359, loss_mean=0.342][A
+Train step of epoch 0:  38%|███▊      | 2439/6434 [5:43:10<9:35:33,  8.64s/it, gpt_loss=0.359, loss_mean=0.342][A
+[LID Router Debug] Step: 2440
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [5, 1, 9, 5, 2, 3, 3, 1, 1, 9]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  38%|███▊      | 2439/6434 [5:43:18<9:35:33,  8.64s/it, gpt_loss=0.357, loss_mean=0.343][A
+Train step of epoch 0:  38%|███▊      | 2440/6434 [5:43:18<9:22:16,  8.45s/it, gpt_loss=0.357, loss_mean=0.343][A
+Train step of epoch 0:  38%|███▊      | 2440/6434 [5:43:25<9:22:16,  8.45s/it, gpt_loss=0.305, loss_mean=0.34] [A
+Train step of epoch 0:  38%|███▊      | 2441/6434 [5:43:25<8:55:22,  8.04s/it, gpt_loss=0.305, loss_mean=0.34][A
+Train step of epoch 0:  38%|███▊      | 2441/6434 [5:43:33<8:55:22,  8.04s/it, gpt_loss=0.313, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2442/6434 [5:43:33<9:03:22,  8.17s/it, gpt_loss=0.313, loss_mean=0.337][A
+Train step of epoch 0:  38%|███▊      | 2442/6434 [5:43:41<9:03:22,  8.17s/it, gpt_loss=0.293, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2443/6434 [5:43:41<8:50:33,  7.98s/it, gpt_loss=0.293, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2443/6434 [5:43:49<8:50:33,  7.98s/it, gpt_loss=0.278, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2444/6434 [5:43:49<8:58:35,  8.10s/it, gpt_loss=0.278, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2444/6434 [5:43:58<8:58:35,  8.10s/it, gpt_loss=0.378, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2445/6434 [5:43:58<9:17:50,  8.39s/it, gpt_loss=0.378, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2445/6434 [5:44:06<9:17:50,  8.39s/it, gpt_loss=0.271, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2446/6434 [5:44:06<9:10:41,  8.29s/it, gpt_loss=0.271, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2446/6434 [5:44:14<9:10:41,  8.29s/it, gpt_loss=0.401, loss_mean=0.334][A
+Train step of epoch 0:  38%|███▊      | 2447/6434 [5:44:14<8:58:11,  8.10s/it, gpt_loss=0.401, loss_mean=0.334][A
+Train step of epoch 0:  38%|███▊      | 2447/6434 [5:44:23<8:58:11,  8.10s/it, gpt_loss=0.346, loss_mean=0.335][A
+Train step of epoch 0:  38%|███▊      | 2448/6434 [5:44:23<9:12:27,  8.32s/it, gpt_loss=0.346, loss_mean=0.335][A
+Train step of epoch 0:  38%|███▊      | 2448/6434 [5:44:31<9:12:27,  8.32s/it, gpt_loss=0.312, loss_mean=0.333][A
+Train step of epoch 0:  38%|███▊      | 2449/6434 [5:44:31<9:05:16,  8.21s/it, gpt_loss=0.312, loss_mean=0.333][A
+[LID Router Debug] Step: 2450
+Batch Size: 10
+Audio Batch Size: 137
+LID Assignments: [6, 9, 0, 3, 2, 9, 1, 3, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  38%|███▊      | 2449/6434 [5:44:40<9:05:16,  8.21s/it, gpt_loss=0.283, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2450/6434 [5:44:40<9:18:08,  8.41s/it, gpt_loss=0.283, loss_mean=0.328][A
+Train step of epoch 0:  38%|███▊      | 2450/6434 [5:44:48<9:18:08,  8.41s/it, gpt_loss=0.367, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2451/6434 [5:44:48<9:23:33,  8.49s/it, gpt_loss=0.367, loss_mean=0.332][A
+Train step of epoch 0:  38%|███▊      | 2451/6434 [5:44:57<9:23:33,  8.49s/it, gpt_loss=0.314, loss_mean=0.33] [A
+Train step of epoch 0:  38%|███▊      | 2452/6434 [5:44:57<9:29:59,  8.59s/it, gpt_loss=0.314, loss_mean=0.33][A
+Train step of epoch 0:  38%|███▊      | 2452/6434 [5:45:05<9:29:59,  8.59s/it, gpt_loss=0.34, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2453/6434 [5:45:05<9:20:27,  8.45s/it, gpt_loss=0.34, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2453/6434 [5:45:14<9:20:27,  8.45s/it, gpt_loss=0.33, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2454/6434 [5:45:14<9:23:23,  8.49s/it, gpt_loss=0.33, loss_mean=0.331][A
+Train step of epoch 0:  38%|███▊      | 2454/6434 [5:45:22<9:23:23,  8.49s/it, gpt_loss=0.28, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2455/6434 [5:45:22<9:25:57,  8.53s/it, gpt_loss=0.28, loss_mean=0.326][A
+Train step of epoch 0:  38%|███▊      | 2455/6434 [5:45:31<9:25:57,  8.53s/it, gpt_loss=0.292, loss_mean=0.322][A
+Train step of epoch 0:  38%|███▊      | 2456/6434 [5:45:31<9:21:59,  8.48s/it, gpt_loss=0.292, loss_mean=0.322][A
+Train step of epoch 0:  38%|███▊      | 2456/6434 [5:45:40<9:21:59,  8.48s/it, gpt_loss=0.249, loss_mean=0.315][A
+Train step of epoch 0:  38%|███▊      | 2457/6434 [5:45:40<9:33:32,  8.65s/it, gpt_loss=0.249, loss_mean=0.315][A
+Train step of epoch 0:  38%|███▊      | 2457/6434 [5:45:48<9:33:32,  8.65s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  38%|███▊      | 2458/6434 [5:45:48<9:28:14,  8.58s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  38%|███▊      | 2458/6434 [5:45:56<9:28:14,  8.58s/it, gpt_loss=0.271, loss_mean=0.305][A
+Train step of epoch 0:  38%|███▊      | 2459/6434 [5:45:56<9:21:16,  8.47s/it, gpt_loss=0.271, loss_mean=0.305][A
+[LID Router Debug] Step: 2460
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [1, 2, 1, 9, 3, 3, 0, 1, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  38%|███▊      | 2459/6434 [5:46:05<9:21:16,  8.47s/it, gpt_loss=0.308, loss_mean=0.305][A
+Train step of epoch 0:  38%|███▊      | 2460/6434 [5:46:05<9:15:03,  8.38s/it, gpt_loss=0.308, loss_mean=0.305][A
+Train step of epoch 0:  38%|███▊      | 2460/6434 [5:46:14<9:15:03,  8.38s/it, gpt_loss=0.367, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2461/6434 [5:46:14<9:31:15,  8.63s/it, gpt_loss=0.367, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2461/6434 [5:46:23<9:31:15,  8.63s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  38%|███▊      | 2462/6434 [5:46:23<9:46:25,  8.86s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  38%|███▊      | 2462/6434 [5:46:32<9:46:25,  8.86s/it, gpt_loss=0.375, loss_mean=0.32] [A
+Train step of epoch 0:  38%|███▊      | 2463/6434 [5:46:32<9:38:23,  8.74s/it, gpt_loss=0.375, loss_mean=0.32][A
+Train step of epoch 0:  38%|███▊      | 2463/6434 [5:46:40<9:38:23,  8.74s/it, gpt_loss=0.287, loss_mean=0.317][A
+Train step of epoch 0:  38%|███▊      | 2464/6434 [5:46:40<9:33:30,  8.67s/it, gpt_loss=0.287, loss_mean=0.317][A
+Train step of epoch 0:  38%|███▊      | 2464/6434 [5:46:49<9:33:30,  8.67s/it, gpt_loss=0.412, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2465/6434 [5:46:49<9:32:43,  8.66s/it, gpt_loss=0.412, loss_mean=0.327][A
+Train step of epoch 0:  38%|███▊      | 2465/6434 [5:46:58<9:32:43,  8.66s/it, gpt_loss=0.266, loss_mean=0.32] [A
+Train step of epoch 0:  38%|███▊      | 2466/6434 [5:46:58<9:34:37,  8.69s/it, gpt_loss=0.266, loss_mean=0.32][A
+Train step of epoch 0:  38%|███▊      | 2466/6434 [5:47:07<9:34:37,  8.69s/it, gpt_loss=0.325, loss_mean=0.321][A
+Train step of epoch 0:  38%|███▊      | 2467/6434 [5:47:07<9:41:52,  8.80s/it, gpt_loss=0.325, loss_mean=0.321][A
+Train step of epoch 0:  38%|███▊      | 2467/6434 [5:47:15<9:41:52,  8.80s/it, gpt_loss=0.234, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2468/6434 [5:47:15<9:41:20,  8.79s/it, gpt_loss=0.234, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2468/6434 [5:47:24<9:41:20,  8.79s/it, gpt_loss=0.334, loss_mean=0.314][A
+Train step of epoch 0:  38%|███▊      | 2469/6434 [5:47:24<9:42:41,  8.82s/it, gpt_loss=0.334, loss_mean=0.314][A
+[LID Router Debug] Step: 2470
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [0, 4, 3, 0, 5, 9, 5, 4, 5, 5]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+Train step of epoch 0:  38%|███▊      | 2469/6434 [5:47:33<9:42:41,  8.82s/it, gpt_loss=0.293, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2470/6434 [5:47:33<9:31:19,  8.65s/it, gpt_loss=0.293, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2470/6434 [5:47:40<9:31:19,  8.65s/it, gpt_loss=0.336, loss_mean=0.315][A
+Train step of epoch 0:  38%|███▊      | 2471/6434 [5:47:40<9:15:27,  8.41s/it, gpt_loss=0.336, loss_mean=0.315][A
+Train step of epoch 0:  38%|███▊      | 2471/6434 [5:47:49<9:15:27,  8.41s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2472/6434 [5:47:49<9:14:27,  8.40s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2472/6434 [5:47:57<9:14:27,  8.40s/it, gpt_loss=0.278, loss_mean=0.308][A
+Train step of epoch 0:  38%|███▊      | 2473/6434 [5:47:57<9:06:45,  8.28s/it, gpt_loss=0.278, loss_mean=0.308][A
+Train step of epoch 0:  38%|███▊      | 2473/6434 [5:48:06<9:06:45,  8.28s/it, gpt_loss=0.338, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2474/6434 [5:48:06<9:31:56,  8.67s/it, gpt_loss=0.338, loss_mean=0.311][A
+Train step of epoch 0:  38%|███▊      | 2474/6434 [5:48:15<9:31:56,  8.67s/it, gpt_loss=0.324, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2475/6434 [5:48:15<9:34:51,  8.71s/it, gpt_loss=0.324, loss_mean=0.312][A
+Train step of epoch 0:  38%|███▊      | 2475/6434 [5:48:23<9:34:51,  8.71s/it, gpt_loss=0.364, loss_mean=0.317][A
+Train step of epoch 0:  38%|███▊      | 2476/6434 [5:48:23<9:20:38,  8.50s/it, gpt_loss=0.364, loss_mean=0.317][A
+Train step of epoch 0:  38%|███▊      | 2476/6434 [5:48:32<9:20:38,  8.50s/it, gpt_loss=0.33, loss_mean=0.319] [A
+Train step of epoch 0:  38%|███▊      | 2477/6434 [5:48:32<9:27:07,  8.60s/it, gpt_loss=0.33, loss_mean=0.319][A
+Train step of epoch 0:  38%|███▊      | 2477/6434 [5:48:40<9:27:07,  8.60s/it, gpt_loss=0.289, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▊      | 2478/6434 [5:48:40<9:08:42,  8.32s/it, gpt_loss=0.289, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▊      | 2478/6434 [5:48:47<9:08:42,  8.32s/it, gpt_loss=0.309, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▊      | 2479/6434 [5:48:47<8:57:56,  8.16s/it, gpt_loss=0.309, loss_mean=0.315][A
+[LID Router Debug] Step: 2480
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [4, 1, 3, 5, 5, 4, 2, 6, 3, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  39%|███▊      | 2479/6434 [5:48:55<8:57:56,  8.16s/it, gpt_loss=0.286, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2480/6434 [5:48:55<8:47:13,  8.00s/it, gpt_loss=0.286, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2480/6434 [5:49:03<8:47:13,  8.00s/it, gpt_loss=0.312, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2481/6434 [5:49:03<8:48:33,  8.02s/it, gpt_loss=0.312, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2481/6434 [5:49:11<8:48:33,  8.02s/it, gpt_loss=0.3, loss_mean=0.311]  [A
+Train step of epoch 0:  39%|███▊      | 2482/6434 [5:49:11<8:49:58,  8.05s/it, gpt_loss=0.3, loss_mean=0.311][A
+Train step of epoch 0:  39%|███▊      | 2482/6434 [5:49:19<8:49:58,  8.05s/it, gpt_loss=0.352, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▊      | 2483/6434 [5:49:19<8:47:07,  8.01s/it, gpt_loss=0.352, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▊      | 2483/6434 [5:49:27<8:47:07,  8.01s/it, gpt_loss=0.416, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▊      | 2484/6434 [5:49:27<8:38:33,  7.88s/it, gpt_loss=0.416, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▊      | 2484/6434 [5:49:34<8:38:33,  7.88s/it, gpt_loss=0.355, loss_mean=0.328][A
+Train step of epoch 0:  39%|███▊      | 2485/6434 [5:49:34<8:33:08,  7.80s/it, gpt_loss=0.355, loss_mean=0.328][A
+Train step of epoch 0:  39%|███▊      | 2485/6434 [5:49:42<8:33:08,  7.80s/it, gpt_loss=0.246, loss_mean=0.32] [A
+Train step of epoch 0:  39%|███▊      | 2486/6434 [5:49:42<8:30:28,  7.76s/it, gpt_loss=0.246, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▊      | 2486/6434 [5:49:50<8:30:28,  7.76s/it, gpt_loss=0.325, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▊      | 2487/6434 [5:49:50<8:36:41,  7.85s/it, gpt_loss=0.325, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▊      | 2487/6434 [5:49:59<8:36:41,  7.85s/it, gpt_loss=0.397, loss_mean=0.328][A
+Train step of epoch 0:  39%|███▊      | 2488/6434 [5:49:59<8:52:58,  8.10s/it, gpt_loss=0.397, loss_mean=0.328][A
+Train step of epoch 0:  39%|███▊      | 2488/6434 [5:50:08<8:52:58,  8.10s/it, gpt_loss=0.298, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▊      | 2489/6434 [5:50:08<9:05:16,  8.29s/it, gpt_loss=0.298, loss_mean=0.325][A
+[LID Router Debug] Step: 2490
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [5, 4, 6, 6, 1, 1, 3, 0, 0, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6}
+
+Train step of epoch 0:  39%|███▊      | 2489/6434 [5:50:17<9:05:16,  8.29s/it, gpt_loss=0.24, loss_mean=0.317] [A
+Train step of epoch 0:  39%|███▊      | 2490/6434 [5:50:17<9:23:44,  8.58s/it, gpt_loss=0.24, loss_mean=0.317][A
+Train step of epoch 0:  39%|███▊      | 2490/6434 [5:50:25<9:23:44,  8.58s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2491/6434 [5:50:25<9:20:29,  8.53s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  39%|███▊      | 2491/6434 [5:50:34<9:20:29,  8.53s/it, gpt_loss=0.352, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▊      | 2492/6434 [5:50:34<9:36:12,  8.77s/it, gpt_loss=0.352, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▊      | 2492/6434 [5:50:43<9:36:12,  8.77s/it, gpt_loss=0.37, loss_mean=0.322] [A
+Train step of epoch 0:  39%|███▊      | 2493/6434 [5:50:43<9:26:28,  8.62s/it, gpt_loss=0.37, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▊      | 2493/6434 [5:50:51<9:26:28,  8.62s/it, gpt_loss=0.351, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2494/6434 [5:50:51<9:14:36,  8.45s/it, gpt_loss=0.351, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2494/6434 [5:51:00<9:14:36,  8.45s/it, gpt_loss=0.332, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2495/6434 [5:51:00<9:27:25,  8.64s/it, gpt_loss=0.332, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2495/6434 [5:51:07<9:27:25,  8.64s/it, gpt_loss=0.368, loss_mean=0.33] [A
+Train step of epoch 0:  39%|███▉      | 2496/6434 [5:51:07<9:03:08,  8.28s/it, gpt_loss=0.368, loss_mean=0.33][A
+Train step of epoch 0:  39%|███▉      | 2496/6434 [5:51:16<9:03:08,  8.28s/it, gpt_loss=0.245, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▉      | 2497/6434 [5:51:16<9:18:05,  8.51s/it, gpt_loss=0.245, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▉      | 2497/6434 [5:51:25<9:18:05,  8.51s/it, gpt_loss=0.33, loss_mean=0.322] [A
+Train step of epoch 0:  39%|███▉      | 2498/6434 [5:51:25<9:27:07,  8.65s/it, gpt_loss=0.33, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2498/6434 [5:51:34<9:27:07,  8.65s/it, gpt_loss=0.349, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2499/6434 [5:51:34<9:25:33,  8.62s/it, gpt_loss=0.349, loss_mean=0.325][A
+[LID Router Debug] Step: 2500
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [3, 4, 2, 2, 1, 6, 8, 5, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  39%|███▉      | 2499/6434 [5:51:43<9:25:33,  8.62s/it, gpt_loss=0.278, loss_mean=0.32] [A
+Train step of epoch 0:  39%|███▉      | 2500/6434 [5:51:43<9:26:10,  8.64s/it, gpt_loss=0.278, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▉      | 2500/6434 [5:51:51<9:26:10,  8.64s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▉      | 2501/6434 [5:51:51<9:30:10,  8.70s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▉      | 2501/6434 [5:51:59<9:30:10,  8.70s/it, gpt_loss=0.367, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▉      | 2502/6434 [5:51:59<9:17:31,  8.51s/it, gpt_loss=0.367, loss_mean=0.321][A
+Train step of epoch 0:  39%|███▉      | 2502/6434 [5:52:08<9:17:31,  8.51s/it, gpt_loss=0.256, loss_mean=0.314][A
+Train step of epoch 0:  39%|███▉      | 2503/6434 [5:52:08<9:26:07,  8.64s/it, gpt_loss=0.256, loss_mean=0.314][A
+Train step of epoch 0:  39%|███▉      | 2503/6434 [5:52:18<9:26:07,  8.64s/it, gpt_loss=0.373, loss_mean=0.32] [A
+Train step of epoch 0:  39%|███▉      | 2504/6434 [5:52:18<9:36:42,  8.80s/it, gpt_loss=0.373, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▉      | 2504/6434 [5:52:26<9:36:42,  8.80s/it, gpt_loss=0.264, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▉      | 2505/6434 [5:52:26<9:24:25,  8.62s/it, gpt_loss=0.264, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▉      | 2505/6434 [5:52:34<9:24:25,  8.62s/it, gpt_loss=0.476, loss_mean=0.331][A
+Train step of epoch 0:  39%|███▉      | 2506/6434 [5:52:34<9:10:04,  8.40s/it, gpt_loss=0.476, loss_mean=0.331][A
+Train step of epoch 0:  39%|███▉      | 2506/6434 [5:52:42<9:10:04,  8.40s/it, gpt_loss=0.248, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2507/6434 [5:52:42<9:16:27,  8.50s/it, gpt_loss=0.248, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2507/6434 [5:52:50<9:16:27,  8.50s/it, gpt_loss=0.294, loss_mean=0.32] [A
+Train step of epoch 0:  39%|███▉      | 2508/6434 [5:52:50<9:03:39,  8.31s/it, gpt_loss=0.294, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▉      | 2508/6434 [5:52:59<9:03:39,  8.31s/it, gpt_loss=0.381, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2509/6434 [5:52:59<9:20:25,  8.57s/it, gpt_loss=0.381, loss_mean=0.326][A
+[LID Router Debug] Step: 2510
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [0, 5, 2, 5, 1, 2, 3, 0, 9, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  39%|███▉      | 2509/6434 [5:53:08<9:20:25,  8.57s/it, gpt_loss=0.363, loss_mean=0.33] [A
+Train step of epoch 0:  39%|███▉      | 2510/6434 [5:53:08<9:17:13,  8.52s/it, gpt_loss=0.363, loss_mean=0.33][A
+Train step of epoch 0:  39%|███▉      | 2510/6434 [5:53:17<9:17:13,  8.52s/it, gpt_loss=0.271, loss_mean=0.324][A
+Train step of epoch 0:  39%|███▉      | 2511/6434 [5:53:17<9:28:04,  8.69s/it, gpt_loss=0.271, loss_mean=0.324][A
+Train step of epoch 0:  39%|███▉      | 2511/6434 [5:53:25<9:28:04,  8.69s/it, gpt_loss=0.337, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2512/6434 [5:53:25<9:13:05,  8.46s/it, gpt_loss=0.337, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2512/6434 [5:53:33<9:13:05,  8.46s/it, gpt_loss=0.342, loss_mean=0.327][A
+Train step of epoch 0:  39%|███▉      | 2513/6434 [5:53:33<9:00:52,  8.28s/it, gpt_loss=0.342, loss_mean=0.327][A
+Train step of epoch 0:  39%|███▉      | 2513/6434 [5:53:41<9:00:52,  8.28s/it, gpt_loss=0.309, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2514/6434 [5:53:41<8:54:05,  8.17s/it, gpt_loss=0.309, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2514/6434 [5:53:49<8:54:05,  8.17s/it, gpt_loss=0.299, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2515/6434 [5:53:49<8:55:26,  8.20s/it, gpt_loss=0.299, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2515/6434 [5:53:56<8:55:26,  8.20s/it, gpt_loss=0.289, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2516/6434 [5:53:56<8:29:07,  7.80s/it, gpt_loss=0.289, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2516/6434 [5:54:04<8:29:07,  7.80s/it, gpt_loss=0.319, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2517/6434 [5:54:04<8:30:22,  7.82s/it, gpt_loss=0.319, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2517/6434 [5:54:13<8:30:22,  7.82s/it, gpt_loss=0.392, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2518/6434 [5:54:13<8:55:23,  8.20s/it, gpt_loss=0.392, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2518/6434 [5:54:23<8:55:23,  8.20s/it, gpt_loss=0.27, loss_mean=0.321] [A
+Train step of epoch 0:  39%|███▉      | 2519/6434 [5:54:23<9:36:55,  8.84s/it, gpt_loss=0.27, loss_mean=0.321][A
+[LID Router Debug] Step: 2520
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [0, 9, 0, 9, 0, 3, 4, 2, 5, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  39%|███▉      | 2519/6434 [5:54:32<9:36:55,  8.84s/it, gpt_loss=0.373, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2520/6434 [5:54:32<9:37:20,  8.85s/it, gpt_loss=0.373, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2520/6434 [5:54:41<9:37:20,  8.85s/it, gpt_loss=0.363, loss_mean=0.33] [A
+Train step of epoch 0:  39%|███▉      | 2521/6434 [5:54:41<9:40:55,  8.91s/it, gpt_loss=0.363, loss_mean=0.33][A
+Train step of epoch 0:  39%|███▉      | 2521/6434 [5:54:50<9:40:55,  8.91s/it, gpt_loss=0.236, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▉      | 2522/6434 [5:54:50<9:35:53,  8.83s/it, gpt_loss=0.236, loss_mean=0.32][A
+Train step of epoch 0:  39%|███▉      | 2522/6434 [5:54:58<9:35:53,  8.83s/it, gpt_loss=0.413, loss_mean=0.329][A
+Train step of epoch 0:  39%|███▉      | 2523/6434 [5:54:58<9:19:43,  8.59s/it, gpt_loss=0.413, loss_mean=0.329][A
+Train step of epoch 0:  39%|███▉      | 2523/6434 [5:55:07<9:19:43,  8.59s/it, gpt_loss=0.473, loss_mean=0.344][A
+Train step of epoch 0:  39%|███▉      | 2524/6434 [5:55:07<9:25:07,  8.67s/it, gpt_loss=0.473, loss_mean=0.344][A
+Train step of epoch 0:  39%|███▉      | 2524/6434 [5:55:15<9:25:07,  8.67s/it, gpt_loss=0.26, loss_mean=0.335] [A
+Train step of epoch 0:  39%|███▉      | 2525/6434 [5:55:15<9:16:16,  8.54s/it, gpt_loss=0.26, loss_mean=0.335][A
+Train step of epoch 0:  39%|███▉      | 2525/6434 [5:55:25<9:16:16,  8.54s/it, gpt_loss=0.224, loss_mean=0.324][A
+Train step of epoch 0:  39%|███▉      | 2526/6434 [5:55:25<9:42:51,  8.95s/it, gpt_loss=0.224, loss_mean=0.324][A
+Train step of epoch 0:  39%|███▉      | 2526/6434 [5:55:33<9:42:51,  8.95s/it, gpt_loss=0.234, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▉      | 2527/6434 [5:55:33<9:37:09,  8.86s/it, gpt_loss=0.234, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▉      | 2527/6434 [5:55:41<9:37:09,  8.86s/it, gpt_loss=0.333, loss_mean=0.317][A
+Train step of epoch 0:  39%|███▉      | 2528/6434 [5:55:41<9:15:54,  8.54s/it, gpt_loss=0.333, loss_mean=0.317][A
+Train step of epoch 0:  39%|███▉      | 2528/6434 [5:55:50<9:15:54,  8.54s/it, gpt_loss=0.301, loss_mean=0.315][A
+Train step of epoch 0:  39%|███▉      | 2529/6434 [5:55:50<9:19:38,  8.60s/it, gpt_loss=0.301, loss_mean=0.315][A
+[LID Router Debug] Step: 2530
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 3, 4, 9, 5, 3, 1, 6, 5, 7]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 7, 9}
+
+Train step of epoch 0:  39%|███▉      | 2529/6434 [5:55:58<9:19:38,  8.60s/it, gpt_loss=0.407, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2530/6434 [5:55:58<9:15:16,  8.53s/it, gpt_loss=0.407, loss_mean=0.325][A
+Train step of epoch 0:  39%|███▉      | 2530/6434 [5:56:07<9:15:16,  8.53s/it, gpt_loss=0.343, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2531/6434 [5:56:07<9:10:57,  8.47s/it, gpt_loss=0.343, loss_mean=0.326][A
+Train step of epoch 0:  39%|███▉      | 2531/6434 [5:56:16<9:10:57,  8.47s/it, gpt_loss=0.284, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2532/6434 [5:56:16<9:27:07,  8.72s/it, gpt_loss=0.284, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2532/6434 [5:56:24<9:27:07,  8.72s/it, gpt_loss=0.278, loss_mean=0.318][A
+Train step of epoch 0:  39%|███▉      | 2533/6434 [5:56:24<9:25:16,  8.69s/it, gpt_loss=0.278, loss_mean=0.318][A
+Train step of epoch 0:  39%|███▉      | 2533/6434 [5:56:33<9:25:16,  8.69s/it, gpt_loss=0.329, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2534/6434 [5:56:33<9:19:53,  8.61s/it, gpt_loss=0.329, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2534/6434 [5:56:41<9:19:53,  8.61s/it, gpt_loss=0.359, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2535/6434 [5:56:41<9:05:34,  8.40s/it, gpt_loss=0.359, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2535/6434 [5:56:49<9:05:34,  8.40s/it, gpt_loss=0.315, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2536/6434 [5:56:49<9:08:59,  8.45s/it, gpt_loss=0.315, loss_mean=0.322][A
+Train step of epoch 0:  39%|███▉      | 2536/6434 [5:56:58<9:08:59,  8.45s/it, gpt_loss=0.328, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2537/6434 [5:56:58<9:14:43,  8.54s/it, gpt_loss=0.328, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2537/6434 [5:57:06<9:14:43,  8.54s/it, gpt_loss=0.25, loss_mean=0.316] [A
+Train step of epoch 0:  39%|███▉      | 2538/6434 [5:57:06<9:00:34,  8.33s/it, gpt_loss=0.25, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▉      | 2538/6434 [5:57:14<9:00:34,  8.33s/it, gpt_loss=0.317, loss_mean=0.316][A
+Train step of epoch 0:  39%|███▉      | 2539/6434 [5:57:14<8:48:42,  8.14s/it, gpt_loss=0.317, loss_mean=0.316][A
+[LID Router Debug] Step: 2540
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 4, 6, 9, 1, 2, 8, 9, 5, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  39%|███▉      | 2539/6434 [5:57:22<8:48:42,  8.14s/it, gpt_loss=0.346, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2540/6434 [5:57:22<8:49:54,  8.17s/it, gpt_loss=0.346, loss_mean=0.319][A
+Train step of epoch 0:  39%|███▉      | 2540/6434 [5:57:30<8:49:54,  8.17s/it, gpt_loss=0.361, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2541/6434 [5:57:30<8:42:08,  8.05s/it, gpt_loss=0.361, loss_mean=0.323][A
+Train step of epoch 0:  39%|███▉      | 2541/6434 [5:57:37<8:42:08,  8.05s/it, gpt_loss=0.313, loss_mean=0.322][A
+Train step of epoch 0:  40%|███▉      | 2542/6434 [5:57:37<8:28:15,  7.84s/it, gpt_loss=0.313, loss_mean=0.322][A
+Train step of epoch 0:  40%|███▉      | 2542/6434 [5:57:45<8:28:15,  7.84s/it, gpt_loss=0.363, loss_mean=0.326][A
+Train step of epoch 0:  40%|███▉      | 2543/6434 [5:57:45<8:33:43,  7.92s/it, gpt_loss=0.363, loss_mean=0.326][A
+Train step of epoch 0:  40%|███▉      | 2543/6434 [5:57:55<8:33:43,  7.92s/it, gpt_loss=0.278, loss_mean=0.321][A
+Train step of epoch 0:  40%|███▉      | 2544/6434 [5:57:55<9:04:11,  8.39s/it, gpt_loss=0.278, loss_mean=0.321][A
+Train step of epoch 0:  40%|███▉      | 2544/6434 [5:58:02<9:04:11,  8.39s/it, gpt_loss=0.346, loss_mean=0.324][A
+Train step of epoch 0:  40%|███▉      | 2545/6434 [5:58:02<8:46:38,  8.13s/it, gpt_loss=0.346, loss_mean=0.324][A
+Train step of epoch 0:  40%|███▉      | 2545/6434 [5:58:11<8:46:38,  8.13s/it, gpt_loss=0.316, loss_mean=0.323][A
+Train step of epoch 0:  40%|███▉      | 2546/6434 [5:58:11<8:56:09,  8.27s/it, gpt_loss=0.316, loss_mean=0.323][A
+Train step of epoch 0:  40%|███▉      | 2546/6434 [5:58:19<8:56:09,  8.27s/it, gpt_loss=0.368, loss_mean=0.327][A
+Train step of epoch 0:  40%|███▉      | 2547/6434 [5:58:19<8:56:50,  8.29s/it, gpt_loss=0.368, loss_mean=0.327][A
+Train step of epoch 0:  40%|███▉      | 2547/6434 [5:58:29<8:56:50,  8.29s/it, gpt_loss=0.304, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2548/6434 [5:58:29<9:19:24,  8.64s/it, gpt_loss=0.304, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2548/6434 [5:58:38<9:19:24,  8.64s/it, gpt_loss=0.388, loss_mean=0.331][A
+Train step of epoch 0:  40%|███▉      | 2549/6434 [5:58:38<9:32:20,  8.84s/it, gpt_loss=0.388, loss_mean=0.331][A
+[LID Router Debug] Step: 2550
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [9, 9, 4, 9, 3, 9, 4, 0, 4, 4]
+Active Experts in Batch: {0, 9, 3, 4}
+
+Train step of epoch 0:  40%|███▉      | 2549/6434 [5:58:47<9:32:20,  8.84s/it, gpt_loss=0.288, loss_mean=0.327][A
+Train step of epoch 0:  40%|███▉      | 2550/6434 [5:58:47<9:33:14,  8.86s/it, gpt_loss=0.288, loss_mean=0.327][A
+Train step of epoch 0:  40%|███▉      | 2550/6434 [5:58:55<9:33:14,  8.86s/it, gpt_loss=0.267, loss_mean=0.321][A
+Train step of epoch 0:  40%|███▉      | 2551/6434 [5:58:55<9:21:06,  8.67s/it, gpt_loss=0.267, loss_mean=0.321][A
+Train step of epoch 0:  40%|███▉      | 2551/6434 [5:59:03<9:21:06,  8.67s/it, gpt_loss=0.263, loss_mean=0.315][A
+Train step of epoch 0:  40%|███▉      | 2552/6434 [5:59:03<9:12:16,  8.54s/it, gpt_loss=0.263, loss_mean=0.315][A
+Train step of epoch 0:  40%|███▉      | 2552/6434 [5:59:13<9:12:16,  8.54s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  40%|███▉      | 2553/6434 [5:59:13<9:29:35,  8.81s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  40%|███▉      | 2553/6434 [5:59:20<9:29:35,  8.81s/it, gpt_loss=0.301, loss_mean=0.313][A
+Train step of epoch 0:  40%|███▉      | 2554/6434 [5:59:20<9:03:24,  8.40s/it, gpt_loss=0.301, loss_mean=0.313][A
+Train step of epoch 0:  40%|███▉      | 2554/6434 [5:59:28<9:03:24,  8.40s/it, gpt_loss=0.225, loss_mean=0.304][A
+Train step of epoch 0:  40%|███▉      | 2555/6434 [5:59:28<8:46:51,  8.15s/it, gpt_loss=0.225, loss_mean=0.304][A
+Train step of epoch 0:  40%|███▉      | 2555/6434 [5:59:38<8:46:51,  8.15s/it, gpt_loss=0.412, loss_mean=0.315][A
+Train step of epoch 0:  40%|███▉      | 2556/6434 [5:59:38<9:20:17,  8.67s/it, gpt_loss=0.412, loss_mean=0.315][A
+Train step of epoch 0:  40%|███▉      | 2556/6434 [5:59:45<9:20:17,  8.67s/it, gpt_loss=0.33, loss_mean=0.317] [A
+Train step of epoch 0:  40%|███▉      | 2557/6434 [5:59:45<9:01:46,  8.38s/it, gpt_loss=0.33, loss_mean=0.317][A
+Train step of epoch 0:  40%|███▉      | 2557/6434 [5:59:53<9:01:46,  8.38s/it, gpt_loss=0.389, loss_mean=0.324][A
+Train step of epoch 0:  40%|███▉      | 2558/6434 [5:59:53<8:46:29,  8.15s/it, gpt_loss=0.389, loss_mean=0.324][A
+Train step of epoch 0:  40%|███▉      | 2558/6434 [6:00:01<8:46:29,  8.15s/it, gpt_loss=0.347, loss_mean=0.326][A
+Train step of epoch 0:  40%|███▉      | 2559/6434 [6:00:01<8:46:12,  8.15s/it, gpt_loss=0.347, loss_mean=0.326][A
+[LID Router Debug] Step: 2560
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [9, 4, 4, 1, 3, 9, 9, 9, 4, 5]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  40%|███▉      | 2559/6434 [6:00:09<8:46:12,  8.15s/it, gpt_loss=0.329, loss_mean=0.326][A
+Train step of epoch 0:  40%|███▉      | 2560/6434 [6:00:09<8:43:05,  8.10s/it, gpt_loss=0.329, loss_mean=0.326][A
+Train step of epoch 0:  40%|███▉      | 2560/6434 [6:00:17<8:43:05,  8.10s/it, gpt_loss=0.33, loss_mean=0.327] [A
+Train step of epoch 0:  40%|███▉      | 2561/6434 [6:00:17<8:40:13,  8.06s/it, gpt_loss=0.33, loss_mean=0.327][A
+Train step of epoch 0:  40%|███▉      | 2561/6434 [6:00:25<8:40:13,  8.06s/it, gpt_loss=0.262, loss_mean=0.32][A
+Train step of epoch 0:  40%|███▉      | 2562/6434 [6:00:25<8:42:19,  8.09s/it, gpt_loss=0.262, loss_mean=0.32][A
+Train step of epoch 0:  40%|███▉      | 2562/6434 [6:00:34<8:42:19,  8.09s/it, gpt_loss=0.365, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2563/6434 [6:00:34<9:00:27,  8.38s/it, gpt_loss=0.365, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2563/6434 [6:00:44<9:00:27,  8.38s/it, gpt_loss=0.302, loss_mean=0.323][A
+Train step of epoch 0:  40%|███▉      | 2564/6434 [6:00:44<9:22:33,  8.72s/it, gpt_loss=0.302, loss_mean=0.323][A
+Train step of epoch 0:  40%|███▉      | 2564/6434 [6:00:52<9:22:33,  8.72s/it, gpt_loss=0.386, loss_mean=0.329][A
+Train step of epoch 0:  40%|███▉      | 2565/6434 [6:00:52<9:15:08,  8.61s/it, gpt_loss=0.386, loss_mean=0.329][A
+Train step of epoch 0:  40%|███▉      | 2565/6434 [6:01:01<9:15:08,  8.61s/it, gpt_loss=0.386, loss_mean=0.335][A
+Train step of epoch 0:  40%|███▉      | 2566/6434 [6:01:01<9:19:10,  8.67s/it, gpt_loss=0.386, loss_mean=0.335][A
+Train step of epoch 0:  40%|███▉      | 2566/6434 [6:01:09<9:19:10,  8.67s/it, gpt_loss=0.386, loss_mean=0.34] [A
+Train step of epoch 0:  40%|███▉      | 2567/6434 [6:01:09<9:08:22,  8.51s/it, gpt_loss=0.386, loss_mean=0.34][A
+Train step of epoch 0:  40%|███▉      | 2567/6434 [6:01:18<9:08:22,  8.51s/it, gpt_loss=0.291, loss_mean=0.335][A
+Train step of epoch 0:  40%|███▉      | 2568/6434 [6:01:18<9:24:04,  8.75s/it, gpt_loss=0.291, loss_mean=0.335][A
+Train step of epoch 0:  40%|███▉      | 2568/6434 [6:01:27<9:24:04,  8.75s/it, gpt_loss=0.274, loss_mean=0.329][A
+Train step of epoch 0:  40%|███▉      | 2569/6434 [6:01:27<9:22:45,  8.74s/it, gpt_loss=0.274, loss_mean=0.329][A
+[LID Router Debug] Step: 2570
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [1, 9, 2, 0, 2, 0, 0, 0, 9, 0]
+Active Experts in Batch: {0, 1, 2, 9}
+
+Train step of epoch 0:  40%|███▉      | 2569/6434 [6:01:35<9:22:45,  8.74s/it, gpt_loss=0.332, loss_mean=0.329][A
+Train step of epoch 0:  40%|███▉      | 2570/6434 [6:01:35<9:08:27,  8.52s/it, gpt_loss=0.332, loss_mean=0.329][A
+Train step of epoch 0:  40%|███▉      | 2570/6434 [6:01:44<9:08:27,  8.52s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  40%|███▉      | 2571/6434 [6:01:44<9:13:19,  8.59s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  40%|███▉      | 2571/6434 [6:01:52<9:13:19,  8.59s/it, gpt_loss=0.337, loss_mean=0.331][A
+Train step of epoch 0:  40%|███▉      | 2572/6434 [6:01:52<9:14:31,  8.62s/it, gpt_loss=0.337, loss_mean=0.331][A
+Train step of epoch 0:  40%|███▉      | 2572/6434 [6:02:01<9:14:31,  8.62s/it, gpt_loss=0.267, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2573/6434 [6:02:01<9:14:24,  8.62s/it, gpt_loss=0.267, loss_mean=0.325][A
+Train step of epoch 0:  40%|███▉      | 2573/6434 [6:02:09<9:14:24,  8.62s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2574/6434 [6:02:09<9:02:17,  8.43s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2574/6434 [6:02:18<9:02:17,  8.43s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  40%|████      | 2575/6434 [6:02:18<9:17:58,  8.68s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  40%|████      | 2575/6434 [6:02:27<9:17:58,  8.68s/it, gpt_loss=0.287, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2576/6434 [6:02:27<9:13:28,  8.61s/it, gpt_loss=0.287, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2576/6434 [6:02:35<9:13:28,  8.61s/it, gpt_loss=0.331, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2577/6434 [6:02:35<9:10:19,  8.56s/it, gpt_loss=0.331, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2577/6434 [6:02:44<9:10:19,  8.56s/it, gpt_loss=0.251, loss_mean=0.311][A
+Train step of epoch 0:  40%|████      | 2578/6434 [6:02:44<9:12:48,  8.60s/it, gpt_loss=0.251, loss_mean=0.311][A
+Train step of epoch 0:  40%|████      | 2578/6434 [6:02:53<9:12:48,  8.60s/it, gpt_loss=0.283, loss_mean=0.308][A
+Train step of epoch 0:  40%|████      | 2579/6434 [6:02:53<9:16:43,  8.67s/it, gpt_loss=0.283, loss_mean=0.308][A
+[LID Router Debug] Step: 2580
+Batch Size: 10
+Audio Batch Size: 145
+LID Assignments: [9, 4, 9, 6, 2, 9, 4, 6, 2, 9]
+Active Experts in Batch: {9, 2, 4, 6}
+
+Train step of epoch 0:  40%|████      | 2579/6434 [6:03:02<9:16:43,  8.67s/it, gpt_loss=0.252, loss_mean=0.302][A
+Train step of epoch 0:  40%|████      | 2580/6434 [6:03:02<9:27:48,  8.84s/it, gpt_loss=0.252, loss_mean=0.302][A
+Train step of epoch 0:  40%|████      | 2580/6434 [6:03:11<9:27:48,  8.84s/it, gpt_loss=0.384, loss_mean=0.31] [A
+Train step of epoch 0:  40%|████      | 2581/6434 [6:03:11<9:33:35,  8.93s/it, gpt_loss=0.384, loss_mean=0.31][A
+Train step of epoch 0:  40%|████      | 2581/6434 [6:03:19<9:33:35,  8.93s/it, gpt_loss=0.322, loss_mean=0.312][A
+Train step of epoch 0:  40%|████      | 2582/6434 [6:03:19<9:06:28,  8.51s/it, gpt_loss=0.322, loss_mean=0.312][A
+Train step of epoch 0:  40%|████      | 2582/6434 [6:03:27<9:06:28,  8.51s/it, gpt_loss=0.36, loss_mean=0.316] [A
+Train step of epoch 0:  40%|████      | 2583/6434 [6:03:27<8:53:49,  8.32s/it, gpt_loss=0.36, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2583/6434 [6:03:35<8:53:49,  8.32s/it, gpt_loss=0.322, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2584/6434 [6:03:35<8:57:39,  8.38s/it, gpt_loss=0.322, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2584/6434 [6:03:44<8:57:39,  8.38s/it, gpt_loss=0.226, loss_mean=0.308][A
+Train step of epoch 0:  40%|████      | 2585/6434 [6:03:44<9:04:42,  8.49s/it, gpt_loss=0.226, loss_mean=0.308][A
+Train step of epoch 0:  40%|████      | 2585/6434 [6:03:53<9:04:42,  8.49s/it, gpt_loss=0.361, loss_mean=0.313][A
+Train step of epoch 0:  40%|████      | 2586/6434 [6:03:53<9:10:10,  8.58s/it, gpt_loss=0.361, loss_mean=0.313][A
+Train step of epoch 0:  40%|████      | 2586/6434 [6:04:01<9:10:10,  8.58s/it, gpt_loss=0.379, loss_mean=0.32] [A
+Train step of epoch 0:  40%|████      | 2587/6434 [6:04:01<9:15:00,  8.66s/it, gpt_loss=0.379, loss_mean=0.32][A
+Train step of epoch 0:  40%|████      | 2587/6434 [6:04:11<9:15:00,  8.66s/it, gpt_loss=0.286, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2588/6434 [6:04:11<9:39:15,  9.04s/it, gpt_loss=0.286, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2588/6434 [6:04:19<9:39:15,  9.04s/it, gpt_loss=0.309, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2589/6434 [6:04:19<9:21:52,  8.77s/it, gpt_loss=0.309, loss_mean=0.316][A
+[LID Router Debug] Step: 2590
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [1, 0, 9, 5, 5, 9, 9, 5, 3, 6]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+Train step of epoch 0:  40%|████      | 2589/6434 [6:04:27<9:21:52,  8.77s/it, gpt_loss=0.375, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2590/6434 [6:04:27<9:03:45,  8.49s/it, gpt_loss=0.375, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2590/6434 [6:04:36<9:03:45,  8.49s/it, gpt_loss=0.347, loss_mean=0.324][A
+Train step of epoch 0:  40%|████      | 2591/6434 [6:04:36<9:14:53,  8.66s/it, gpt_loss=0.347, loss_mean=0.324][A
+Train step of epoch 0:  40%|████      | 2591/6434 [6:04:46<9:14:53,  8.66s/it, gpt_loss=0.274, loss_mean=0.319][A
+Train step of epoch 0:  40%|████      | 2592/6434 [6:04:46<9:42:36,  9.10s/it, gpt_loss=0.274, loss_mean=0.319][A
+Train step of epoch 0:  40%|████      | 2592/6434 [6:04:54<9:42:36,  9.10s/it, gpt_loss=0.284, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2593/6434 [6:04:54<9:17:40,  8.71s/it, gpt_loss=0.284, loss_mean=0.316][A
+Train step of epoch 0:  40%|████      | 2593/6434 [6:05:03<9:17:40,  8.71s/it, gpt_loss=0.241, loss_mean=0.308][A
+Train step of epoch 0:  40%|████      | 2594/6434 [6:05:03<9:23:56,  8.81s/it, gpt_loss=0.241, loss_mean=0.308][A
+Train step of epoch 0:  40%|████      | 2594/6434 [6:05:12<9:23:56,  8.81s/it, gpt_loss=0.36, loss_mean=0.313] [A
+Train step of epoch 0:  40%|████      | 2595/6434 [6:05:12<9:26:46,  8.86s/it, gpt_loss=0.36, loss_mean=0.313][A
+Train step of epoch 0:  40%|████      | 2595/6434 [6:05:21<9:26:46,  8.86s/it, gpt_loss=0.353, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2596/6434 [6:05:21<9:18:06,  8.72s/it, gpt_loss=0.353, loss_mean=0.317][A
+Train step of epoch 0:  40%|████      | 2596/6434 [6:05:29<9:18:06,  8.72s/it, gpt_loss=0.239, loss_mean=0.31] [A
+Train step of epoch 0:  40%|████      | 2597/6434 [6:05:29<9:05:01,  8.52s/it, gpt_loss=0.239, loss_mean=0.31][A
+Train step of epoch 0:  40%|████      | 2597/6434 [6:05:36<9:05:01,  8.52s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  40%|████      | 2598/6434 [6:05:36<8:41:00,  8.15s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  40%|████      | 2598/6434 [6:05:44<8:41:00,  8.15s/it, gpt_loss=0.376, loss_mean=0.313][A
+Train step of epoch 0:  40%|████      | 2599/6434 [6:05:44<8:32:56,  8.03s/it, gpt_loss=0.376, loss_mean=0.313][A
+[LID Router Debug] Step: 2600
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [3, 5, 9, 6, 4, 5, 9, 4, 3, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+[2026-02-06 22:01:55,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=0, lr=[1.9227746464624966e-05, 1.9227746464624966e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 22:01:55,588] [INFO] [timer.py:260:stop] epoch=0/micro_step=2600/global_step=1300, RunningAvgSamplesPerSec=4.749064690233022, CurrSamplesPerSec=5.318516872188701, MemAllocated=12.55GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  40%|████      | 2599/6434 [6:05:51<8:32:56,  8.03s/it, gpt_loss=0.379, loss_mean=0.32] [A
+Train step of epoch 0:  40%|████      | 2600/6434 [6:05:51<8:19:34,  7.82s/it, gpt_loss=0.379, loss_mean=0.32][A
+Train step of epoch 0:  40%|████      | 2600/6434 [6:06:00<8:19:34,  7.82s/it, gpt_loss=0.413, loss_mean=0.329][A
+Train step of epoch 0:  40%|████      | 2601/6434 [6:06:00<8:35:58,  8.08s/it, gpt_loss=0.413, loss_mean=0.329][A
+Train step of epoch 0:  40%|████      | 2601/6434 [6:06:08<8:35:58,  8.08s/it, gpt_loss=0.34, loss_mean=0.33]  [A
+Train step of epoch 0:  40%|████      | 2602/6434 [6:06:08<8:39:46,  8.14s/it, gpt_loss=0.34, loss_mean=0.33][A
+Train step of epoch 0:  40%|████      | 2602/6434 [6:06:16<8:39:46,  8.14s/it, gpt_loss=0.325, loss_mean=0.33][A
+Train step of epoch 0:  40%|████      | 2603/6434 [6:06:16<8:37:38,  8.11s/it, gpt_loss=0.325, loss_mean=0.33][A
+Train step of epoch 0:  40%|████      | 2603/6434 [6:06:24<8:37:38,  8.11s/it, gpt_loss=0.254, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2604/6434 [6:06:24<8:42:35,  8.19s/it, gpt_loss=0.254, loss_mean=0.322][A
+Train step of epoch 0:  40%|████      | 2604/6434 [6:06:33<8:42:35,  8.19s/it, gpt_loss=0.366, loss_mean=0.327][A
+Train step of epoch 0:  40%|████      | 2605/6434 [6:06:33<8:43:47,  8.21s/it, gpt_loss=0.366, loss_mean=0.327][A
+Train step of epoch 0:  40%|████      | 2605/6434 [6:06:41<8:43:47,  8.21s/it, gpt_loss=0.298, loss_mean=0.324][A
+Train step of epoch 0:  41%|████      | 2606/6434 [6:06:41<8:44:34,  8.22s/it, gpt_loss=0.298, loss_mean=0.324][A
+Train step of epoch 0:  41%|████      | 2606/6434 [6:06:48<8:44:34,  8.22s/it, gpt_loss=0.265, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2607/6434 [6:06:48<8:16:41,  7.79s/it, gpt_loss=0.265, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2607/6434 [6:06:56<8:16:41,  7.79s/it, gpt_loss=0.339, loss_mean=0.32] [A
+Train step of epoch 0:  41%|████      | 2608/6434 [6:06:56<8:17:38,  7.80s/it, gpt_loss=0.339, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2608/6434 [6:07:05<8:17:38,  7.80s/it, gpt_loss=0.371, loss_mean=0.325][A
+Train step of epoch 0:  41%|████      | 2609/6434 [6:07:05<8:53:39,  8.37s/it, gpt_loss=0.371, loss_mean=0.325][A
+[LID Router Debug] Step: 2610
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [2, 2, 0, 1, 6, 4, 9, 1, 5, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  41%|████      | 2609/6434 [6:07:14<8:53:39,  8.37s/it, gpt_loss=0.379, loss_mean=0.33] [A
+Train step of epoch 0:  41%|████      | 2610/6434 [6:07:14<8:56:27,  8.42s/it, gpt_loss=0.379, loss_mean=0.33][A
+Train step of epoch 0:  41%|████      | 2610/6434 [6:07:23<8:56:27,  8.42s/it, gpt_loss=0.318, loss_mean=0.329][A
+Train step of epoch 0:  41%|████      | 2611/6434 [6:07:23<9:05:36,  8.56s/it, gpt_loss=0.318, loss_mean=0.329][A
+Train step of epoch 0:  41%|████      | 2611/6434 [6:07:32<9:05:36,  8.56s/it, gpt_loss=0.269, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2612/6434 [6:07:32<9:15:30,  8.72s/it, gpt_loss=0.269, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2612/6434 [6:07:39<9:15:30,  8.72s/it, gpt_loss=0.299, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2613/6434 [6:07:39<8:51:04,  8.34s/it, gpt_loss=0.299, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2613/6434 [6:07:47<8:51:04,  8.34s/it, gpt_loss=0.316, loss_mean=0.32] [A
+Train step of epoch 0:  41%|████      | 2614/6434 [6:07:47<8:44:25,  8.24s/it, gpt_loss=0.316, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2614/6434 [6:07:57<8:44:25,  8.24s/it, gpt_loss=0.294, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2615/6434 [6:07:57<9:15:16,  8.72s/it, gpt_loss=0.294, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2615/6434 [6:08:05<9:15:16,  8.72s/it, gpt_loss=0.302, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2616/6434 [6:08:05<8:58:54,  8.47s/it, gpt_loss=0.302, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2616/6434 [6:08:12<8:58:54,  8.47s/it, gpt_loss=0.238, loss_mean=0.308][A
+Train step of epoch 0:  41%|████      | 2617/6434 [6:08:12<8:35:58,  8.11s/it, gpt_loss=0.238, loss_mean=0.308][A
+Train step of epoch 0:  41%|████      | 2617/6434 [6:08:20<8:35:58,  8.11s/it, gpt_loss=0.313, loss_mean=0.309][A
+Train step of epoch 0:  41%|████      | 2618/6434 [6:08:20<8:34:48,  8.09s/it, gpt_loss=0.313, loss_mean=0.309][A
+Train step of epoch 0:  41%|████      | 2618/6434 [6:08:29<8:34:48,  8.09s/it, gpt_loss=0.4, loss_mean=0.318]  [A
+Train step of epoch 0:  41%|████      | 2619/6434 [6:08:29<8:43:56,  8.24s/it, gpt_loss=0.4, loss_mean=0.318][A
+[LID Router Debug] Step: 2620
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [3, 4, 3, 4, 5, 2, 1, 1, 5, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  41%|████      | 2619/6434 [6:08:37<8:43:56,  8.24s/it, gpt_loss=0.316, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2620/6434 [6:08:37<8:35:21,  8.11s/it, gpt_loss=0.316, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2620/6434 [6:08:44<8:35:21,  8.11s/it, gpt_loss=0.341, loss_mean=0.32] [A
+Train step of epoch 0:  41%|████      | 2621/6434 [6:08:44<8:28:13,  8.00s/it, gpt_loss=0.341, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2621/6434 [6:08:53<8:28:13,  8.00s/it, gpt_loss=0.316, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2622/6434 [6:08:53<8:31:14,  8.05s/it, gpt_loss=0.316, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2622/6434 [6:09:02<8:31:14,  8.05s/it, gpt_loss=0.353, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2623/6434 [6:09:02<8:51:26,  8.37s/it, gpt_loss=0.353, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2623/6434 [6:09:09<8:51:26,  8.37s/it, gpt_loss=0.328, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2624/6434 [6:09:09<8:39:05,  8.17s/it, gpt_loss=0.328, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2624/6434 [6:09:17<8:39:05,  8.17s/it, gpt_loss=0.35, loss_mean=0.326] [A
+Train step of epoch 0:  41%|████      | 2625/6434 [6:09:17<8:31:11,  8.05s/it, gpt_loss=0.35, loss_mean=0.326][A
+Train step of epoch 0:  41%|████      | 2625/6434 [6:09:26<8:31:11,  8.05s/it, gpt_loss=0.3, loss_mean=0.324] [A
+Train step of epoch 0:  41%|████      | 2626/6434 [6:09:26<8:43:14,  8.24s/it, gpt_loss=0.3, loss_mean=0.324][A
+Train step of epoch 0:  41%|████      | 2626/6434 [6:09:34<8:43:14,  8.24s/it, gpt_loss=0.363, loss_mean=0.327][A
+Train step of epoch 0:  41%|████      | 2627/6434 [6:09:34<8:39:10,  8.18s/it, gpt_loss=0.363, loss_mean=0.327][A
+Train step of epoch 0:  41%|████      | 2627/6434 [6:09:41<8:39:10,  8.18s/it, gpt_loss=0.319, loss_mean=0.327][A
+Train step of epoch 0:  41%|████      | 2628/6434 [6:09:41<8:22:16,  7.92s/it, gpt_loss=0.319, loss_mean=0.327][A
+Train step of epoch 0:  41%|████      | 2628/6434 [6:09:51<8:22:16,  7.92s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2629/6434 [6:09:51<9:00:12,  8.52s/it, gpt_loss=0.292, loss_mean=0.323][A
+[LID Router Debug] Step: 2630
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [3, 4, 4, 1, 3, 9, 5, 1, 1, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  41%|████      | 2629/6434 [6:09:59<9:00:12,  8.52s/it, gpt_loss=0.24, loss_mean=0.315] [A
+Train step of epoch 0:  41%|████      | 2630/6434 [6:09:59<8:49:39,  8.35s/it, gpt_loss=0.24, loss_mean=0.315][A
+Train step of epoch 0:  41%|████      | 2630/6434 [6:10:07<8:49:39,  8.35s/it, gpt_loss=0.331, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2631/6434 [6:10:07<8:46:31,  8.31s/it, gpt_loss=0.331, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2631/6434 [6:10:15<8:46:31,  8.31s/it, gpt_loss=0.323, loss_mean=0.317][A
+Train step of epoch 0:  41%|████      | 2632/6434 [6:10:15<8:33:28,  8.10s/it, gpt_loss=0.323, loss_mean=0.317][A
+Train step of epoch 0:  41%|████      | 2632/6434 [6:10:23<8:33:28,  8.10s/it, gpt_loss=0.347, loss_mean=0.32] [A
+Train step of epoch 0:  41%|████      | 2633/6434 [6:10:23<8:34:02,  8.11s/it, gpt_loss=0.347, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2633/6434 [6:10:32<8:34:02,  8.11s/it, gpt_loss=0.324, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2634/6434 [6:10:32<8:44:14,  8.28s/it, gpt_loss=0.324, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2634/6434 [6:10:39<8:44:14,  8.28s/it, gpt_loss=0.319, loss_mean=0.32] [A
+Train step of epoch 0:  41%|████      | 2635/6434 [6:10:39<8:29:10,  8.04s/it, gpt_loss=0.319, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2635/6434 [6:10:47<8:29:10,  8.04s/it, gpt_loss=0.318, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2636/6434 [6:10:47<8:23:51,  7.96s/it, gpt_loss=0.318, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2636/6434 [6:10:55<8:23:51,  7.96s/it, gpt_loss=0.263, loss_mean=0.314][A
+Train step of epoch 0:  41%|████      | 2637/6434 [6:10:55<8:27:50,  8.02s/it, gpt_loss=0.263, loss_mean=0.314][A
+Train step of epoch 0:  41%|████      | 2637/6434 [6:11:04<8:27:50,  8.02s/it, gpt_loss=0.332, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2638/6434 [6:11:04<8:44:20,  8.29s/it, gpt_loss=0.332, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2638/6434 [6:11:13<8:44:20,  8.29s/it, gpt_loss=0.299, loss_mean=0.314][A
+Train step of epoch 0:  41%|████      | 2639/6434 [6:11:13<8:46:08,  8.32s/it, gpt_loss=0.299, loss_mean=0.314][A
+[LID Router Debug] Step: 2640
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [4, 3, 4, 9, 0, 2, 0, 0, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  41%|████      | 2639/6434 [6:11:20<8:46:08,  8.32s/it, gpt_loss=0.331, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2640/6434 [6:11:20<8:35:49,  8.16s/it, gpt_loss=0.331, loss_mean=0.316][A
+Train step of epoch 0:  41%|████      | 2640/6434 [6:11:29<8:35:49,  8.16s/it, gpt_loss=0.384, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2641/6434 [6:11:29<8:50:44,  8.40s/it, gpt_loss=0.384, loss_mean=0.323][A
+Train step of epoch 0:  41%|████      | 2641/6434 [6:11:38<8:50:44,  8.40s/it, gpt_loss=0.273, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2642/6434 [6:11:38<8:57:40,  8.51s/it, gpt_loss=0.273, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2642/6434 [6:11:48<8:57:40,  8.51s/it, gpt_loss=0.359, loss_mean=0.322][A
+Train step of epoch 0:  41%|████      | 2643/6434 [6:11:48<9:18:17,  8.84s/it, gpt_loss=0.359, loss_mean=0.322][A
+Train step of epoch 0:  41%|████      | 2643/6434 [6:11:55<9:18:17,  8.84s/it, gpt_loss=0.285, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2644/6434 [6:11:55<8:58:23,  8.52s/it, gpt_loss=0.285, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2644/6434 [6:12:05<8:58:23,  8.52s/it, gpt_loss=0.34, loss_mean=0.32]  [A
+Train step of epoch 0:  41%|████      | 2645/6434 [6:12:05<9:21:04,  8.88s/it, gpt_loss=0.34, loss_mean=0.32][A
+Train step of epoch 0:  41%|████      | 2645/6434 [6:12:13<9:21:04,  8.88s/it, gpt_loss=0.292, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2646/6434 [6:12:13<9:05:24,  8.64s/it, gpt_loss=0.292, loss_mean=0.318][A
+Train step of epoch 0:  41%|████      | 2646/6434 [6:12:22<9:05:24,  8.64s/it, gpt_loss=0.419, loss_mean=0.328][A
+Train step of epoch 0:  41%|████      | 2647/6434 [6:12:22<9:02:45,  8.60s/it, gpt_loss=0.419, loss_mean=0.328][A
+Train step of epoch 0:  41%|████      | 2647/6434 [6:12:30<9:02:45,  8.60s/it, gpt_loss=0.275, loss_mean=0.322][A
+Train step of epoch 0:  41%|████      | 2648/6434 [6:12:30<9:00:56,  8.57s/it, gpt_loss=0.275, loss_mean=0.322][A
+Train step of epoch 0:  41%|████      | 2648/6434 [6:12:39<9:00:56,  8.57s/it, gpt_loss=0.337, loss_mean=0.324][A
+Train step of epoch 0:  41%|████      | 2649/6434 [6:12:39<9:02:27,  8.60s/it, gpt_loss=0.337, loss_mean=0.324][A
+[LID Router Debug] Step: 2650
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [0, 6, 5, 4, 1, 3, 6, 3, 5, 6]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6}
+
+Train step of epoch 0:  41%|████      | 2649/6434 [6:12:48<9:02:27,  8.60s/it, gpt_loss=0.299, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2650/6434 [6:12:48<9:06:45,  8.67s/it, gpt_loss=0.299, loss_mean=0.321][A
+Train step of epoch 0:  41%|████      | 2650/6434 [6:12:56<9:06:45,  8.67s/it, gpt_loss=0.38, loss_mean=0.327] [A
+Train step of epoch 0:  41%|████      | 2651/6434 [6:12:56<8:59:47,  8.56s/it, gpt_loss=0.38, loss_mean=0.327][A
+Train step of epoch 0:  41%|████      | 2651/6434 [6:13:04<8:59:47,  8.56s/it, gpt_loss=0.409, loss_mean=0.335][A
+Train step of epoch 0:  41%|████      | 2652/6434 [6:13:04<8:49:48,  8.41s/it, gpt_loss=0.409, loss_mean=0.335][A
+Train step of epoch 0:  41%|████      | 2652/6434 [6:13:12<8:49:48,  8.41s/it, gpt_loss=0.277, loss_mean=0.33] [A
+Train step of epoch 0:  41%|████      | 2653/6434 [6:13:12<8:33:09,  8.14s/it, gpt_loss=0.277, loss_mean=0.33][A
+Train step of epoch 0:  41%|████      | 2653/6434 [6:13:20<8:33:09,  8.14s/it, gpt_loss=0.381, loss_mean=0.335][A
+Train step of epoch 0:  41%|████      | 2654/6434 [6:13:20<8:42:23,  8.29s/it, gpt_loss=0.381, loss_mean=0.335][A
+Train step of epoch 0:  41%|████      | 2654/6434 [6:13:29<8:42:23,  8.29s/it, gpt_loss=0.269, loss_mean=0.328][A
+Train step of epoch 0:  41%|████▏     | 2655/6434 [6:13:29<8:47:05,  8.37s/it, gpt_loss=0.269, loss_mean=0.328][A
+Train step of epoch 0:  41%|████▏     | 2655/6434 [6:13:38<8:47:05,  8.37s/it, gpt_loss=0.302, loss_mean=0.326][A
+Train step of epoch 0:  41%|████▏     | 2656/6434 [6:13:38<9:06:00,  8.67s/it, gpt_loss=0.302, loss_mean=0.326][A
+Train step of epoch 0:  41%|████▏     | 2656/6434 [6:13:47<9:06:00,  8.67s/it, gpt_loss=0.318, loss_mean=0.325][A
+Train step of epoch 0:  41%|████▏     | 2657/6434 [6:13:47<9:18:17,  8.87s/it, gpt_loss=0.318, loss_mean=0.325][A
+Train step of epoch 0:  41%|████▏     | 2657/6434 [6:13:56<9:18:17,  8.87s/it, gpt_loss=0.229, loss_mean=0.315][A
+Train step of epoch 0:  41%|████▏     | 2658/6434 [6:13:56<9:17:50,  8.86s/it, gpt_loss=0.229, loss_mean=0.315][A
+Train step of epoch 0:  41%|████▏     | 2658/6434 [6:14:05<9:17:50,  8.86s/it, gpt_loss=0.374, loss_mean=0.321][A
+Train step of epoch 0:  41%|████▏     | 2659/6434 [6:14:05<9:07:12,  8.70s/it, gpt_loss=0.374, loss_mean=0.321][A
+[LID Router Debug] Step: 2660
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [2, 4, 9, 9, 9, 3, 5, 6, 2, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  41%|████▏     | 2659/6434 [6:14:13<9:07:12,  8.70s/it, gpt_loss=0.305, loss_mean=0.319][A
+Train step of epoch 0:  41%|████▏     | 2660/6434 [6:14:13<9:08:08,  8.71s/it, gpt_loss=0.305, loss_mean=0.319][A
+Train step of epoch 0:  41%|████▏     | 2660/6434 [6:14:22<9:08:08,  8.71s/it, gpt_loss=0.314, loss_mean=0.319][A
+Train step of epoch 0:  41%|████▏     | 2661/6434 [6:14:22<9:06:15,  8.69s/it, gpt_loss=0.314, loss_mean=0.319][A
+Train step of epoch 0:  41%|████▏     | 2661/6434 [6:14:30<9:06:15,  8.69s/it, gpt_loss=0.276, loss_mean=0.315][A
+Train step of epoch 0:  41%|████▏     | 2662/6434 [6:14:30<8:47:32,  8.39s/it, gpt_loss=0.276, loss_mean=0.315][A
+Train step of epoch 0:  41%|████▏     | 2662/6434 [6:14:38<8:47:32,  8.39s/it, gpt_loss=0.257, loss_mean=0.309][A
+Train step of epoch 0:  41%|████▏     | 2663/6434 [6:14:38<8:37:14,  8.23s/it, gpt_loss=0.257, loss_mean=0.309][A
+Train step of epoch 0:  41%|████▏     | 2663/6434 [6:14:46<8:37:14,  8.23s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  41%|████▏     | 2664/6434 [6:14:46<8:48:34,  8.41s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  41%|████▏     | 2664/6434 [6:14:55<8:48:34,  8.41s/it, gpt_loss=0.234, loss_mean=0.299][A
+Train step of epoch 0:  41%|████▏     | 2665/6434 [6:14:55<8:43:54,  8.34s/it, gpt_loss=0.234, loss_mean=0.299][A
+Train step of epoch 0:  41%|████▏     | 2665/6434 [6:15:02<8:43:54,  8.34s/it, gpt_loss=0.348, loss_mean=0.304][A
+Train step of epoch 0:  41%|████▏     | 2666/6434 [6:15:02<8:24:31,  8.03s/it, gpt_loss=0.348, loss_mean=0.304][A
+Train step of epoch 0:  41%|████▏     | 2666/6434 [6:15:11<8:24:31,  8.03s/it, gpt_loss=0.239, loss_mean=0.297][A
+Train step of epoch 0:  41%|████▏     | 2667/6434 [6:15:11<8:50:27,  8.45s/it, gpt_loss=0.239, loss_mean=0.297][A
+Train step of epoch 0:  41%|████▏     | 2667/6434 [6:15:20<8:50:27,  8.45s/it, gpt_loss=0.341, loss_mean=0.302][A
+Train step of epoch 0:  41%|████▏     | 2668/6434 [6:15:20<8:52:36,  8.49s/it, gpt_loss=0.341, loss_mean=0.302][A
+Train step of epoch 0:  41%|████▏     | 2668/6434 [6:15:29<8:52:36,  8.49s/it, gpt_loss=0.351, loss_mean=0.306][A
+Train step of epoch 0:  41%|████▏     | 2669/6434 [6:15:29<9:01:50,  8.63s/it, gpt_loss=0.351, loss_mean=0.306][A
+[LID Router Debug] Step: 2670
+Batch Size: 10
+Audio Batch Size: 143
+LID Assignments: [3, 9, 5, 8, 6, 9, 4, 9, 0, 6]
+Active Experts in Batch: {0, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  41%|████▏     | 2669/6434 [6:15:38<9:01:50,  8.63s/it, gpt_loss=0.263, loss_mean=0.302][A
+Train step of epoch 0:  41%|████▏     | 2670/6434 [6:15:38<9:06:18,  8.71s/it, gpt_loss=0.263, loss_mean=0.302][A
+Train step of epoch 0:  41%|████▏     | 2670/6434 [6:15:46<9:06:18,  8.71s/it, gpt_loss=0.225, loss_mean=0.294][A
+Train step of epoch 0:  42%|████▏     | 2671/6434 [6:15:46<9:04:58,  8.69s/it, gpt_loss=0.225, loss_mean=0.294][A
+Train step of epoch 0:  42%|████▏     | 2671/6434 [6:15:56<9:04:58,  8.69s/it, gpt_loss=0.323, loss_mean=0.297][A
+Train step of epoch 0:  42%|████▏     | 2672/6434 [6:15:56<9:12:45,  8.82s/it, gpt_loss=0.323, loss_mean=0.297][A
+Train step of epoch 0:  42%|████▏     | 2672/6434 [6:16:04<9:12:45,  8.82s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  42%|████▏     | 2673/6434 [6:16:04<9:03:07,  8.66s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  42%|████▏     | 2673/6434 [6:16:12<9:03:07,  8.66s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  42%|████▏     | 2674/6434 [6:16:12<8:49:41,  8.45s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  42%|████▏     | 2674/6434 [6:16:21<8:49:41,  8.45s/it, gpt_loss=0.421, loss_mean=0.31] [A
+Train step of epoch 0:  42%|████▏     | 2675/6434 [6:16:21<9:00:32,  8.63s/it, gpt_loss=0.421, loss_mean=0.31][A
+Train step of epoch 0:  42%|████▏     | 2675/6434 [6:16:29<9:00:32,  8.63s/it, gpt_loss=0.346, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2676/6434 [6:16:29<8:47:33,  8.42s/it, gpt_loss=0.346, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2676/6434 [6:16:38<8:47:33,  8.42s/it, gpt_loss=0.307, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2677/6434 [6:16:38<9:02:31,  8.66s/it, gpt_loss=0.307, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2677/6434 [6:16:45<9:02:31,  8.66s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2678/6434 [6:16:45<8:39:28,  8.30s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  42%|████▏     | 2678/6434 [6:16:55<8:39:28,  8.30s/it, gpt_loss=0.418, loss_mean=0.324][A
+Train step of epoch 0:  42%|████▏     | 2679/6434 [6:16:55<8:54:18,  8.54s/it, gpt_loss=0.418, loss_mean=0.324][A
+[LID Router Debug] Step: 2680
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [3, 4, 9, 1, 3, 4, 1, 3, 1, 3]
+Active Experts in Batch: {9, 3, 4, 1}
+
+Train step of epoch 0:  42%|████▏     | 2679/6434 [6:17:03<8:54:18,  8.54s/it, gpt_loss=0.275, loss_mean=0.319][A
+Train step of epoch 0:  42%|████▏     | 2680/6434 [6:17:03<9:00:49,  8.64s/it, gpt_loss=0.275, loss_mean=0.319][A
+Train step of epoch 0:  42%|████▏     | 2680/6434 [6:17:11<9:00:49,  8.64s/it, gpt_loss=0.271, loss_mean=0.314][A
+Train step of epoch 0:  42%|████▏     | 2681/6434 [6:17:11<8:37:31,  8.27s/it, gpt_loss=0.271, loss_mean=0.314][A
+Train step of epoch 0:  42%|████▏     | 2681/6434 [6:17:20<8:37:31,  8.27s/it, gpt_loss=0.342, loss_mean=0.317][A
+Train step of epoch 0:  42%|████▏     | 2682/6434 [6:17:20<8:45:40,  8.41s/it, gpt_loss=0.342, loss_mean=0.317][A
+Train step of epoch 0:  42%|████▏     | 2682/6434 [6:17:27<8:45:40,  8.41s/it, gpt_loss=0.331, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2683/6434 [6:17:27<8:34:34,  8.23s/it, gpt_loss=0.331, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2683/6434 [6:17:35<8:34:34,  8.23s/it, gpt_loss=0.337, loss_mean=0.32] [A
+Train step of epoch 0:  42%|████▏     | 2684/6434 [6:17:35<8:28:04,  8.13s/it, gpt_loss=0.337, loss_mean=0.32][A
+Train step of epoch 0:  42%|████▏     | 2684/6434 [6:17:44<8:28:04,  8.13s/it, gpt_loss=0.272, loss_mean=0.315][A
+Train step of epoch 0:  42%|████▏     | 2685/6434 [6:17:44<8:37:51,  8.29s/it, gpt_loss=0.272, loss_mean=0.315][A
+Train step of epoch 0:  42%|████▏     | 2685/6434 [6:17:53<8:37:51,  8.29s/it, gpt_loss=0.305, loss_mean=0.314][A
+Train step of epoch 0:  42%|████▏     | 2686/6434 [6:17:53<8:51:41,  8.51s/it, gpt_loss=0.305, loss_mean=0.314][A
+Train step of epoch 0:  42%|████▏     | 2686/6434 [6:18:01<8:51:41,  8.51s/it, gpt_loss=0.281, loss_mean=0.311][A
+Train step of epoch 0:  42%|████▏     | 2687/6434 [6:18:01<8:48:34,  8.46s/it, gpt_loss=0.281, loss_mean=0.311][A
+Train step of epoch 0:  42%|████▏     | 2687/6434 [6:18:10<8:48:34,  8.46s/it, gpt_loss=0.304, loss_mean=0.31] [A
+Train step of epoch 0:  42%|████▏     | 2688/6434 [6:18:10<8:47:43,  8.45s/it, gpt_loss=0.304, loss_mean=0.31][A
+Train step of epoch 0:  42%|████▏     | 2688/6434 [6:18:17<8:47:43,  8.45s/it, gpt_loss=0.308, loss_mean=0.31][A
+Train step of epoch 0:  42%|████▏     | 2689/6434 [6:18:17<8:29:19,  8.16s/it, gpt_loss=0.308, loss_mean=0.31][A
+[LID Router Debug] Step: 2690
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 4, 5, 9, 3, 0, 4, 5, 0, 9]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+Train step of epoch 0:  42%|████▏     | 2689/6434 [6:18:26<8:29:19,  8.16s/it, gpt_loss=0.28, loss_mean=0.307][A
+Train step of epoch 0:  42%|████▏     | 2690/6434 [6:18:26<8:33:54,  8.24s/it, gpt_loss=0.28, loss_mean=0.307][A
+Train step of epoch 0:  42%|████▏     | 2690/6434 [6:18:33<8:33:54,  8.24s/it, gpt_loss=0.303, loss_mean=0.307][A
+Train step of epoch 0:  42%|████▏     | 2691/6434 [6:18:33<8:15:09,  7.94s/it, gpt_loss=0.303, loss_mean=0.307][A
+Train step of epoch 0:  42%|████▏     | 2691/6434 [6:18:41<8:15:09,  7.94s/it, gpt_loss=0.424, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2692/6434 [6:18:41<8:20:44,  8.03s/it, gpt_loss=0.424, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2692/6434 [6:18:50<8:20:44,  8.03s/it, gpt_loss=0.423, loss_mean=0.329][A
+Train step of epoch 0:  42%|████▏     | 2693/6434 [6:18:50<8:32:22,  8.22s/it, gpt_loss=0.423, loss_mean=0.329][A
+Train step of epoch 0:  42%|████▏     | 2693/6434 [6:18:59<8:32:22,  8.22s/it, gpt_loss=0.292, loss_mean=0.325][A
+Train step of epoch 0:  42%|████▏     | 2694/6434 [6:18:59<8:56:30,  8.61s/it, gpt_loss=0.292, loss_mean=0.325][A
+Train step of epoch 0:  42%|████▏     | 2694/6434 [6:19:07<8:56:30,  8.61s/it, gpt_loss=0.365, loss_mean=0.329][A
+Train step of epoch 0:  42%|████▏     | 2695/6434 [6:19:07<8:40:04,  8.35s/it, gpt_loss=0.365, loss_mean=0.329][A
+Train step of epoch 0:  42%|████▏     | 2695/6434 [6:19:15<8:40:04,  8.35s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2696/6434 [6:19:15<8:35:35,  8.28s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2696/6434 [6:19:23<8:35:35,  8.28s/it, gpt_loss=0.362, loss_mean=0.334][A
+Train step of epoch 0:  42%|████▏     | 2697/6434 [6:19:23<8:24:31,  8.10s/it, gpt_loss=0.362, loss_mean=0.334][A
+Train step of epoch 0:  42%|████▏     | 2697/6434 [6:19:31<8:24:31,  8.10s/it, gpt_loss=0.34, loss_mean=0.334] [A
+Train step of epoch 0:  42%|████▏     | 2698/6434 [6:19:31<8:27:56,  8.16s/it, gpt_loss=0.34, loss_mean=0.334][A
+Train step of epoch 0:  42%|████▏     | 2698/6434 [6:19:39<8:27:56,  8.16s/it, gpt_loss=0.406, loss_mean=0.341][A
+Train step of epoch 0:  42%|████▏     | 2699/6434 [6:19:39<8:28:23,  8.17s/it, gpt_loss=0.406, loss_mean=0.341][A
+[LID Router Debug] Step: 2700
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [1, 9, 5, 4, 3, 6, 0, 0, 5, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  42%|████▏     | 2699/6434 [6:19:48<8:28:23,  8.17s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  42%|████▏     | 2700/6434 [6:19:48<8:44:03,  8.42s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  42%|████▏     | 2700/6434 [6:19:56<8:44:03,  8.42s/it, gpt_loss=0.329, loss_mean=0.338][A
+Train step of epoch 0:  42%|████▏     | 2701/6434 [6:19:56<8:35:20,  8.28s/it, gpt_loss=0.329, loss_mean=0.338][A
+Train step of epoch 0:  42%|████▏     | 2701/6434 [6:20:05<8:35:20,  8.28s/it, gpt_loss=0.323, loss_mean=0.336][A
+Train step of epoch 0:  42%|████▏     | 2702/6434 [6:20:05<8:41:52,  8.39s/it, gpt_loss=0.323, loss_mean=0.336][A
+Train step of epoch 0:  42%|████▏     | 2702/6434 [6:20:13<8:41:52,  8.39s/it, gpt_loss=0.441, loss_mean=0.347][A
+Train step of epoch 0:  42%|████▏     | 2703/6434 [6:20:13<8:38:12,  8.33s/it, gpt_loss=0.441, loss_mean=0.347][A
+Train step of epoch 0:  42%|████▏     | 2703/6434 [6:20:23<8:38:12,  8.33s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  42%|████▏     | 2704/6434 [6:20:23<9:06:25,  8.79s/it, gpt_loss=0.356, loss_mean=0.348][A
+Train step of epoch 0:  42%|████▏     | 2704/6434 [6:20:31<9:06:25,  8.79s/it, gpt_loss=0.339, loss_mean=0.347][A
+Train step of epoch 0:  42%|████▏     | 2705/6434 [6:20:31<8:56:56,  8.64s/it, gpt_loss=0.339, loss_mean=0.347][A
+Train step of epoch 0:  42%|████▏     | 2705/6434 [6:20:39<8:56:56,  8.64s/it, gpt_loss=0.387, loss_mean=0.351][A
+Train step of epoch 0:  42%|████▏     | 2706/6434 [6:20:39<8:48:17,  8.50s/it, gpt_loss=0.387, loss_mean=0.351][A
+Train step of epoch 0:  42%|████▏     | 2706/6434 [6:20:47<8:48:17,  8.50s/it, gpt_loss=0.244, loss_mean=0.34] [A
+Train step of epoch 0:  42%|████▏     | 2707/6434 [6:20:47<8:38:07,  8.34s/it, gpt_loss=0.244, loss_mean=0.34][A
+Train step of epoch 0:  42%|████▏     | 2707/6434 [6:20:55<8:38:07,  8.34s/it, gpt_loss=0.293, loss_mean=0.335][A
+Train step of epoch 0:  42%|████▏     | 2708/6434 [6:20:55<8:22:28,  8.09s/it, gpt_loss=0.293, loss_mean=0.335][A
+Train step of epoch 0:  42%|████▏     | 2708/6434 [6:21:03<8:22:28,  8.09s/it, gpt_loss=0.27, loss_mean=0.329] [A
+Train step of epoch 0:  42%|████▏     | 2709/6434 [6:21:03<8:21:52,  8.08s/it, gpt_loss=0.27, loss_mean=0.329][A
+[LID Router Debug] Step: 2710
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [4, 9, 6, 2, 2, 4, 4, 3, 9, 2]
+Active Experts in Batch: {2, 3, 4, 6, 9}
+
+Train step of epoch 0:  42%|████▏     | 2709/6434 [6:21:11<8:21:52,  8.08s/it, gpt_loss=0.288, loss_mean=0.325][A
+Train step of epoch 0:  42%|████▏     | 2710/6434 [6:21:11<8:13:20,  7.95s/it, gpt_loss=0.288, loss_mean=0.325][A
+Train step of epoch 0:  42%|████▏     | 2710/6434 [6:21:20<8:13:20,  7.95s/it, gpt_loss=0.394, loss_mean=0.332][A
+Train step of epoch 0:  42%|████▏     | 2711/6434 [6:21:20<8:32:54,  8.27s/it, gpt_loss=0.394, loss_mean=0.332][A
+Train step of epoch 0:  42%|████▏     | 2711/6434 [6:21:27<8:32:54,  8.27s/it, gpt_loss=0.329, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2712/6434 [6:21:27<8:22:15,  8.10s/it, gpt_loss=0.329, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2712/6434 [6:21:36<8:22:15,  8.10s/it, gpt_loss=0.285, loss_mean=0.327][A
+Train step of epoch 0:  42%|████▏     | 2713/6434 [6:21:36<8:39:55,  8.38s/it, gpt_loss=0.285, loss_mean=0.327][A
+Train step of epoch 0:  42%|████▏     | 2713/6434 [6:21:44<8:39:55,  8.38s/it, gpt_loss=0.334, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2714/6434 [6:21:44<8:20:49,  8.08s/it, gpt_loss=0.334, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2714/6434 [6:21:53<8:20:49,  8.08s/it, gpt_loss=0.337, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2715/6434 [6:21:53<8:34:59,  8.31s/it, gpt_loss=0.337, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2715/6434 [6:22:01<8:34:59,  8.31s/it, gpt_loss=0.309, loss_mean=0.326][A
+Train step of epoch 0:  42%|████▏     | 2716/6434 [6:22:01<8:38:05,  8.36s/it, gpt_loss=0.309, loss_mean=0.326][A
+Train step of epoch 0:  42%|████▏     | 2716/6434 [6:22:10<8:38:05,  8.36s/it, gpt_loss=0.339, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2717/6434 [6:22:10<8:49:53,  8.55s/it, gpt_loss=0.339, loss_mean=0.328][A
+Train step of epoch 0:  42%|████▏     | 2717/6434 [6:22:19<8:49:53,  8.55s/it, gpt_loss=0.431, loss_mean=0.338][A
+Train step of epoch 0:  42%|████▏     | 2718/6434 [6:22:19<8:55:31,  8.65s/it, gpt_loss=0.431, loss_mean=0.338][A
+Train step of epoch 0:  42%|████▏     | 2718/6434 [6:22:27<8:55:31,  8.65s/it, gpt_loss=0.283, loss_mean=0.333][A
+Train step of epoch 0:  42%|████▏     | 2719/6434 [6:22:27<8:43:48,  8.46s/it, gpt_loss=0.283, loss_mean=0.333][A
+[LID Router Debug] Step: 2720
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [5, 6, 3, 9, 4, 4, 3, 5, 6, 1]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  42%|████▏     | 2719/6434 [6:22:36<8:43:48,  8.46s/it, gpt_loss=0.307, loss_mean=0.33] [A
+Train step of epoch 0:  42%|████▏     | 2720/6434 [6:22:36<8:51:57,  8.59s/it, gpt_loss=0.307, loss_mean=0.33][A
+Train step of epoch 0:  42%|████▏     | 2720/6434 [6:22:44<8:51:57,  8.59s/it, gpt_loss=0.257, loss_mean=0.323][A
+Train step of epoch 0:  42%|████▏     | 2721/6434 [6:22:44<8:52:29,  8.60s/it, gpt_loss=0.257, loss_mean=0.323][A
+Train step of epoch 0:  42%|████▏     | 2721/6434 [6:22:53<8:52:29,  8.60s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2722/6434 [6:22:53<8:42:51,  8.45s/it, gpt_loss=0.277, loss_mean=0.318][A
+Train step of epoch 0:  42%|████▏     | 2722/6434 [6:23:00<8:42:51,  8.45s/it, gpt_loss=0.333, loss_mean=0.32] [A
+Train step of epoch 0:  42%|████▏     | 2723/6434 [6:23:00<8:29:42,  8.24s/it, gpt_loss=0.333, loss_mean=0.32][A
+Train step of epoch 0:  42%|████▏     | 2723/6434 [6:23:08<8:29:42,  8.24s/it, gpt_loss=0.336, loss_mean=0.321][A
+Train step of epoch 0:  42%|████▏     | 2724/6434 [6:23:08<8:14:09,  7.99s/it, gpt_loss=0.336, loss_mean=0.321][A
+Train step of epoch 0:  42%|████▏     | 2724/6434 [6:23:16<8:14:09,  7.99s/it, gpt_loss=0.322, loss_mean=0.321][A
+Train step of epoch 0:  42%|████▏     | 2725/6434 [6:23:16<8:21:32,  8.11s/it, gpt_loss=0.322, loss_mean=0.321][A
+Train step of epoch 0:  42%|████▏     | 2725/6434 [6:23:25<8:21:32,  8.11s/it, gpt_loss=0.376, loss_mean=0.327][A
+Train step of epoch 0:  42%|████▏     | 2726/6434 [6:23:25<8:28:06,  8.22s/it, gpt_loss=0.376, loss_mean=0.327][A
+Train step of epoch 0:  42%|████▏     | 2726/6434 [6:23:33<8:28:06,  8.22s/it, gpt_loss=0.279, loss_mean=0.322][A
+Train step of epoch 0:  42%|████▏     | 2727/6434 [6:23:33<8:31:35,  8.28s/it, gpt_loss=0.279, loss_mean=0.322][A
+Train step of epoch 0:  42%|████▏     | 2727/6434 [6:23:42<8:31:35,  8.28s/it, gpt_loss=0.441, loss_mean=0.334][A
+Train step of epoch 0:  42%|████▏     | 2728/6434 [6:23:42<8:52:43,  8.62s/it, gpt_loss=0.441, loss_mean=0.334][A
+Train step of epoch 0:  42%|████▏     | 2728/6434 [6:23:51<8:52:43,  8.62s/it, gpt_loss=0.367, loss_mean=0.337][A
+Train step of epoch 0:  42%|████▏     | 2729/6434 [6:23:51<8:53:42,  8.64s/it, gpt_loss=0.367, loss_mean=0.337][A
+[LID Router Debug] Step: 2730
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 0, 1, 2, 9, 4, 5, 9, 6, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  42%|████▏     | 2729/6434 [6:24:00<8:53:42,  8.64s/it, gpt_loss=0.416, loss_mean=0.345][A
+Train step of epoch 0:  42%|████▏     | 2730/6434 [6:24:00<8:54:35,  8.66s/it, gpt_loss=0.416, loss_mean=0.345][A
+Train step of epoch 0:  42%|████▏     | 2730/6434 [6:24:08<8:54:35,  8.66s/it, gpt_loss=0.321, loss_mean=0.343][A
+Train step of epoch 0:  42%|████▏     | 2731/6434 [6:24:08<8:47:31,  8.55s/it, gpt_loss=0.321, loss_mean=0.343][A
+Train step of epoch 0:  42%|████▏     | 2731/6434 [6:24:16<8:47:31,  8.55s/it, gpt_loss=0.24, loss_mean=0.332] [A
+Train step of epoch 0:  42%|████▏     | 2732/6434 [6:24:16<8:40:51,  8.44s/it, gpt_loss=0.24, loss_mean=0.332][A
+Train step of epoch 0:  42%|████▏     | 2732/6434 [6:24:25<8:40:51,  8.44s/it, gpt_loss=0.364, loss_mean=0.335][A
+Train step of epoch 0:  42%|████▏     | 2733/6434 [6:24:25<8:49:08,  8.58s/it, gpt_loss=0.364, loss_mean=0.335][A
+Train step of epoch 0:  42%|████▏     | 2733/6434 [6:24:33<8:49:08,  8.58s/it, gpt_loss=0.291, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2734/6434 [6:24:33<8:42:15,  8.47s/it, gpt_loss=0.291, loss_mean=0.331][A
+Train step of epoch 0:  42%|████▏     | 2734/6434 [6:24:42<8:42:15,  8.47s/it, gpt_loss=0.34, loss_mean=0.332] [A
+Train step of epoch 0:  43%|████▎     | 2735/6434 [6:24:42<8:36:39,  8.38s/it, gpt_loss=0.34, loss_mean=0.332][A
+Train step of epoch 0:  43%|████▎     | 2735/6434 [6:24:50<8:36:39,  8.38s/it, gpt_loss=0.366, loss_mean=0.335][A
+Train step of epoch 0:  43%|████▎     | 2736/6434 [6:24:50<8:36:33,  8.38s/it, gpt_loss=0.366, loss_mean=0.335][A
+Train step of epoch 0:  43%|████▎     | 2736/6434 [6:24:59<8:36:33,  8.38s/it, gpt_loss=0.227, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2737/6434 [6:24:59<8:40:57,  8.45s/it, gpt_loss=0.227, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2737/6434 [6:25:08<8:40:57,  8.45s/it, gpt_loss=0.268, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2738/6434 [6:25:08<8:57:19,  8.72s/it, gpt_loss=0.268, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2738/6434 [6:25:18<8:57:19,  8.72s/it, gpt_loss=0.374, loss_mean=0.324][A
+Train step of epoch 0:  43%|████▎     | 2739/6434 [6:25:18<9:19:30,  9.09s/it, gpt_loss=0.374, loss_mean=0.324][A
+[LID Router Debug] Step: 2740
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [0, 3, 5, 9, 3, 1, 0, 3, 4, 9]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  43%|████▎     | 2739/6434 [6:25:29<9:19:30,  9.09s/it, gpt_loss=0.237, loss_mean=0.316][A
+Train step of epoch 0:  43%|████▎     | 2740/6434 [6:25:29<9:49:43,  9.58s/it, gpt_loss=0.237, loss_mean=0.316][A
+Train step of epoch 0:  43%|████▎     | 2740/6434 [6:25:38<9:49:43,  9.58s/it, gpt_loss=0.261, loss_mean=0.31] [A
+Train step of epoch 0:  43%|████▎     | 2741/6434 [6:25:38<9:41:07,  9.44s/it, gpt_loss=0.261, loss_mean=0.31][A
+Train step of epoch 0:  43%|████▎     | 2741/6434 [6:25:47<9:41:07,  9.44s/it, gpt_loss=0.281, loss_mean=0.307][A
+Train step of epoch 0:  43%|████▎     | 2742/6434 [6:25:47<9:42:39,  9.47s/it, gpt_loss=0.281, loss_mean=0.307][A
+Train step of epoch 0:  43%|████▎     | 2742/6434 [6:25:56<9:42:39,  9.47s/it, gpt_loss=0.393, loss_mean=0.316][A
+Train step of epoch 0:  43%|████▎     | 2743/6434 [6:25:56<9:32:12,  9.30s/it, gpt_loss=0.393, loss_mean=0.316][A
+Train step of epoch 0:  43%|████▎     | 2743/6434 [6:26:05<9:32:12,  9.30s/it, gpt_loss=0.348, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2744/6434 [6:26:05<9:15:49,  9.04s/it, gpt_loss=0.348, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2744/6434 [6:26:12<9:15:49,  9.04s/it, gpt_loss=0.406, loss_mean=0.328][A
+Train step of epoch 0:  43%|████▎     | 2745/6434 [6:26:12<8:45:36,  8.55s/it, gpt_loss=0.406, loss_mean=0.328][A
+Train step of epoch 0:  43%|████▎     | 2745/6434 [6:26:19<8:45:36,  8.55s/it, gpt_loss=0.32, loss_mean=0.327] [A
+Train step of epoch 0:  43%|████▎     | 2746/6434 [6:26:19<8:18:30,  8.11s/it, gpt_loss=0.32, loss_mean=0.327][A
+Train step of epoch 0:  43%|████▎     | 2746/6434 [6:26:27<8:18:30,  8.11s/it, gpt_loss=0.368, loss_mean=0.331][A
+Train step of epoch 0:  43%|████▎     | 2747/6434 [6:26:27<8:22:05,  8.17s/it, gpt_loss=0.368, loss_mean=0.331][A
+Train step of epoch 0:  43%|████▎     | 2747/6434 [6:26:36<8:22:05,  8.17s/it, gpt_loss=0.35, loss_mean=0.333] [A
+Train step of epoch 0:  43%|████▎     | 2748/6434 [6:26:36<8:25:35,  8.23s/it, gpt_loss=0.35, loss_mean=0.333][A
+Train step of epoch 0:  43%|████▎     | 2748/6434 [6:26:44<8:25:35,  8.23s/it, gpt_loss=0.258, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2749/6434 [6:26:44<8:23:32,  8.20s/it, gpt_loss=0.258, loss_mean=0.325][A
+[LID Router Debug] Step: 2750
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [2, 9, 2, 5, 6, 1, 1, 5, 5, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  43%|████▎     | 2749/6434 [6:26:53<8:23:32,  8.20s/it, gpt_loss=0.319, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2750/6434 [6:26:53<8:35:36,  8.40s/it, gpt_loss=0.319, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2750/6434 [6:27:01<8:35:36,  8.40s/it, gpt_loss=0.243, loss_mean=0.317][A
+Train step of epoch 0:  43%|████▎     | 2751/6434 [6:27:01<8:32:02,  8.34s/it, gpt_loss=0.243, loss_mean=0.317][A
+Train step of epoch 0:  43%|████▎     | 2751/6434 [6:27:10<8:32:02,  8.34s/it, gpt_loss=0.276, loss_mean=0.312][A
+Train step of epoch 0:  43%|████▎     | 2752/6434 [6:27:10<8:47:13,  8.59s/it, gpt_loss=0.276, loss_mean=0.312][A
+Train step of epoch 0:  43%|████▎     | 2752/6434 [6:27:18<8:47:13,  8.59s/it, gpt_loss=0.395, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2753/6434 [6:27:18<8:39:59,  8.48s/it, gpt_loss=0.395, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2753/6434 [6:27:27<8:39:59,  8.48s/it, gpt_loss=0.374, loss_mean=0.326][A
+Train step of epoch 0:  43%|████▎     | 2754/6434 [6:27:27<8:43:49,  8.54s/it, gpt_loss=0.374, loss_mean=0.326][A
+Train step of epoch 0:  43%|████▎     | 2754/6434 [6:27:35<8:43:49,  8.54s/it, gpt_loss=0.29, loss_mean=0.322] [A
+Train step of epoch 0:  43%|████▎     | 2755/6434 [6:27:35<8:40:28,  8.49s/it, gpt_loss=0.29, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2755/6434 [6:27:43<8:40:28,  8.49s/it, gpt_loss=0.284, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2756/6434 [6:27:43<8:27:19,  8.28s/it, gpt_loss=0.284, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2756/6434 [6:27:52<8:27:19,  8.28s/it, gpt_loss=0.261, loss_mean=0.313][A
+Train step of epoch 0:  43%|████▎     | 2757/6434 [6:27:52<8:30:09,  8.32s/it, gpt_loss=0.261, loss_mean=0.313][A
+Train step of epoch 0:  43%|████▎     | 2757/6434 [6:28:00<8:30:09,  8.32s/it, gpt_loss=0.324, loss_mean=0.314][A
+Train step of epoch 0:  43%|████▎     | 2758/6434 [6:28:00<8:31:02,  8.34s/it, gpt_loss=0.324, loss_mean=0.314][A
+Train step of epoch 0:  43%|████▎     | 2758/6434 [6:28:08<8:31:02,  8.34s/it, gpt_loss=0.229, loss_mean=0.305][A
+Train step of epoch 0:  43%|████▎     | 2759/6434 [6:28:08<8:21:44,  8.19s/it, gpt_loss=0.229, loss_mean=0.305][A
+[LID Router Debug] Step: 2760
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [1, 2, 0, 2, 3, 3, 1, 2, 9, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  43%|████▎     | 2759/6434 [6:28:17<8:21:44,  8.19s/it, gpt_loss=0.253, loss_mean=0.3]  [A
+Train step of epoch 0:  43%|████▎     | 2760/6434 [6:28:17<8:38:00,  8.46s/it, gpt_loss=0.253, loss_mean=0.3][A
+Train step of epoch 0:  43%|████▎     | 2760/6434 [6:28:27<8:38:00,  8.46s/it, gpt_loss=0.25, loss_mean=0.295][A
+Train step of epoch 0:  43%|████▎     | 2761/6434 [6:28:27<8:59:13,  8.81s/it, gpt_loss=0.25, loss_mean=0.295][A
+Train step of epoch 0:  43%|████▎     | 2761/6434 [6:28:35<8:59:13,  8.81s/it, gpt_loss=0.214, loss_mean=0.287][A
+Train step of epoch 0:  43%|████▎     | 2762/6434 [6:28:35<8:42:42,  8.54s/it, gpt_loss=0.214, loss_mean=0.287][A
+Train step of epoch 0:  43%|████▎     | 2762/6434 [6:28:43<8:42:42,  8.54s/it, gpt_loss=0.291, loss_mean=0.288][A
+Train step of epoch 0:  43%|████▎     | 2763/6434 [6:28:43<8:48:48,  8.64s/it, gpt_loss=0.291, loss_mean=0.288][A
+Train step of epoch 0:  43%|████▎     | 2763/6434 [6:28:51<8:48:48,  8.64s/it, gpt_loss=0.336, loss_mean=0.292][A
+Train step of epoch 0:  43%|████▎     | 2764/6434 [6:28:51<8:29:30,  8.33s/it, gpt_loss=0.336, loss_mean=0.292][A
+Train step of epoch 0:  43%|████▎     | 2764/6434 [6:28:59<8:29:30,  8.33s/it, gpt_loss=0.363, loss_mean=0.299][A
+Train step of epoch 0:  43%|████▎     | 2765/6434 [6:28:59<8:21:11,  8.20s/it, gpt_loss=0.363, loss_mean=0.299][A
+Train step of epoch 0:  43%|████▎     | 2765/6434 [6:29:09<8:21:11,  8.20s/it, gpt_loss=0.271, loss_mean=0.297][A
+Train step of epoch 0:  43%|████▎     | 2766/6434 [6:29:09<9:03:05,  8.88s/it, gpt_loss=0.271, loss_mean=0.297][A
+Train step of epoch 0:  43%|████▎     | 2766/6434 [6:29:18<9:03:05,  8.88s/it, gpt_loss=0.297, loss_mean=0.297][A
+Train step of epoch 0:  43%|████▎     | 2767/6434 [6:29:18<9:03:17,  8.89s/it, gpt_loss=0.297, loss_mean=0.297][A
+Train step of epoch 0:  43%|████▎     | 2767/6434 [6:29:26<9:03:17,  8.89s/it, gpt_loss=0.39, loss_mean=0.306] [A
+Train step of epoch 0:  43%|████▎     | 2768/6434 [6:29:26<8:45:48,  8.61s/it, gpt_loss=0.39, loss_mean=0.306][A
+Train step of epoch 0:  43%|████▎     | 2768/6434 [6:29:34<8:45:48,  8.61s/it, gpt_loss=0.249, loss_mean=0.3] [A
+Train step of epoch 0:  43%|████▎     | 2769/6434 [6:29:34<8:31:53,  8.38s/it, gpt_loss=0.249, loss_mean=0.3][A
+[LID Router Debug] Step: 2770
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [3, 0, 5, 1, 2, 5, 6, 3, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+Train step of epoch 0:  43%|████▎     | 2769/6434 [6:29:42<8:31:53,  8.38s/it, gpt_loss=0.367, loss_mean=0.307][A
+Train step of epoch 0:  43%|████▎     | 2770/6434 [6:29:42<8:21:06,  8.21s/it, gpt_loss=0.367, loss_mean=0.307][A
+Train step of epoch 0:  43%|████▎     | 2770/6434 [6:29:51<8:21:06,  8.21s/it, gpt_loss=0.436, loss_mean=0.32] [A
+Train step of epoch 0:  43%|████▎     | 2771/6434 [6:29:51<8:43:02,  8.57s/it, gpt_loss=0.436, loss_mean=0.32][A
+Train step of epoch 0:  43%|████▎     | 2771/6434 [6:30:00<8:43:02,  8.57s/it, gpt_loss=0.335, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2772/6434 [6:30:00<8:45:30,  8.61s/it, gpt_loss=0.335, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2772/6434 [6:30:09<8:45:30,  8.61s/it, gpt_loss=0.241, loss_mean=0.313][A
+Train step of epoch 0:  43%|████▎     | 2773/6434 [6:30:09<8:59:40,  8.84s/it, gpt_loss=0.241, loss_mean=0.313][A
+Train step of epoch 0:  43%|████▎     | 2773/6434 [6:30:18<8:59:40,  8.84s/it, gpt_loss=0.373, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2774/6434 [6:30:18<8:47:02,  8.64s/it, gpt_loss=0.373, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2774/6434 [6:30:27<8:47:02,  8.64s/it, gpt_loss=0.335, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2775/6434 [6:30:27<8:57:24,  8.81s/it, gpt_loss=0.335, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2775/6434 [6:30:37<8:57:24,  8.81s/it, gpt_loss=0.391, loss_mean=0.328][A
+Train step of epoch 0:  43%|████▎     | 2776/6434 [6:30:37<9:16:45,  9.13s/it, gpt_loss=0.391, loss_mean=0.328][A
+Train step of epoch 0:  43%|████▎     | 2776/6434 [6:30:45<9:16:45,  9.13s/it, gpt_loss=0.35, loss_mean=0.33]  [A
+Train step of epoch 0:  43%|████▎     | 2777/6434 [6:30:45<9:08:50,  9.00s/it, gpt_loss=0.35, loss_mean=0.33][A
+Train step of epoch 0:  43%|████▎     | 2777/6434 [6:30:55<9:08:50,  9.00s/it, gpt_loss=0.316, loss_mean=0.329][A
+Train step of epoch 0:  43%|████▎     | 2778/6434 [6:30:55<9:22:11,  9.23s/it, gpt_loss=0.316, loss_mean=0.329][A
+Train step of epoch 0:  43%|████▎     | 2778/6434 [6:31:03<9:22:11,  9.23s/it, gpt_loss=0.376, loss_mean=0.333][A
+Train step of epoch 0:  43%|████▎     | 2779/6434 [6:31:03<8:59:28,  8.86s/it, gpt_loss=0.376, loss_mean=0.333][A
+[LID Router Debug] Step: 2780
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [3, 0, 2, 1, 9, 8, 1, 4, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 8, 9}
+
+Train step of epoch 0:  43%|████▎     | 2779/6434 [6:31:12<8:59:28,  8.86s/it, gpt_loss=0.287, loss_mean=0.329][A
+Train step of epoch 0:  43%|████▎     | 2780/6434 [6:31:12<9:04:39,  8.94s/it, gpt_loss=0.287, loss_mean=0.329][A
+Train step of epoch 0:  43%|████▎     | 2780/6434 [6:31:20<9:04:39,  8.94s/it, gpt_loss=0.307, loss_mean=0.327][A
+Train step of epoch 0:  43%|████▎     | 2781/6434 [6:31:20<8:35:43,  8.47s/it, gpt_loss=0.307, loss_mean=0.327][A
+Train step of epoch 0:  43%|████▎     | 2781/6434 [6:31:27<8:35:43,  8.47s/it, gpt_loss=0.377, loss_mean=0.332][A
+Train step of epoch 0:  43%|████▎     | 2782/6434 [6:31:27<8:16:37,  8.16s/it, gpt_loss=0.377, loss_mean=0.332][A
+Train step of epoch 0:  43%|████▎     | 2782/6434 [6:31:35<8:16:37,  8.16s/it, gpt_loss=0.347, loss_mean=0.333][A
+Train step of epoch 0:  43%|████▎     | 2783/6434 [6:31:35<8:15:01,  8.14s/it, gpt_loss=0.347, loss_mean=0.333][A
+Train step of epoch 0:  43%|████▎     | 2783/6434 [6:31:45<8:15:01,  8.14s/it, gpt_loss=0.228, loss_mean=0.323][A
+Train step of epoch 0:  43%|████▎     | 2784/6434 [6:31:45<8:39:08,  8.53s/it, gpt_loss=0.228, loss_mean=0.323][A
+Train step of epoch 0:  43%|████▎     | 2784/6434 [6:31:53<8:39:08,  8.53s/it, gpt_loss=0.337, loss_mean=0.324][A
+Train step of epoch 0:  43%|████▎     | 2785/6434 [6:31:53<8:31:56,  8.42s/it, gpt_loss=0.337, loss_mean=0.324][A
+Train step of epoch 0:  43%|████▎     | 2785/6434 [6:32:02<8:31:56,  8.42s/it, gpt_loss=0.293, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2786/6434 [6:32:02<8:42:29,  8.59s/it, gpt_loss=0.293, loss_mean=0.321][A
+Train step of epoch 0:  43%|████▎     | 2786/6434 [6:32:10<8:42:29,  8.59s/it, gpt_loss=0.325, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2787/6434 [6:32:10<8:29:18,  8.38s/it, gpt_loss=0.325, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2787/6434 [6:32:19<8:29:18,  8.38s/it, gpt_loss=0.308, loss_mean=0.32] [A
+Train step of epoch 0:  43%|████▎     | 2788/6434 [6:32:19<8:47:00,  8.67s/it, gpt_loss=0.308, loss_mean=0.32][A
+Train step of epoch 0:  43%|████▎     | 2788/6434 [6:32:29<8:47:00,  8.67s/it, gpt_loss=0.268, loss_mean=0.315][A
+Train step of epoch 0:  43%|████▎     | 2789/6434 [6:32:29<9:04:00,  8.95s/it, gpt_loss=0.268, loss_mean=0.315][A
+[LID Router Debug] Step: 2790
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [5, 9, 5, 5, 1, 6, 2, 0, 9, 10]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9, 10}
+
+Train step of epoch 0:  43%|████▎     | 2789/6434 [6:32:37<9:04:00,  8.95s/it, gpt_loss=0.354, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2790/6434 [6:32:37<8:50:31,  8.74s/it, gpt_loss=0.354, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2790/6434 [6:32:45<8:50:31,  8.74s/it, gpt_loss=0.272, loss_mean=0.314][A
+Train step of epoch 0:  43%|████▎     | 2791/6434 [6:32:45<8:35:37,  8.49s/it, gpt_loss=0.272, loss_mean=0.314][A
+Train step of epoch 0:  43%|████▎     | 2791/6434 [6:32:55<8:35:37,  8.49s/it, gpt_loss=0.362, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2792/6434 [6:32:55<8:59:13,  8.88s/it, gpt_loss=0.362, loss_mean=0.319][A
+Train step of epoch 0:  43%|████▎     | 2792/6434 [6:33:03<8:59:13,  8.88s/it, gpt_loss=0.383, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2793/6434 [6:33:03<8:52:19,  8.77s/it, gpt_loss=0.383, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2793/6434 [6:33:11<8:52:19,  8.77s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2794/6434 [6:33:11<8:32:33,  8.45s/it, gpt_loss=0.296, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2794/6434 [6:33:18<8:32:33,  8.45s/it, gpt_loss=0.351, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2795/6434 [6:33:18<8:18:55,  8.23s/it, gpt_loss=0.351, loss_mean=0.325][A
+Train step of epoch 0:  43%|████▎     | 2795/6434 [6:33:27<8:18:55,  8.23s/it, gpt_loss=0.294, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2796/6434 [6:33:27<8:22:18,  8.28s/it, gpt_loss=0.294, loss_mean=0.322][A
+Train step of epoch 0:  43%|████▎     | 2796/6434 [6:33:35<8:22:18,  8.28s/it, gpt_loss=0.297, loss_mean=0.32] [A
+Train step of epoch 0:  43%|████▎     | 2797/6434 [6:33:35<8:27:59,  8.38s/it, gpt_loss=0.297, loss_mean=0.32][A
+Train step of epoch 0:  43%|████▎     | 2797/6434 [6:33:43<8:27:59,  8.38s/it, gpt_loss=0.29, loss_mean=0.317][A
+Train step of epoch 0:  43%|████▎     | 2798/6434 [6:33:43<8:19:51,  8.25s/it, gpt_loss=0.29, loss_mean=0.317][A
+Train step of epoch 0:  43%|████▎     | 2798/6434 [6:33:52<8:19:51,  8.25s/it, gpt_loss=0.26, loss_mean=0.311][A
+Train step of epoch 0:  44%|████▎     | 2799/6434 [6:33:52<8:21:23,  8.28s/it, gpt_loss=0.26, loss_mean=0.311][A
+[LID Router Debug] Step: 2800
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 9, 2, 9, 0, 9, 1, 0, 3, 1]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+[2026-02-06 22:30:04,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=0, lr=[1.9096071575868484e-05, 1.9096071575868484e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 22:30:04,659] [INFO] [timer.py:260:stop] epoch=0/micro_step=2800/global_step=1400, RunningAvgSamplesPerSec=4.7487973596655, CurrSamplesPerSec=4.768216829232621, MemAllocated=12.78GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  44%|████▎     | 2799/6434 [6:34:00<8:21:23,  8.28s/it, gpt_loss=0.343, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2800/6434 [6:34:00<8:24:45,  8.33s/it, gpt_loss=0.343, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2800/6434 [6:34:08<8:24:45,  8.33s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2801/6434 [6:34:08<8:17:41,  8.22s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2801/6434 [6:34:17<8:17:41,  8.22s/it, gpt_loss=0.37, loss_mean=0.32]  [A
+Train step of epoch 0:  44%|████▎     | 2802/6434 [6:34:17<8:29:18,  8.41s/it, gpt_loss=0.37, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▎     | 2802/6434 [6:34:26<8:29:18,  8.41s/it, gpt_loss=0.237, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▎     | 2803/6434 [6:34:26<8:39:45,  8.59s/it, gpt_loss=0.237, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▎     | 2803/6434 [6:34:36<8:39:45,  8.59s/it, gpt_loss=0.315, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▎     | 2804/6434 [6:34:36<9:10:01,  9.09s/it, gpt_loss=0.315, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▎     | 2804/6434 [6:34:44<9:10:01,  9.09s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  44%|████▎     | 2805/6434 [6:34:44<8:38:03,  8.57s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  44%|████▎     | 2805/6434 [6:34:52<8:38:03,  8.57s/it, gpt_loss=0.325, loss_mean=0.318][A
+Train step of epoch 0:  44%|████▎     | 2806/6434 [6:34:52<8:33:01,  8.48s/it, gpt_loss=0.325, loss_mean=0.318][A
+Train step of epoch 0:  44%|████▎     | 2806/6434 [6:35:01<8:33:01,  8.48s/it, gpt_loss=0.31, loss_mean=0.317] [A
+Train step of epoch 0:  44%|████▎     | 2807/6434 [6:35:01<8:50:10,  8.77s/it, gpt_loss=0.31, loss_mean=0.317][A
+Train step of epoch 0:  44%|████▎     | 2807/6434 [6:35:11<8:50:10,  8.77s/it, gpt_loss=0.293, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▎     | 2808/6434 [6:35:11<8:59:50,  8.93s/it, gpt_loss=0.293, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▎     | 2808/6434 [6:35:18<8:59:50,  8.93s/it, gpt_loss=0.34, loss_mean=0.317] [A
+Train step of epoch 0:  44%|████▎     | 2809/6434 [6:35:18<8:38:30,  8.58s/it, gpt_loss=0.34, loss_mean=0.317][A
+[LID Router Debug] Step: 2810
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [4, 2, 5, 0, 1, 9, 3, 1, 4, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  44%|████▎     | 2809/6434 [6:35:27<8:38:30,  8.58s/it, gpt_loss=0.289, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2810/6434 [6:35:27<8:45:45,  8.70s/it, gpt_loss=0.289, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▎     | 2810/6434 [6:35:36<8:45:45,  8.70s/it, gpt_loss=0.335, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▎     | 2811/6434 [6:35:36<8:40:48,  8.62s/it, gpt_loss=0.335, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▎     | 2811/6434 [6:35:45<8:40:48,  8.62s/it, gpt_loss=0.298, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▎     | 2812/6434 [6:35:45<8:53:35,  8.84s/it, gpt_loss=0.298, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▎     | 2812/6434 [6:35:54<8:53:35,  8.84s/it, gpt_loss=0.404, loss_mean=0.324][A
+Train step of epoch 0:  44%|████▎     | 2813/6434 [6:35:54<9:00:07,  8.95s/it, gpt_loss=0.404, loss_mean=0.324][A
+Train step of epoch 0:  44%|████▎     | 2813/6434 [6:36:01<9:00:07,  8.95s/it, gpt_loss=0.345, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▎     | 2814/6434 [6:36:01<8:26:21,  8.39s/it, gpt_loss=0.345, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▎     | 2814/6434 [6:36:10<8:26:21,  8.39s/it, gpt_loss=0.34, loss_mean=0.327] [A
+Train step of epoch 0:  44%|████▍     | 2815/6434 [6:36:10<8:29:32,  8.45s/it, gpt_loss=0.34, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2815/6434 [6:36:19<8:29:32,  8.45s/it, gpt_loss=0.317, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▍     | 2816/6434 [6:36:19<8:45:20,  8.71s/it, gpt_loss=0.317, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▍     | 2816/6434 [6:36:28<8:45:20,  8.71s/it, gpt_loss=0.446, loss_mean=0.338][A
+Train step of epoch 0:  44%|████▍     | 2817/6434 [6:36:28<8:42:55,  8.67s/it, gpt_loss=0.446, loss_mean=0.338][A
+Train step of epoch 0:  44%|████▍     | 2817/6434 [6:36:36<8:42:55,  8.67s/it, gpt_loss=0.273, loss_mean=0.332][A
+Train step of epoch 0:  44%|████▍     | 2818/6434 [6:36:36<8:31:11,  8.48s/it, gpt_loss=0.273, loss_mean=0.332][A
+Train step of epoch 0:  44%|████▍     | 2818/6434 [6:36:44<8:31:11,  8.48s/it, gpt_loss=0.327, loss_mean=0.331][A
+Train step of epoch 0:  44%|████▍     | 2819/6434 [6:36:44<8:28:23,  8.44s/it, gpt_loss=0.327, loss_mean=0.331][A
+[LID Router Debug] Step: 2820
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [3, 1, 9, 9, 5, 6, 5, 2, 3, 1]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  44%|████▍     | 2819/6434 [6:36:53<8:28:23,  8.44s/it, gpt_loss=0.289, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2820/6434 [6:36:53<8:39:06,  8.62s/it, gpt_loss=0.289, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2820/6434 [6:37:02<8:39:06,  8.62s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  44%|████▍     | 2821/6434 [6:37:02<8:33:03,  8.52s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  44%|████▍     | 2821/6434 [6:37:11<8:33:03,  8.52s/it, gpt_loss=0.314, loss_mean=0.322][A
+Train step of epoch 0:  44%|████▍     | 2822/6434 [6:37:11<8:45:14,  8.72s/it, gpt_loss=0.314, loss_mean=0.322][A
+Train step of epoch 0:  44%|████▍     | 2822/6434 [6:37:18<8:45:14,  8.72s/it, gpt_loss=0.305, loss_mean=0.321][A
+Train step of epoch 0:  44%|████▍     | 2823/6434 [6:37:18<8:21:36,  8.33s/it, gpt_loss=0.305, loss_mean=0.321][A
+Train step of epoch 0:  44%|████▍     | 2823/6434 [6:37:27<8:21:36,  8.33s/it, gpt_loss=0.372, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▍     | 2824/6434 [6:37:27<8:23:53,  8.37s/it, gpt_loss=0.372, loss_mean=0.326][A
+Train step of epoch 0:  44%|████▍     | 2824/6434 [6:37:36<8:23:53,  8.37s/it, gpt_loss=0.259, loss_mean=0.319][A
+Train step of epoch 0:  44%|████▍     | 2825/6434 [6:37:36<8:31:34,  8.50s/it, gpt_loss=0.259, loss_mean=0.319][A
+Train step of epoch 0:  44%|████▍     | 2825/6434 [6:37:44<8:31:34,  8.50s/it, gpt_loss=0.323, loss_mean=0.32] [A
+Train step of epoch 0:  44%|████▍     | 2826/6434 [6:37:44<8:28:43,  8.46s/it, gpt_loss=0.323, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▍     | 2826/6434 [6:37:52<8:28:43,  8.46s/it, gpt_loss=0.37, loss_mean=0.325][A
+Train step of epoch 0:  44%|████▍     | 2827/6434 [6:37:52<8:23:13,  8.37s/it, gpt_loss=0.37, loss_mean=0.325][A
+Train step of epoch 0:  44%|████▍     | 2827/6434 [6:38:02<8:23:13,  8.37s/it, gpt_loss=0.314, loss_mean=0.323][A
+Train step of epoch 0:  44%|████▍     | 2828/6434 [6:38:02<8:49:10,  8.81s/it, gpt_loss=0.314, loss_mean=0.323][A
+Train step of epoch 0:  44%|████▍     | 2828/6434 [6:38:10<8:49:10,  8.81s/it, gpt_loss=0.235, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2829/6434 [6:38:10<8:38:56,  8.64s/it, gpt_loss=0.235, loss_mean=0.315][A
+[LID Router Debug] Step: 2830
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [5, 4, 5, 3, 2, 1, 10, 5, 4, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 10}
+
+Train step of epoch 0:  44%|████▍     | 2829/6434 [6:38:18<8:38:56,  8.64s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2830/6434 [6:38:18<8:23:46,  8.39s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2830/6434 [6:38:25<8:23:46,  8.39s/it, gpt_loss=0.284, loss_mean=0.309][A
+Train step of epoch 0:  44%|████▍     | 2831/6434 [6:38:25<8:00:43,  8.01s/it, gpt_loss=0.284, loss_mean=0.309][A
+Train step of epoch 0:  44%|████▍     | 2831/6434 [6:38:33<8:00:43,  8.01s/it, gpt_loss=0.397, loss_mean=0.318][A
+Train step of epoch 0:  44%|████▍     | 2832/6434 [6:38:33<7:59:36,  7.99s/it, gpt_loss=0.397, loss_mean=0.318][A
+Train step of epoch 0:  44%|████▍     | 2832/6434 [6:38:41<7:59:36,  7.99s/it, gpt_loss=0.299, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2833/6434 [6:38:41<7:52:41,  7.88s/it, gpt_loss=0.299, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2833/6434 [6:38:48<7:52:41,  7.88s/it, gpt_loss=0.227, loss_mean=0.307][A
+Train step of epoch 0:  44%|████▍     | 2834/6434 [6:38:48<7:46:34,  7.78s/it, gpt_loss=0.227, loss_mean=0.307][A
+Train step of epoch 0:  44%|████▍     | 2834/6434 [6:38:57<7:46:34,  7.78s/it, gpt_loss=0.265, loss_mean=0.303][A
+Train step of epoch 0:  44%|████▍     | 2835/6434 [6:38:57<8:00:38,  8.01s/it, gpt_loss=0.265, loss_mean=0.303][A
+Train step of epoch 0:  44%|████▍     | 2835/6434 [6:39:05<8:00:38,  8.01s/it, gpt_loss=0.323, loss_mean=0.305][A
+Train step of epoch 0:  44%|████▍     | 2836/6434 [6:39:05<8:06:33,  8.11s/it, gpt_loss=0.323, loss_mean=0.305][A
+Train step of epoch 0:  44%|████▍     | 2836/6434 [6:39:14<8:06:33,  8.11s/it, gpt_loss=0.32, loss_mean=0.306] [A
+Train step of epoch 0:  44%|████▍     | 2837/6434 [6:39:14<8:15:13,  8.26s/it, gpt_loss=0.32, loss_mean=0.306][A
+Train step of epoch 0:  44%|████▍     | 2837/6434 [6:39:23<8:15:13,  8.26s/it, gpt_loss=0.299, loss_mean=0.306][A
+Train step of epoch 0:  44%|████▍     | 2838/6434 [6:39:23<8:26:08,  8.45s/it, gpt_loss=0.299, loss_mean=0.306][A
+Train step of epoch 0:  44%|████▍     | 2838/6434 [6:39:31<8:26:08,  8.45s/it, gpt_loss=0.398, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2839/6434 [6:39:31<8:17:34,  8.30s/it, gpt_loss=0.398, loss_mean=0.315][A
+[LID Router Debug] Step: 2840
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [3, 0, 2, 5, 4, 1, 9, 9, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  44%|████▍     | 2839/6434 [6:39:38<8:17:34,  8.30s/it, gpt_loss=0.328, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2840/6434 [6:39:38<8:07:25,  8.14s/it, gpt_loss=0.328, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2840/6434 [6:39:47<8:07:25,  8.14s/it, gpt_loss=0.295, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2841/6434 [6:39:47<8:18:12,  8.32s/it, gpt_loss=0.295, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2841/6434 [6:39:55<8:18:12,  8.32s/it, gpt_loss=0.378, loss_mean=0.32] [A
+Train step of epoch 0:  44%|████▍     | 2842/6434 [6:39:55<8:18:10,  8.32s/it, gpt_loss=0.378, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▍     | 2842/6434 [6:40:04<8:18:10,  8.32s/it, gpt_loss=0.317, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▍     | 2843/6434 [6:40:04<8:32:18,  8.56s/it, gpt_loss=0.317, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▍     | 2843/6434 [6:40:12<8:32:18,  8.56s/it, gpt_loss=0.238, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2844/6434 [6:40:12<8:17:42,  8.32s/it, gpt_loss=0.238, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2844/6434 [6:40:21<8:17:42,  8.32s/it, gpt_loss=0.336, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2845/6434 [6:40:21<8:27:17,  8.48s/it, gpt_loss=0.336, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2845/6434 [6:40:30<8:27:17,  8.48s/it, gpt_loss=0.268, loss_mean=0.31] [A
+Train step of epoch 0:  44%|████▍     | 2846/6434 [6:40:30<8:36:31,  8.64s/it, gpt_loss=0.268, loss_mean=0.31][A
+Train step of epoch 0:  44%|████▍     | 2846/6434 [6:40:40<8:36:31,  8.64s/it, gpt_loss=0.292, loss_mean=0.308][A
+Train step of epoch 0:  44%|████▍     | 2847/6434 [6:40:40<8:54:16,  8.94s/it, gpt_loss=0.292, loss_mean=0.308][A
+Train step of epoch 0:  44%|████▍     | 2847/6434 [6:40:47<8:54:16,  8.94s/it, gpt_loss=0.373, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2848/6434 [6:40:47<8:15:58,  8.30s/it, gpt_loss=0.373, loss_mean=0.314][A
+Train step of epoch 0:  44%|████▍     | 2848/6434 [6:40:55<8:15:58,  8.30s/it, gpt_loss=0.284, loss_mean=0.311][A
+Train step of epoch 0:  44%|████▍     | 2849/6434 [6:40:55<8:24:21,  8.44s/it, gpt_loss=0.284, loss_mean=0.311][A
+[LID Router Debug] Step: 2850
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [1, 2, 0, 3, 4, 3, 9, 9, 4, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  44%|████▍     | 2849/6434 [6:41:04<8:24:21,  8.44s/it, gpt_loss=0.297, loss_mean=0.31] [A
+Train step of epoch 0:  44%|████▍     | 2850/6434 [6:41:04<8:34:15,  8.61s/it, gpt_loss=0.297, loss_mean=0.31][A
+Train step of epoch 0:  44%|████▍     | 2850/6434 [6:41:12<8:34:15,  8.61s/it, gpt_loss=0.343, loss_mean=0.313][A
+Train step of epoch 0:  44%|████▍     | 2851/6434 [6:41:12<8:09:48,  8.20s/it, gpt_loss=0.343, loss_mean=0.313][A
+Train step of epoch 0:  44%|████▍     | 2851/6434 [6:41:20<8:09:48,  8.20s/it, gpt_loss=0.27, loss_mean=0.309] [A
+Train step of epoch 0:  44%|████▍     | 2852/6434 [6:41:20<8:14:14,  8.28s/it, gpt_loss=0.27, loss_mean=0.309][A
+Train step of epoch 0:  44%|████▍     | 2852/6434 [6:41:28<8:14:14,  8.28s/it, gpt_loss=0.372, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2853/6434 [6:41:28<8:15:20,  8.30s/it, gpt_loss=0.372, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2853/6434 [6:41:38<8:15:20,  8.30s/it, gpt_loss=0.318, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2854/6434 [6:41:38<8:30:46,  8.56s/it, gpt_loss=0.318, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2854/6434 [6:41:46<8:30:46,  8.56s/it, gpt_loss=0.272, loss_mean=0.311][A
+Train step of epoch 0:  44%|████▍     | 2855/6434 [6:41:46<8:25:30,  8.47s/it, gpt_loss=0.272, loss_mean=0.311][A
+Train step of epoch 0:  44%|████▍     | 2855/6434 [6:41:54<8:25:30,  8.47s/it, gpt_loss=0.353, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2856/6434 [6:41:54<8:25:25,  8.48s/it, gpt_loss=0.353, loss_mean=0.315][A
+Train step of epoch 0:  44%|████▍     | 2856/6434 [6:42:03<8:25:25,  8.48s/it, gpt_loss=0.428, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2857/6434 [6:42:03<8:21:07,  8.41s/it, gpt_loss=0.428, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2857/6434 [6:42:10<8:21:07,  8.41s/it, gpt_loss=0.33, loss_mean=0.327] [A
+Train step of epoch 0:  44%|████▍     | 2858/6434 [6:42:10<8:02:47,  8.10s/it, gpt_loss=0.33, loss_mean=0.327][A
+Train step of epoch 0:  44%|████▍     | 2858/6434 [6:42:18<8:02:47,  8.10s/it, gpt_loss=0.26, loss_mean=0.32] [A
+Train step of epoch 0:  44%|████▍     | 2859/6434 [6:42:18<8:05:57,  8.16s/it, gpt_loss=0.26, loss_mean=0.32][A
+[LID Router Debug] Step: 2860
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [6, 3, 6, 9, 0, 5, 2, 4, 5, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  44%|████▍     | 2859/6434 [6:42:26<8:05:57,  8.16s/it, gpt_loss=0.281, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2860/6434 [6:42:26<7:58:26,  8.03s/it, gpt_loss=0.281, loss_mean=0.316][A
+Train step of epoch 0:  44%|████▍     | 2860/6434 [6:42:35<7:58:26,  8.03s/it, gpt_loss=0.276, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2861/6434 [6:42:35<8:08:49,  8.21s/it, gpt_loss=0.276, loss_mean=0.312][A
+Train step of epoch 0:  44%|████▍     | 2861/6434 [6:42:43<8:08:49,  8.21s/it, gpt_loss=0.379, loss_mean=0.319][A
+Train step of epoch 0:  44%|████▍     | 2862/6434 [6:42:43<8:19:53,  8.40s/it, gpt_loss=0.379, loss_mean=0.319][A
+Train step of epoch 0:  44%|████▍     | 2862/6434 [6:42:52<8:19:53,  8.40s/it, gpt_loss=0.329, loss_mean=0.32] [A
+Train step of epoch 0:  44%|████▍     | 2863/6434 [6:42:52<8:28:39,  8.55s/it, gpt_loss=0.329, loss_mean=0.32][A
+Train step of epoch 0:  44%|████▍     | 2863/6434 [6:43:00<8:28:39,  8.55s/it, gpt_loss=0.31, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▍     | 2864/6434 [6:43:00<8:19:06,  8.39s/it, gpt_loss=0.31, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▍     | 2864/6434 [6:43:11<8:19:06,  8.39s/it, gpt_loss=0.322, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▍     | 2865/6434 [6:43:11<9:00:33,  9.09s/it, gpt_loss=0.322, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▍     | 2865/6434 [6:43:19<9:00:33,  9.09s/it, gpt_loss=0.399, loss_mean=0.327][A
+Train step of epoch 0:  45%|████▍     | 2866/6434 [6:43:19<8:41:40,  8.77s/it, gpt_loss=0.399, loss_mean=0.327][A
+Train step of epoch 0:  45%|████▍     | 2866/6434 [6:43:27<8:41:40,  8.77s/it, gpt_loss=0.373, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2867/6434 [6:43:27<8:33:14,  8.63s/it, gpt_loss=0.373, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2867/6434 [6:43:36<8:33:14,  8.63s/it, gpt_loss=0.308, loss_mean=0.329][A
+Train step of epoch 0:  45%|████▍     | 2868/6434 [6:43:36<8:32:10,  8.62s/it, gpt_loss=0.308, loss_mean=0.329][A
+Train step of epoch 0:  45%|████▍     | 2868/6434 [6:43:45<8:32:10,  8.62s/it, gpt_loss=0.344, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▍     | 2869/6434 [6:43:45<8:42:26,  8.79s/it, gpt_loss=0.344, loss_mean=0.331][A
+[LID Router Debug] Step: 2870
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 5, 0, 2, 5, 1, 1, 1, 0, 9]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+Train step of epoch 0:  45%|████▍     | 2869/6434 [6:43:54<8:42:26,  8.79s/it, gpt_loss=0.361, loss_mean=0.334][A
+Train step of epoch 0:  45%|████▍     | 2870/6434 [6:43:54<8:44:07,  8.82s/it, gpt_loss=0.361, loss_mean=0.334][A
+Train step of epoch 0:  45%|████▍     | 2870/6434 [6:44:01<8:44:07,  8.82s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2871/6434 [6:44:01<8:15:32,  8.34s/it, gpt_loss=0.317, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2871/6434 [6:44:09<8:15:32,  8.34s/it, gpt_loss=0.323, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▍     | 2872/6434 [6:44:09<8:03:11,  8.14s/it, gpt_loss=0.323, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▍     | 2872/6434 [6:44:17<8:03:11,  8.14s/it, gpt_loss=0.348, loss_mean=0.333][A
+Train step of epoch 0:  45%|████▍     | 2873/6434 [6:44:17<8:03:01,  8.14s/it, gpt_loss=0.348, loss_mean=0.333][A
+Train step of epoch 0:  45%|████▍     | 2873/6434 [6:44:25<8:03:01,  8.14s/it, gpt_loss=0.25, loss_mean=0.325] [A
+Train step of epoch 0:  45%|████▍     | 2874/6434 [6:44:25<8:03:20,  8.15s/it, gpt_loss=0.25, loss_mean=0.325][A
+Train step of epoch 0:  45%|████▍     | 2874/6434 [6:44:34<8:03:20,  8.15s/it, gpt_loss=0.297, loss_mean=0.322][A
+Train step of epoch 0:  45%|████▍     | 2875/6434 [6:44:34<8:12:32,  8.30s/it, gpt_loss=0.297, loss_mean=0.322][A
+Train step of epoch 0:  45%|████▍     | 2875/6434 [6:44:42<8:12:32,  8.30s/it, gpt_loss=0.349, loss_mean=0.325][A
+Train step of epoch 0:  45%|████▍     | 2876/6434 [6:44:42<8:05:25,  8.19s/it, gpt_loss=0.349, loss_mean=0.325][A
+Train step of epoch 0:  45%|████▍     | 2876/6434 [6:44:50<8:05:25,  8.19s/it, gpt_loss=0.339, loss_mean=0.326][A
+Train step of epoch 0:  45%|████▍     | 2877/6434 [6:44:50<7:59:47,  8.09s/it, gpt_loss=0.339, loss_mean=0.326][A
+Train step of epoch 0:  45%|████▍     | 2877/6434 [6:44:58<7:59:47,  8.09s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  45%|████▍     | 2878/6434 [6:44:58<7:59:50,  8.10s/it, gpt_loss=0.292, loss_mean=0.323][A
+Train step of epoch 0:  45%|████▍     | 2878/6434 [6:45:07<7:59:50,  8.10s/it, gpt_loss=0.286, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▍     | 2879/6434 [6:45:07<8:20:33,  8.45s/it, gpt_loss=0.286, loss_mean=0.319][A
+[LID Router Debug] Step: 2880
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [2, 3, 4, 4, 0, 1, 1, 2, 6, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  45%|████▍     | 2879/6434 [6:45:16<8:20:33,  8.45s/it, gpt_loss=0.237, loss_mean=0.311][A
+Train step of epoch 0:  45%|████▍     | 2880/6434 [6:45:16<8:21:44,  8.47s/it, gpt_loss=0.237, loss_mean=0.311][A
+Train step of epoch 0:  45%|████▍     | 2880/6434 [6:45:25<8:21:44,  8.47s/it, gpt_loss=0.297, loss_mean=0.309][A
+Train step of epoch 0:  45%|████▍     | 2881/6434 [6:45:25<8:41:38,  8.81s/it, gpt_loss=0.297, loss_mean=0.309][A
+Train step of epoch 0:  45%|████▍     | 2881/6434 [6:45:34<8:41:38,  8.81s/it, gpt_loss=0.324, loss_mean=0.311][A
+Train step of epoch 0:  45%|████▍     | 2882/6434 [6:45:34<8:46:59,  8.90s/it, gpt_loss=0.324, loss_mean=0.311][A
+Train step of epoch 0:  45%|████▍     | 2882/6434 [6:45:41<8:46:59,  8.90s/it, gpt_loss=0.472, loss_mean=0.327][A
+Train step of epoch 0:  45%|████▍     | 2883/6434 [6:45:41<8:13:00,  8.33s/it, gpt_loss=0.472, loss_mean=0.327][A
+Train step of epoch 0:  45%|████▍     | 2883/6434 [6:45:50<8:13:00,  8.33s/it, gpt_loss=0.297, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2884/6434 [6:45:50<8:18:35,  8.43s/it, gpt_loss=0.297, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2884/6434 [6:45:58<8:18:35,  8.43s/it, gpt_loss=0.3, loss_mean=0.322]  [A
+Train step of epoch 0:  45%|████▍     | 2885/6434 [6:45:58<8:07:57,  8.25s/it, gpt_loss=0.3, loss_mean=0.322][A
+Train step of epoch 0:  45%|████▍     | 2885/6434 [6:46:07<8:07:57,  8.25s/it, gpt_loss=0.318, loss_mean=0.321][A
+Train step of epoch 0:  45%|████▍     | 2886/6434 [6:46:07<8:24:43,  8.54s/it, gpt_loss=0.318, loss_mean=0.321][A
+Train step of epoch 0:  45%|████▍     | 2886/6434 [6:46:16<8:24:43,  8.54s/it, gpt_loss=0.259, loss_mean=0.315][A
+Train step of epoch 0:  45%|████▍     | 2887/6434 [6:46:16<8:33:43,  8.69s/it, gpt_loss=0.259, loss_mean=0.315][A
+Train step of epoch 0:  45%|████▍     | 2887/6434 [6:46:25<8:33:43,  8.69s/it, gpt_loss=0.402, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2888/6434 [6:46:25<8:35:58,  8.73s/it, gpt_loss=0.402, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2888/6434 [6:46:33<8:35:58,  8.73s/it, gpt_loss=0.393, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▍     | 2889/6434 [6:46:33<8:29:46,  8.63s/it, gpt_loss=0.393, loss_mean=0.331][A
+[LID Router Debug] Step: 2890
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [3, 3, 1, 0, 3, 4, 1, 9, 3, 1]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:  45%|████▍     | 2889/6434 [6:46:42<8:29:46,  8.63s/it, gpt_loss=0.303, loss_mean=0.328][A
+Train step of epoch 0:  45%|████▍     | 2890/6434 [6:46:42<8:37:29,  8.76s/it, gpt_loss=0.303, loss_mean=0.328][A
+Train step of epoch 0:  45%|████▍     | 2890/6434 [6:46:51<8:37:29,  8.76s/it, gpt_loss=0.374, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2891/6434 [6:46:51<8:29:09,  8.62s/it, gpt_loss=0.374, loss_mean=0.332][A
+Train step of epoch 0:  45%|████▍     | 2891/6434 [6:46:58<8:29:09,  8.62s/it, gpt_loss=0.269, loss_mean=0.326][A
+Train step of epoch 0:  45%|████▍     | 2892/6434 [6:46:58<7:59:50,  8.13s/it, gpt_loss=0.269, loss_mean=0.326][A
+Train step of epoch 0:  45%|████▍     | 2892/6434 [6:47:06<7:59:50,  8.13s/it, gpt_loss=0.307, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2893/6434 [6:47:06<8:09:04,  8.29s/it, gpt_loss=0.307, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2893/6434 [6:47:14<8:09:04,  8.29s/it, gpt_loss=0.324, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2894/6434 [6:47:14<8:04:12,  8.21s/it, gpt_loss=0.324, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▍     | 2894/6434 [6:47:22<8:04:12,  8.21s/it, gpt_loss=0.296, loss_mean=0.321][A
+Train step of epoch 0:  45%|████▍     | 2895/6434 [6:47:22<8:00:12,  8.14s/it, gpt_loss=0.296, loss_mean=0.321][A
+Train step of epoch 0:  45%|████▍     | 2895/6434 [6:47:31<8:00:12,  8.14s/it, gpt_loss=0.37, loss_mean=0.326] [A
+Train step of epoch 0:  45%|████▌     | 2896/6434 [6:47:31<8:18:23,  8.45s/it, gpt_loss=0.37, loss_mean=0.326][A
+Train step of epoch 0:  45%|████▌     | 2896/6434 [6:47:39<8:18:23,  8.45s/it, gpt_loss=0.305, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▌     | 2897/6434 [6:47:39<7:58:03,  8.11s/it, gpt_loss=0.305, loss_mean=0.324][A
+Train step of epoch 0:  45%|████▌     | 2897/6434 [6:47:47<7:58:03,  8.11s/it, gpt_loss=0.315, loss_mean=0.323][A
+Train step of epoch 0:  45%|████▌     | 2898/6434 [6:47:47<7:59:26,  8.14s/it, gpt_loss=0.315, loss_mean=0.323][A
+Train step of epoch 0:  45%|████▌     | 2898/6434 [6:47:56<7:59:26,  8.14s/it, gpt_loss=0.264, loss_mean=0.317][A
+Train step of epoch 0:  45%|████▌     | 2899/6434 [6:47:56<8:23:00,  8.54s/it, gpt_loss=0.264, loss_mean=0.317][A
+[LID Router Debug] Step: 2900
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [6, 9, 3, 0, 2, 6, 4, 5, 9, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  45%|████▌     | 2899/6434 [6:48:04<8:23:00,  8.54s/it, gpt_loss=0.363, loss_mean=0.322][A
+Train step of epoch 0:  45%|████▌     | 2900/6434 [6:48:04<8:10:18,  8.32s/it, gpt_loss=0.363, loss_mean=0.322][A
+Train step of epoch 0:  45%|████▌     | 2900/6434 [6:48:13<8:10:18,  8.32s/it, gpt_loss=0.284, loss_mean=0.318][A
+Train step of epoch 0:  45%|████▌     | 2901/6434 [6:48:13<8:12:38,  8.37s/it, gpt_loss=0.284, loss_mean=0.318][A
+Train step of epoch 0:  45%|████▌     | 2901/6434 [6:48:20<8:12:38,  8.37s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  45%|████▌     | 2902/6434 [6:48:20<7:49:50,  7.98s/it, gpt_loss=0.318, loss_mean=0.318][A
+Train step of epoch 0:  45%|████▌     | 2902/6434 [6:48:29<7:49:50,  7.98s/it, gpt_loss=0.267, loss_mean=0.313][A
+Train step of epoch 0:  45%|████▌     | 2903/6434 [6:48:29<8:03:36,  8.22s/it, gpt_loss=0.267, loss_mean=0.313][A
+Train step of epoch 0:  45%|████▌     | 2903/6434 [6:48:37<8:03:36,  8.22s/it, gpt_loss=0.283, loss_mean=0.31] [A
+Train step of epoch 0:  45%|████▌     | 2904/6434 [6:48:37<8:10:56,  8.34s/it, gpt_loss=0.283, loss_mean=0.31][A
+Train step of epoch 0:  45%|████▌     | 2904/6434 [6:48:45<8:10:56,  8.34s/it, gpt_loss=0.25, loss_mean=0.304][A
+Train step of epoch 0:  45%|████▌     | 2905/6434 [6:48:45<8:00:58,  8.18s/it, gpt_loss=0.25, loss_mean=0.304][A
+Train step of epoch 0:  45%|████▌     | 2905/6434 [6:48:53<8:00:58,  8.18s/it, gpt_loss=0.288, loss_mean=0.302][A
+Train step of epoch 0:  45%|████▌     | 2906/6434 [6:48:53<7:53:20,  8.05s/it, gpt_loss=0.288, loss_mean=0.302][A
+Train step of epoch 0:  45%|████▌     | 2906/6434 [6:49:00<7:53:20,  8.05s/it, gpt_loss=0.278, loss_mean=0.3]  [A
+Train step of epoch 0:  45%|████▌     | 2907/6434 [6:49:00<7:35:32,  7.75s/it, gpt_loss=0.278, loss_mean=0.3][A
+Train step of epoch 0:  45%|████▌     | 2907/6434 [6:49:09<7:35:32,  7.75s/it, gpt_loss=0.267, loss_mean=0.297][A
+Train step of epoch 0:  45%|████▌     | 2908/6434 [6:49:09<7:58:21,  8.14s/it, gpt_loss=0.267, loss_mean=0.297][A
+Train step of epoch 0:  45%|████▌     | 2908/6434 [6:49:19<7:58:21,  8.14s/it, gpt_loss=0.281, loss_mean=0.295][A
+Train step of epoch 0:  45%|████▌     | 2909/6434 [6:49:19<8:29:31,  8.67s/it, gpt_loss=0.281, loss_mean=0.295][A
+[LID Router Debug] Step: 2910
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [1, 6, 4, 0, 9, 8, 5, 5, 3, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  45%|████▌     | 2909/6434 [6:49:27<8:29:31,  8.67s/it, gpt_loss=0.346, loss_mean=0.3]  [A
+Train step of epoch 0:  45%|████▌     | 2910/6434 [6:49:27<8:29:42,  8.68s/it, gpt_loss=0.346, loss_mean=0.3][A
+Train step of epoch 0:  45%|████▌     | 2910/6434 [6:49:36<8:29:42,  8.68s/it, gpt_loss=0.348, loss_mean=0.305][A
+Train step of epoch 0:  45%|████▌     | 2911/6434 [6:49:36<8:33:24,  8.74s/it, gpt_loss=0.348, loss_mean=0.305][A
+Train step of epoch 0:  45%|████▌     | 2911/6434 [6:49:44<8:33:24,  8.74s/it, gpt_loss=0.338, loss_mean=0.308][A
+Train step of epoch 0:  45%|████▌     | 2912/6434 [6:49:44<8:07:34,  8.31s/it, gpt_loss=0.338, loss_mean=0.308][A
+Train step of epoch 0:  45%|████▌     | 2912/6434 [6:49:53<8:07:34,  8.31s/it, gpt_loss=0.367, loss_mean=0.314][A
+Train step of epoch 0:  45%|████▌     | 2913/6434 [6:49:53<8:16:44,  8.46s/it, gpt_loss=0.367, loss_mean=0.314][A
+Train step of epoch 0:  45%|████▌     | 2913/6434 [6:50:02<8:16:44,  8.46s/it, gpt_loss=0.366, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2914/6434 [6:50:02<8:35:04,  8.78s/it, gpt_loss=0.366, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2914/6434 [6:50:11<8:35:04,  8.78s/it, gpt_loss=0.318, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2915/6434 [6:50:11<8:41:27,  8.89s/it, gpt_loss=0.318, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2915/6434 [6:50:20<8:41:27,  8.89s/it, gpt_loss=0.318, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2916/6434 [6:50:20<8:32:15,  8.74s/it, gpt_loss=0.318, loss_mean=0.319][A
+Train step of epoch 0:  45%|████▌     | 2916/6434 [6:50:28<8:32:15,  8.74s/it, gpt_loss=0.484, loss_mean=0.336][A
+Train step of epoch 0:  45%|████▌     | 2917/6434 [6:50:28<8:26:08,  8.63s/it, gpt_loss=0.484, loss_mean=0.336][A
+Train step of epoch 0:  45%|████▌     | 2917/6434 [6:50:37<8:26:08,  8.63s/it, gpt_loss=0.361, loss_mean=0.338][A
+Train step of epoch 0:  45%|████▌     | 2918/6434 [6:50:37<8:30:58,  8.72s/it, gpt_loss=0.361, loss_mean=0.338][A
+Train step of epoch 0:  45%|████▌     | 2918/6434 [6:50:45<8:30:58,  8.72s/it, gpt_loss=0.334, loss_mean=0.338][A
+Train step of epoch 0:  45%|████▌     | 2919/6434 [6:50:45<8:24:32,  8.61s/it, gpt_loss=0.334, loss_mean=0.338][A
+[LID Router Debug] Step: 2920
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [0, 9, 5, 5, 7, 1, 2, 2, 5, 1]
+Active Experts in Batch: {0, 1, 2, 5, 7, 9}
+
+Train step of epoch 0:  45%|████▌     | 2919/6434 [6:50:54<8:24:32,  8.61s/it, gpt_loss=0.361, loss_mean=0.34] [A
+Train step of epoch 0:  45%|████▌     | 2920/6434 [6:50:54<8:20:02,  8.54s/it, gpt_loss=0.361, loss_mean=0.34][A
+Train step of epoch 0:  45%|████▌     | 2920/6434 [6:51:02<8:20:02,  8.54s/it, gpt_loss=0.28, loss_mean=0.334][A
+Train step of epoch 0:  45%|████▌     | 2921/6434 [6:51:02<8:14:30,  8.45s/it, gpt_loss=0.28, loss_mean=0.334][A
+Train step of epoch 0:  45%|████▌     | 2921/6434 [6:51:09<8:14:30,  8.45s/it, gpt_loss=0.421, loss_mean=0.343][A
+Train step of epoch 0:  45%|████▌     | 2922/6434 [6:51:09<7:45:36,  7.95s/it, gpt_loss=0.421, loss_mean=0.343][A
+Train step of epoch 0:  45%|████▌     | 2922/6434 [6:51:16<7:45:36,  7.95s/it, gpt_loss=0.267, loss_mean=0.335][A
+Train step of epoch 0:  45%|████▌     | 2923/6434 [6:51:16<7:42:47,  7.91s/it, gpt_loss=0.267, loss_mean=0.335][A
+Train step of epoch 0:  45%|████▌     | 2923/6434 [6:51:25<7:42:47,  7.91s/it, gpt_loss=0.383, loss_mean=0.34] [A
+Train step of epoch 0:  45%|████▌     | 2924/6434 [6:51:25<7:48:29,  8.01s/it, gpt_loss=0.383, loss_mean=0.34][A
+Train step of epoch 0:  45%|████▌     | 2924/6434 [6:51:33<7:48:29,  8.01s/it, gpt_loss=0.413, loss_mean=0.347][A
+Train step of epoch 0:  45%|████▌     | 2925/6434 [6:51:33<7:45:18,  7.96s/it, gpt_loss=0.413, loss_mean=0.347][A
+Train step of epoch 0:  45%|████▌     | 2925/6434 [6:51:41<7:45:18,  7.96s/it, gpt_loss=0.3, loss_mean=0.342]  [A
+Train step of epoch 0:  45%|████▌     | 2926/6434 [6:51:41<8:02:46,  8.26s/it, gpt_loss=0.3, loss_mean=0.342][A
+Train step of epoch 0:  45%|████▌     | 2926/6434 [6:51:50<8:02:46,  8.26s/it, gpt_loss=0.231, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▌     | 2927/6434 [6:51:50<8:02:39,  8.26s/it, gpt_loss=0.231, loss_mean=0.331][A
+Train step of epoch 0:  45%|████▌     | 2927/6434 [6:51:58<8:02:39,  8.26s/it, gpt_loss=0.318, loss_mean=0.33] [A
+Train step of epoch 0:  46%|████▌     | 2928/6434 [6:51:58<8:00:29,  8.22s/it, gpt_loss=0.318, loss_mean=0.33][A
+Train step of epoch 0:  46%|████▌     | 2928/6434 [6:52:06<8:00:29,  8.22s/it, gpt_loss=0.348, loss_mean=0.332][A
+Train step of epoch 0:  46%|████▌     | 2929/6434 [6:52:06<8:07:10,  8.34s/it, gpt_loss=0.348, loss_mean=0.332][A
+[LID Router Debug] Step: 2930
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [0, 5, 6, 0, 6, 0, 0, 2, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+Train step of epoch 0:  46%|████▌     | 2929/6434 [6:52:15<8:07:10,  8.34s/it, gpt_loss=0.25, loss_mean=0.324] [A
+Train step of epoch 0:  46%|████▌     | 2930/6434 [6:52:15<8:12:38,  8.44s/it, gpt_loss=0.25, loss_mean=0.324][A
+Train step of epoch 0:  46%|████▌     | 2930/6434 [6:52:23<8:12:38,  8.44s/it, gpt_loss=0.33, loss_mean=0.324][A
+Train step of epoch 0:  46%|████▌     | 2931/6434 [6:52:23<8:04:07,  8.29s/it, gpt_loss=0.33, loss_mean=0.324][A
+Train step of epoch 0:  46%|████▌     | 2931/6434 [6:52:31<8:04:07,  8.29s/it, gpt_loss=0.287, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▌     | 2932/6434 [6:52:31<8:04:25,  8.30s/it, gpt_loss=0.287, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▌     | 2932/6434 [6:52:40<8:04:25,  8.30s/it, gpt_loss=0.244, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2933/6434 [6:52:40<8:06:59,  8.35s/it, gpt_loss=0.244, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2933/6434 [6:52:48<8:06:59,  8.35s/it, gpt_loss=0.258, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2934/6434 [6:52:48<8:00:13,  8.23s/it, gpt_loss=0.258, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2934/6434 [6:52:56<8:00:13,  8.23s/it, gpt_loss=0.404, loss_mean=0.317][A
+Train step of epoch 0:  46%|████▌     | 2935/6434 [6:52:56<8:05:30,  8.33s/it, gpt_loss=0.404, loss_mean=0.317][A
+Train step of epoch 0:  46%|████▌     | 2935/6434 [6:53:05<8:05:30,  8.33s/it, gpt_loss=0.281, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2936/6434 [6:53:05<8:14:05,  8.47s/it, gpt_loss=0.281, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2936/6434 [6:53:14<8:14:05,  8.47s/it, gpt_loss=0.3, loss_mean=0.312]  [A
+Train step of epoch 0:  46%|████▌     | 2937/6434 [6:53:14<8:26:59,  8.70s/it, gpt_loss=0.3, loss_mean=0.312][A
+Train step of epoch 0:  46%|████▌     | 2937/6434 [6:53:23<8:26:59,  8.70s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▌     | 2938/6434 [6:53:23<8:24:04,  8.65s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▌     | 2938/6434 [6:53:31<8:24:04,  8.65s/it, gpt_loss=0.355, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▌     | 2939/6434 [6:53:31<8:05:19,  8.33s/it, gpt_loss=0.355, loss_mean=0.315][A
+[LID Router Debug] Step: 2940
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [1, 3, 6, 6, 2, 4, 3, 9, 4, 9]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  46%|████▌     | 2939/6434 [6:53:39<8:05:19,  8.33s/it, gpt_loss=0.363, loss_mean=0.32] [A
+Train step of epoch 0:  46%|████▌     | 2940/6434 [6:53:39<8:04:32,  8.32s/it, gpt_loss=0.363, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▌     | 2940/6434 [6:53:47<8:04:32,  8.32s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  46%|████▌     | 2941/6434 [6:53:47<8:03:40,  8.31s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  46%|████▌     | 2941/6434 [6:53:55<8:03:40,  8.31s/it, gpt_loss=0.281, loss_mean=0.319][A
+Train step of epoch 0:  46%|████▌     | 2942/6434 [6:53:55<8:00:40,  8.26s/it, gpt_loss=0.281, loss_mean=0.319][A
+Train step of epoch 0:  46%|████▌     | 2942/6434 [6:54:04<8:00:40,  8.26s/it, gpt_loss=0.282, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▌     | 2943/6434 [6:54:04<8:13:38,  8.48s/it, gpt_loss=0.282, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▌     | 2943/6434 [6:54:13<8:13:38,  8.48s/it, gpt_loss=0.377, loss_mean=0.322][A
+Train step of epoch 0:  46%|████▌     | 2944/6434 [6:54:13<8:16:43,  8.54s/it, gpt_loss=0.377, loss_mean=0.322][A
+Train step of epoch 0:  46%|████▌     | 2944/6434 [6:54:21<8:16:43,  8.54s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▌     | 2945/6434 [6:54:21<8:05:37,  8.35s/it, gpt_loss=0.337, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▌     | 2945/6434 [6:54:29<8:05:37,  8.35s/it, gpt_loss=0.366, loss_mean=0.328][A
+Train step of epoch 0:  46%|████▌     | 2946/6434 [6:54:29<7:59:46,  8.25s/it, gpt_loss=0.366, loss_mean=0.328][A
+Train step of epoch 0:  46%|████▌     | 2946/6434 [6:54:38<7:59:46,  8.25s/it, gpt_loss=0.253, loss_mean=0.32] [A
+Train step of epoch 0:  46%|████▌     | 2947/6434 [6:54:38<8:16:23,  8.54s/it, gpt_loss=0.253, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▌     | 2947/6434 [6:54:45<8:16:23,  8.54s/it, gpt_loss=0.332, loss_mean=0.321][A
+Train step of epoch 0:  46%|████▌     | 2948/6434 [6:54:45<7:54:49,  8.17s/it, gpt_loss=0.332, loss_mean=0.321][A
+Train step of epoch 0:  46%|████▌     | 2948/6434 [6:54:54<7:54:49,  8.17s/it, gpt_loss=0.305, loss_mean=0.32] [A
+Train step of epoch 0:  46%|████▌     | 2949/6434 [6:54:54<7:59:51,  8.26s/it, gpt_loss=0.305, loss_mean=0.32][A
+[LID Router Debug] Step: 2950
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [2, 9, 4, 3, 3, 6, 9, 0, 6, 9]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  46%|████▌     | 2949/6434 [6:55:02<7:59:51,  8.26s/it, gpt_loss=0.278, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▌     | 2950/6434 [6:55:02<7:57:40,  8.23s/it, gpt_loss=0.278, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▌     | 2950/6434 [6:55:12<7:57:40,  8.23s/it, gpt_loss=0.212, loss_mean=0.305][A
+Train step of epoch 0:  46%|████▌     | 2951/6434 [6:55:12<8:27:01,  8.73s/it, gpt_loss=0.212, loss_mean=0.305][A
+Train step of epoch 0:  46%|████▌     | 2951/6434 [6:55:20<8:27:01,  8.73s/it, gpt_loss=0.274, loss_mean=0.302][A
+Train step of epoch 0:  46%|████▌     | 2952/6434 [6:55:20<8:09:24,  8.43s/it, gpt_loss=0.274, loss_mean=0.302][A
+Train step of epoch 0:  46%|████▌     | 2952/6434 [6:55:28<8:09:24,  8.43s/it, gpt_loss=0.281, loss_mean=0.3]  [A
+Train step of epoch 0:  46%|████▌     | 2953/6434 [6:55:28<8:08:02,  8.41s/it, gpt_loss=0.281, loss_mean=0.3][A
+Train step of epoch 0:  46%|████▌     | 2953/6434 [6:55:36<8:08:02,  8.41s/it, gpt_loss=0.35, loss_mean=0.305][A
+Train step of epoch 0:  46%|████▌     | 2954/6434 [6:55:36<7:54:39,  8.18s/it, gpt_loss=0.35, loss_mean=0.305][A
+Train step of epoch 0:  46%|████▌     | 2954/6434 [6:55:45<7:54:39,  8.18s/it, gpt_loss=0.433, loss_mean=0.318][A
+Train step of epoch 0:  46%|████▌     | 2955/6434 [6:55:45<8:06:36,  8.39s/it, gpt_loss=0.433, loss_mean=0.318][A
+Train step of epoch 0:  46%|████▌     | 2955/6434 [6:55:52<8:06:36,  8.39s/it, gpt_loss=0.271, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2956/6434 [6:55:52<7:45:12,  8.03s/it, gpt_loss=0.271, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2956/6434 [6:56:02<7:45:12,  8.03s/it, gpt_loss=0.384, loss_mean=0.32] [A
+Train step of epoch 0:  46%|████▌     | 2957/6434 [6:56:02<8:21:58,  8.66s/it, gpt_loss=0.384, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▌     | 2957/6434 [6:56:10<8:21:58,  8.66s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▌     | 2958/6434 [6:56:10<8:04:11,  8.36s/it, gpt_loss=0.277, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▌     | 2958/6434 [6:56:19<8:04:11,  8.36s/it, gpt_loss=0.271, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▌     | 2959/6434 [6:56:19<8:19:32,  8.63s/it, gpt_loss=0.271, loss_mean=0.311][A
+[LID Router Debug] Step: 2960
+Batch Size: 10
+Audio Batch Size: 145
+LID Assignments: [9, 2, 4, 6, 2, 9, 3, 3, 2, 9]
+Active Experts in Batch: {2, 3, 4, 6, 9}
+
+Train step of epoch 0:  46%|████▌     | 2959/6434 [6:56:28<8:19:32,  8.63s/it, gpt_loss=0.263, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2960/6434 [6:56:28<8:26:29,  8.75s/it, gpt_loss=0.263, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2960/6434 [6:56:38<8:26:29,  8.75s/it, gpt_loss=0.313, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2961/6434 [6:56:38<8:42:43,  9.03s/it, gpt_loss=0.313, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2961/6434 [6:56:45<8:42:43,  9.03s/it, gpt_loss=0.326, loss_mean=0.309][A
+Train step of epoch 0:  46%|████▌     | 2962/6434 [6:56:45<8:22:31,  8.68s/it, gpt_loss=0.326, loss_mean=0.309][A
+Train step of epoch 0:  46%|████▌     | 2962/6434 [6:56:54<8:22:31,  8.68s/it, gpt_loss=0.29, loss_mean=0.307] [A
+Train step of epoch 0:  46%|████▌     | 2963/6434 [6:56:54<8:14:33,  8.55s/it, gpt_loss=0.29, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2963/6434 [6:57:03<8:14:33,  8.55s/it, gpt_loss=0.358, loss_mean=0.312][A
+Train step of epoch 0:  46%|████▌     | 2964/6434 [6:57:03<8:21:06,  8.66s/it, gpt_loss=0.358, loss_mean=0.312][A
+Train step of epoch 0:  46%|████▌     | 2964/6434 [6:57:13<8:21:06,  8.66s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▌     | 2965/6434 [6:57:13<8:47:22,  9.12s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▌     | 2965/6434 [6:57:21<8:47:22,  9.12s/it, gpt_loss=0.334, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2966/6434 [6:57:21<8:29:07,  8.81s/it, gpt_loss=0.334, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2966/6434 [6:57:29<8:29:07,  8.81s/it, gpt_loss=0.333, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▌     | 2967/6434 [6:57:29<8:21:43,  8.68s/it, gpt_loss=0.333, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▌     | 2967/6434 [6:57:38<8:21:43,  8.68s/it, gpt_loss=0.294, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2968/6434 [6:57:38<8:25:00,  8.74s/it, gpt_loss=0.294, loss_mean=0.313][A
+Train step of epoch 0:  46%|████▌     | 2968/6434 [6:57:47<8:25:00,  8.74s/it, gpt_loss=0.241, loss_mean=0.306][A
+Train step of epoch 0:  46%|████▌     | 2969/6434 [6:57:47<8:28:20,  8.80s/it, gpt_loss=0.241, loss_mean=0.306][A
+[LID Router Debug] Step: 2970
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [9, 1, 0, 6, 4, 1, 9, 6, 6, 1]
+Active Experts in Batch: {0, 1, 4, 6, 9}
+
+Train step of epoch 0:  46%|████▌     | 2969/6434 [6:57:56<8:28:20,  8.80s/it, gpt_loss=0.315, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2970/6434 [6:57:56<8:35:18,  8.93s/it, gpt_loss=0.315, loss_mean=0.307][A
+Train step of epoch 0:  46%|████▌     | 2970/6434 [6:58:06<8:35:18,  8.93s/it, gpt_loss=0.246, loss_mean=0.301][A
+Train step of epoch 0:  46%|████▌     | 2971/6434 [6:58:06<8:54:38,  9.26s/it, gpt_loss=0.246, loss_mean=0.301][A
+Train step of epoch 0:  46%|████▌     | 2971/6434 [6:58:15<8:54:38,  9.26s/it, gpt_loss=0.336, loss_mean=0.304][A
+Train step of epoch 0:  46%|████▌     | 2972/6434 [6:58:15<8:40:10,  9.02s/it, gpt_loss=0.336, loss_mean=0.304][A
+Train step of epoch 0:  46%|████▌     | 2972/6434 [6:58:22<8:40:10,  9.02s/it, gpt_loss=0.323, loss_mean=0.306][A
+Train step of epoch 0:  46%|████▌     | 2973/6434 [6:58:22<8:18:09,  8.64s/it, gpt_loss=0.323, loss_mean=0.306][A
+Train step of epoch 0:  46%|████▌     | 2973/6434 [6:58:30<8:18:09,  8.64s/it, gpt_loss=0.344, loss_mean=0.31] [A
+Train step of epoch 0:  46%|████▌     | 2974/6434 [6:58:30<7:58:52,  8.30s/it, gpt_loss=0.344, loss_mean=0.31][A
+Train step of epoch 0:  46%|████▌     | 2974/6434 [6:58:39<7:58:52,  8.30s/it, gpt_loss=0.326, loss_mean=0.312][A
+Train step of epoch 0:  46%|████▌     | 2975/6434 [6:58:39<8:14:03,  8.57s/it, gpt_loss=0.326, loss_mean=0.312][A
+Train step of epoch 0:  46%|████▌     | 2975/6434 [6:58:48<8:14:03,  8.57s/it, gpt_loss=0.29, loss_mean=0.309] [A
+Train step of epoch 0:  46%|████▋     | 2976/6434 [6:58:48<8:16:50,  8.62s/it, gpt_loss=0.29, loss_mean=0.309][A
+Train step of epoch 0:  46%|████▋     | 2976/6434 [6:58:55<8:16:50,  8.62s/it, gpt_loss=0.328, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▋     | 2977/6434 [6:58:55<7:51:44,  8.19s/it, gpt_loss=0.328, loss_mean=0.311][A
+Train step of epoch 0:  46%|████▋     | 2977/6434 [6:59:03<7:51:44,  8.19s/it, gpt_loss=0.358, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▋     | 2978/6434 [6:59:03<7:47:37,  8.12s/it, gpt_loss=0.358, loss_mean=0.316][A
+Train step of epoch 0:  46%|████▋     | 2978/6434 [6:59:11<7:47:37,  8.12s/it, gpt_loss=0.373, loss_mean=0.322][A
+Train step of epoch 0:  46%|████▋     | 2979/6434 [6:59:11<7:46:58,  8.11s/it, gpt_loss=0.373, loss_mean=0.322][A
+[LID Router Debug] Step: 2980
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [5, 0, 10, 0, 1, 0, 9, 9, 1, 5]
+Active Experts in Batch: {0, 1, 5, 9, 10}
+
+Train step of epoch 0:  46%|████▋     | 2979/6434 [6:59:19<7:46:58,  8.11s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  46%|████▋     | 2980/6434 [6:59:19<7:49:54,  8.16s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  46%|████▋     | 2980/6434 [6:59:26<7:49:54,  8.16s/it, gpt_loss=0.401, loss_mean=0.327][A
+Train step of epoch 0:  46%|████▋     | 2981/6434 [6:59:26<7:27:03,  7.77s/it, gpt_loss=0.401, loss_mean=0.327][A
+Train step of epoch 0:  46%|████▋     | 2981/6434 [6:59:36<7:27:03,  7.77s/it, gpt_loss=0.232, loss_mean=0.317][A
+Train step of epoch 0:  46%|████▋     | 2982/6434 [6:59:36<7:54:36,  8.25s/it, gpt_loss=0.232, loss_mean=0.317][A
+Train step of epoch 0:  46%|████▋     | 2982/6434 [6:59:43<7:54:36,  8.25s/it, gpt_loss=0.32, loss_mean=0.318] [A
+Train step of epoch 0:  46%|████▋     | 2983/6434 [6:59:43<7:46:50,  8.12s/it, gpt_loss=0.32, loss_mean=0.318][A
+Train step of epoch 0:  46%|████▋     | 2983/6434 [6:59:52<7:46:50,  8.12s/it, gpt_loss=0.29, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▋     | 2984/6434 [6:59:52<7:57:26,  8.30s/it, gpt_loss=0.29, loss_mean=0.315][A
+Train step of epoch 0:  46%|████▋     | 2984/6434 [7:00:00<7:57:26,  8.30s/it, gpt_loss=0.401, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▋     | 2985/6434 [7:00:00<7:44:55,  8.09s/it, gpt_loss=0.401, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▋     | 2985/6434 [7:00:08<7:44:55,  8.09s/it, gpt_loss=0.298, loss_mean=0.321][A
+Train step of epoch 0:  46%|████▋     | 2986/6434 [7:00:08<7:51:14,  8.20s/it, gpt_loss=0.298, loss_mean=0.321][A
+Train step of epoch 0:  46%|████▋     | 2986/6434 [7:00:17<7:51:14,  8.20s/it, gpt_loss=0.329, loss_mean=0.322][A
+Train step of epoch 0:  46%|████▋     | 2987/6434 [7:00:17<8:01:41,  8.38s/it, gpt_loss=0.329, loss_mean=0.322][A
+Train step of epoch 0:  46%|████▋     | 2987/6434 [7:00:25<8:01:41,  8.38s/it, gpt_loss=0.334, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▋     | 2988/6434 [7:00:25<7:55:14,  8.27s/it, gpt_loss=0.334, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▋     | 2988/6434 [7:00:34<7:55:14,  8.27s/it, gpt_loss=0.324, loss_mean=0.323][A
+Train step of epoch 0:  46%|████▋     | 2989/6434 [7:00:34<7:58:05,  8.33s/it, gpt_loss=0.324, loss_mean=0.323][A
+[LID Router Debug] Step: 2990
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [1, 9, 3, 1, 6, 0, 2, 6, 2, 8]
+Active Experts in Batch: {0, 1, 2, 3, 6, 8, 9}
+
+Train step of epoch 0:  46%|████▋     | 2989/6434 [7:00:42<7:58:05,  8.33s/it, gpt_loss=0.294, loss_mean=0.32] [A
+Train step of epoch 0:  46%|████▋     | 2990/6434 [7:00:42<7:51:49,  8.22s/it, gpt_loss=0.294, loss_mean=0.32][A
+Train step of epoch 0:  46%|████▋     | 2990/6434 [7:00:50<7:51:49,  8.22s/it, gpt_loss=0.371, loss_mean=0.325][A
+Train step of epoch 0:  46%|████▋     | 2991/6434 [7:00:50<7:51:19,  8.21s/it, gpt_loss=0.371, loss_mean=0.325][A
+Train step of epoch 0:  46%|████▋     | 2991/6434 [7:00:58<7:51:19,  8.21s/it, gpt_loss=0.261, loss_mean=0.319][A
+Train step of epoch 0:  47%|████▋     | 2992/6434 [7:00:58<7:45:45,  8.12s/it, gpt_loss=0.261, loss_mean=0.319][A
+Train step of epoch 0:  47%|████▋     | 2992/6434 [7:01:05<7:45:45,  8.12s/it, gpt_loss=0.27, loss_mean=0.314] [A
+Train step of epoch 0:  47%|████▋     | 2993/6434 [7:01:05<7:37:58,  7.99s/it, gpt_loss=0.27, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 2993/6434 [7:01:14<7:37:58,  7.99s/it, gpt_loss=0.247, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2994/6434 [7:01:14<7:46:53,  8.14s/it, gpt_loss=0.247, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2994/6434 [7:01:22<7:46:53,  8.14s/it, gpt_loss=0.303, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2995/6434 [7:01:22<7:46:29,  8.14s/it, gpt_loss=0.303, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2995/6434 [7:01:32<7:46:29,  8.14s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2996/6434 [7:01:32<8:14:40,  8.63s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 2996/6434 [7:01:40<8:14:40,  8.63s/it, gpt_loss=0.24, loss_mean=0.3]   [A
+Train step of epoch 0:  47%|████▋     | 2997/6434 [7:01:40<8:09:16,  8.54s/it, gpt_loss=0.24, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 2997/6434 [7:01:49<8:09:16,  8.54s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  47%|████▋     | 2998/6434 [7:01:49<8:09:14,  8.54s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  47%|████▋     | 2998/6434 [7:01:57<8:09:14,  8.54s/it, gpt_loss=0.27, loss_mean=0.299] [A
+Train step of epoch 0:  47%|████▋     | 2999/6434 [7:01:57<8:08:27,  8.53s/it, gpt_loss=0.27, loss_mean=0.299][A
+[LID Router Debug] Step: 3000
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [4, 3, 5, 5, 5, 5, 3, 5, 5, 1]
+Active Experts in Batch: {1, 3, 4, 5}
+[2026-02-06 22:58:10,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=0, lr=[1.8954562374528512e-05, 1.8954562374528512e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 22:58:10,143] [INFO] [timer.py:260:stop] epoch=0/micro_step=3000/global_step=1500, RunningAvgSamplesPerSec=4.7492055333738445, CurrSamplesPerSec=4.690624816479954, MemAllocated=12.61GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  47%|████▋     | 2999/6434 [7:02:06<8:08:27,  8.53s/it, gpt_loss=0.39, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3000/6434 [7:02:06<8:09:07,  8.55s/it, gpt_loss=0.39, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3000/6434 [7:02:15<8:09:07,  8.55s/it, gpt_loss=0.249, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3001/6434 [7:02:15<8:18:02,  8.70s/it, gpt_loss=0.249, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3001/6434 [7:02:24<8:18:02,  8.70s/it, gpt_loss=0.205, loss_mean=0.293][A
+Train step of epoch 0:  47%|████▋     | 3002/6434 [7:02:24<8:31:49,  8.95s/it, gpt_loss=0.205, loss_mean=0.293][A
+Train step of epoch 0:  47%|████▋     | 3002/6434 [7:02:32<8:31:49,  8.95s/it, gpt_loss=0.245, loss_mean=0.288][A
+Train step of epoch 0:  47%|████▋     | 3003/6434 [7:02:32<8:16:49,  8.69s/it, gpt_loss=0.245, loss_mean=0.288][A
+Train step of epoch 0:  47%|████▋     | 3003/6434 [7:02:41<8:16:49,  8.69s/it, gpt_loss=0.303, loss_mean=0.289][A
+Train step of epoch 0:  47%|████▋     | 3004/6434 [7:02:41<8:20:43,  8.76s/it, gpt_loss=0.303, loss_mean=0.289][A
+Train step of epoch 0:  47%|████▋     | 3004/6434 [7:02:50<8:20:43,  8.76s/it, gpt_loss=0.312, loss_mean=0.292][A
+Train step of epoch 0:  47%|████▋     | 3005/6434 [7:02:50<8:21:42,  8.78s/it, gpt_loss=0.312, loss_mean=0.292][A
+Train step of epoch 0:  47%|████▋     | 3005/6434 [7:02:58<8:21:42,  8.78s/it, gpt_loss=0.37, loss_mean=0.3]   [A
+Train step of epoch 0:  47%|████▋     | 3006/6434 [7:02:58<8:04:30,  8.48s/it, gpt_loss=0.37, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3006/6434 [7:03:07<8:04:30,  8.48s/it, gpt_loss=0.274, loss_mean=0.297][A
+Train step of epoch 0:  47%|████▋     | 3007/6434 [7:03:07<8:10:48,  8.59s/it, gpt_loss=0.274, loss_mean=0.297][A
+Train step of epoch 0:  47%|████▋     | 3007/6434 [7:03:16<8:10:48,  8.59s/it, gpt_loss=0.331, loss_mean=0.3]  [A
+Train step of epoch 0:  47%|████▋     | 3008/6434 [7:03:16<8:23:59,  8.83s/it, gpt_loss=0.331, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3008/6434 [7:03:24<8:23:59,  8.83s/it, gpt_loss=0.301, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3009/6434 [7:03:24<8:15:17,  8.68s/it, gpt_loss=0.301, loss_mean=0.3][A
+[LID Router Debug] Step: 3010
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [9, 9, 2, 3, 5, 2, 3, 0, 9, 0]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  47%|████▋     | 3009/6434 [7:03:33<8:15:17,  8.68s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3010/6434 [7:03:33<8:06:27,  8.52s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3010/6434 [7:03:41<8:06:27,  8.52s/it, gpt_loss=0.307, loss_mean=0.301][A
+Train step of epoch 0:  47%|████▋     | 3011/6434 [7:03:41<8:09:16,  8.58s/it, gpt_loss=0.307, loss_mean=0.301][A
+Train step of epoch 0:  47%|████▋     | 3011/6434 [7:03:50<8:09:16,  8.58s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  47%|████▋     | 3012/6434 [7:03:50<8:04:29,  8.49s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  47%|████▋     | 3012/6434 [7:03:59<8:04:29,  8.49s/it, gpt_loss=0.349, loss_mean=0.304][A
+Train step of epoch 0:  47%|████▋     | 3013/6434 [7:03:59<8:22:41,  8.82s/it, gpt_loss=0.349, loss_mean=0.304][A
+Train step of epoch 0:  47%|████▋     | 3013/6434 [7:04:08<8:22:41,  8.82s/it, gpt_loss=0.324, loss_mean=0.306][A
+Train step of epoch 0:  47%|████▋     | 3014/6434 [7:04:08<8:18:02,  8.74s/it, gpt_loss=0.324, loss_mean=0.306][A
+Train step of epoch 0:  47%|████▋     | 3014/6434 [7:04:16<8:18:02,  8.74s/it, gpt_loss=0.266, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3015/6434 [7:04:16<8:08:34,  8.57s/it, gpt_loss=0.266, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3015/6434 [7:04:25<8:08:34,  8.57s/it, gpt_loss=0.257, loss_mean=0.298][A
+Train step of epoch 0:  47%|████▋     | 3016/6434 [7:04:25<8:14:02,  8.67s/it, gpt_loss=0.257, loss_mean=0.298][A
+Train step of epoch 0:  47%|████▋     | 3016/6434 [7:04:33<8:14:02,  8.67s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  47%|████▋     | 3017/6434 [7:04:33<8:12:59,  8.66s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  47%|████▋     | 3017/6434 [7:04:42<8:12:59,  8.66s/it, gpt_loss=0.244, loss_mean=0.295][A
+Train step of epoch 0:  47%|████▋     | 3018/6434 [7:04:42<8:14:48,  8.69s/it, gpt_loss=0.244, loss_mean=0.295][A
+Train step of epoch 0:  47%|████▋     | 3018/6434 [7:04:50<8:14:48,  8.69s/it, gpt_loss=0.238, loss_mean=0.289][A
+Train step of epoch 0:  47%|████▋     | 3019/6434 [7:04:50<8:07:46,  8.57s/it, gpt_loss=0.238, loss_mean=0.289][A
+[LID Router Debug] Step: 3020
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [1, 6, 1, 5, 6, 9, 0, 6, 1, 3]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+Train step of epoch 0:  47%|████▋     | 3019/6434 [7:04:59<8:07:46,  8.57s/it, gpt_loss=0.244, loss_mean=0.285][A
+Train step of epoch 0:  47%|████▋     | 3020/6434 [7:04:59<8:07:19,  8.56s/it, gpt_loss=0.244, loss_mean=0.285][A
+Train step of epoch 0:  47%|████▋     | 3020/6434 [7:05:07<8:07:19,  8.56s/it, gpt_loss=0.296, loss_mean=0.286][A
+Train step of epoch 0:  47%|████▋     | 3021/6434 [7:05:07<7:53:36,  8.33s/it, gpt_loss=0.296, loss_mean=0.286][A
+Train step of epoch 0:  47%|████▋     | 3021/6434 [7:05:15<7:53:36,  8.33s/it, gpt_loss=0.362, loss_mean=0.293][A
+Train step of epoch 0:  47%|████▋     | 3022/6434 [7:05:15<7:52:45,  8.31s/it, gpt_loss=0.362, loss_mean=0.293][A
+Train step of epoch 0:  47%|████▋     | 3022/6434 [7:05:24<7:52:45,  8.31s/it, gpt_loss=0.364, loss_mean=0.3]  [A
+Train step of epoch 0:  47%|████▋     | 3023/6434 [7:05:24<7:55:23,  8.36s/it, gpt_loss=0.364, loss_mean=0.3][A
+Train step of epoch 0:  47%|████▋     | 3023/6434 [7:05:32<7:55:23,  8.36s/it, gpt_loss=0.311, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3024/6434 [7:05:32<7:55:03,  8.36s/it, gpt_loss=0.311, loss_mean=0.302][A
+Train step of epoch 0:  47%|████▋     | 3024/6434 [7:05:41<7:55:03,  8.36s/it, gpt_loss=0.422, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3025/6434 [7:05:41<8:04:31,  8.53s/it, gpt_loss=0.422, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3025/6434 [7:05:49<8:04:31,  8.53s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  47%|████▋     | 3026/6434 [7:05:49<8:03:39,  8.52s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  47%|████▋     | 3026/6434 [7:05:57<8:03:39,  8.52s/it, gpt_loss=0.39, loss_mean=0.321] [A
+Train step of epoch 0:  47%|████▋     | 3027/6434 [7:05:57<7:52:45,  8.33s/it, gpt_loss=0.39, loss_mean=0.321][A
+Train step of epoch 0:  47%|████▋     | 3027/6434 [7:06:05<7:52:45,  8.33s/it, gpt_loss=0.338, loss_mean=0.322][A
+Train step of epoch 0:  47%|████▋     | 3028/6434 [7:06:05<7:47:06,  8.23s/it, gpt_loss=0.338, loss_mean=0.322][A
+Train step of epoch 0:  47%|████▋     | 3028/6434 [7:06:13<7:47:06,  8.23s/it, gpt_loss=0.353, loss_mean=0.326][A
+Train step of epoch 0:  47%|████▋     | 3029/6434 [7:06:13<7:43:10,  8.16s/it, gpt_loss=0.353, loss_mean=0.326][A
+[LID Router Debug] Step: 3030
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [6, 1, 6, 1, 1, 1, 2, 2, 2, 10]
+Active Experts in Batch: {1, 2, 10, 6}
+
+Train step of epoch 0:  47%|████▋     | 3029/6434 [7:06:22<7:43:10,  8.16s/it, gpt_loss=0.395, loss_mean=0.333][A
+Train step of epoch 0:  47%|████▋     | 3030/6434 [7:06:22<7:55:49,  8.39s/it, gpt_loss=0.395, loss_mean=0.333][A
+Train step of epoch 0:  47%|████▋     | 3030/6434 [7:06:30<7:55:49,  8.39s/it, gpt_loss=0.315, loss_mean=0.331][A
+Train step of epoch 0:  47%|████▋     | 3031/6434 [7:06:30<7:54:09,  8.36s/it, gpt_loss=0.315, loss_mean=0.331][A
+Train step of epoch 0:  47%|████▋     | 3031/6434 [7:06:38<7:54:09,  8.36s/it, gpt_loss=0.302, loss_mean=0.328][A
+Train step of epoch 0:  47%|████▋     | 3032/6434 [7:06:38<7:46:34,  8.23s/it, gpt_loss=0.302, loss_mean=0.328][A
+Train step of epoch 0:  47%|████▋     | 3032/6434 [7:06:47<7:46:34,  8.23s/it, gpt_loss=0.343, loss_mean=0.329][A
+Train step of epoch 0:  47%|████▋     | 3033/6434 [7:06:47<7:53:41,  8.36s/it, gpt_loss=0.343, loss_mean=0.329][A
+Train step of epoch 0:  47%|████▋     | 3033/6434 [7:06:55<7:53:41,  8.36s/it, gpt_loss=0.296, loss_mean=0.326][A
+Train step of epoch 0:  47%|████▋     | 3034/6434 [7:06:55<7:45:18,  8.21s/it, gpt_loss=0.296, loss_mean=0.326][A
+Train step of epoch 0:  47%|████▋     | 3034/6434 [7:07:03<7:45:18,  8.21s/it, gpt_loss=0.302, loss_mean=0.324][A
+Train step of epoch 0:  47%|████▋     | 3035/6434 [7:07:03<7:36:05,  8.05s/it, gpt_loss=0.302, loss_mean=0.324][A
+Train step of epoch 0:  47%|████▋     | 3035/6434 [7:07:11<7:36:05,  8.05s/it, gpt_loss=0.319, loss_mean=0.323][A
+Train step of epoch 0:  47%|████▋     | 3036/6434 [7:07:11<7:36:19,  8.06s/it, gpt_loss=0.319, loss_mean=0.323][A
+Train step of epoch 0:  47%|████▋     | 3036/6434 [7:07:19<7:36:19,  8.06s/it, gpt_loss=0.235, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3037/6434 [7:07:19<7:33:52,  8.02s/it, gpt_loss=0.235, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3037/6434 [7:07:26<7:33:52,  8.02s/it, gpt_loss=0.28, loss_mean=0.311] [A
+Train step of epoch 0:  47%|████▋     | 3038/6434 [7:07:26<7:30:58,  7.97s/it, gpt_loss=0.28, loss_mean=0.311][A
+Train step of epoch 0:  47%|████▋     | 3038/6434 [7:07:35<7:30:58,  7.97s/it, gpt_loss=0.368, loss_mean=0.317][A
+Train step of epoch 0:  47%|████▋     | 3039/6434 [7:07:35<7:37:00,  8.08s/it, gpt_loss=0.368, loss_mean=0.317][A
+[LID Router Debug] Step: 3040
+Batch Size: 10
+Audio Batch Size: 163
+LID Assignments: [4, 3, 9, 5, 1, 3, 3, 6, 3, 3]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  47%|████▋     | 3039/6434 [7:07:45<7:37:00,  8.08s/it, gpt_loss=0.284, loss_mean=0.313][A
+Train step of epoch 0:  47%|████▋     | 3040/6434 [7:07:45<8:13:27,  8.72s/it, gpt_loss=0.284, loss_mean=0.313][A
+Train step of epoch 0:  47%|████▋     | 3040/6434 [7:07:54<8:13:27,  8.72s/it, gpt_loss=0.323, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3041/6434 [7:07:54<8:12:37,  8.71s/it, gpt_loss=0.323, loss_mean=0.314][A
+Train step of epoch 0:  47%|████▋     | 3041/6434 [7:08:01<8:12:37,  8.71s/it, gpt_loss=0.247, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3042/6434 [7:08:01<7:53:33,  8.38s/it, gpt_loss=0.247, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3042/6434 [7:08:10<7:53:33,  8.38s/it, gpt_loss=0.336, loss_mean=0.311][A
+Train step of epoch 0:  47%|████▋     | 3043/6434 [7:08:10<7:55:58,  8.42s/it, gpt_loss=0.336, loss_mean=0.311][A
+Train step of epoch 0:  47%|████▋     | 3043/6434 [7:08:18<7:55:58,  8.42s/it, gpt_loss=0.385, loss_mean=0.318][A
+Train step of epoch 0:  47%|████▋     | 3044/6434 [7:08:18<8:00:44,  8.51s/it, gpt_loss=0.385, loss_mean=0.318][A
+Train step of epoch 0:  47%|████▋     | 3044/6434 [7:08:26<8:00:44,  8.51s/it, gpt_loss=0.339, loss_mean=0.32] [A
+Train step of epoch 0:  47%|████▋     | 3045/6434 [7:08:26<7:45:37,  8.24s/it, gpt_loss=0.339, loss_mean=0.32][A
+Train step of epoch 0:  47%|████▋     | 3045/6434 [7:08:37<7:45:37,  8.24s/it, gpt_loss=0.286, loss_mean=0.317][A
+Train step of epoch 0:  47%|████▋     | 3046/6434 [7:08:37<8:30:35,  9.04s/it, gpt_loss=0.286, loss_mean=0.317][A
+Train step of epoch 0:  47%|████▋     | 3046/6434 [7:08:45<8:30:35,  9.04s/it, gpt_loss=0.337, loss_mean=0.319][A
+Train step of epoch 0:  47%|████▋     | 3047/6434 [7:08:45<8:11:54,  8.71s/it, gpt_loss=0.337, loss_mean=0.319][A
+Train step of epoch 0:  47%|████▋     | 3047/6434 [7:08:54<8:11:54,  8.71s/it, gpt_loss=0.281, loss_mean=0.315][A
+Train step of epoch 0:  47%|████▋     | 3048/6434 [7:08:54<8:12:20,  8.72s/it, gpt_loss=0.281, loss_mean=0.315][A
+Train step of epoch 0:  47%|████▋     | 3048/6434 [7:09:04<8:12:20,  8.72s/it, gpt_loss=0.273, loss_mean=0.311][A
+Train step of epoch 0:  47%|████▋     | 3049/6434 [7:09:04<8:38:03,  9.18s/it, gpt_loss=0.273, loss_mean=0.311][A
+[LID Router Debug] Step: 3050
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [1, 2, 1, 4, 5, 1, 5, 4, 3, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+Train step of epoch 0:  47%|████▋     | 3049/6434 [7:09:13<8:38:03,  9.18s/it, gpt_loss=0.272, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3050/6434 [7:09:13<8:34:56,  9.13s/it, gpt_loss=0.272, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3050/6434 [7:09:22<8:34:56,  9.13s/it, gpt_loss=0.309, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3051/6434 [7:09:22<8:26:08,  8.98s/it, gpt_loss=0.309, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3051/6434 [7:09:31<8:26:08,  8.98s/it, gpt_loss=0.333, loss_mean=0.31] [A
+Train step of epoch 0:  47%|████▋     | 3052/6434 [7:09:31<8:27:17,  9.00s/it, gpt_loss=0.333, loss_mean=0.31][A
+Train step of epoch 0:  47%|████▋     | 3052/6434 [7:09:38<8:27:17,  9.00s/it, gpt_loss=0.388, loss_mean=0.317][A
+Train step of epoch 0:  47%|████▋     | 3053/6434 [7:09:38<8:02:52,  8.57s/it, gpt_loss=0.388, loss_mean=0.317][A
+Train step of epoch 0:  47%|████▋     | 3053/6434 [7:09:48<8:02:52,  8.57s/it, gpt_loss=0.236, loss_mean=0.309][A
+Train step of epoch 0:  47%|████▋     | 3054/6434 [7:09:48<8:20:23,  8.88s/it, gpt_loss=0.236, loss_mean=0.309][A
+Train step of epoch 0:  47%|████▋     | 3054/6434 [7:09:57<8:20:23,  8.88s/it, gpt_loss=0.301, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3055/6434 [7:09:57<8:16:57,  8.82s/it, gpt_loss=0.301, loss_mean=0.308][A
+Train step of epoch 0:  47%|████▋     | 3055/6434 [7:10:06<8:16:57,  8.82s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3056/6434 [7:10:06<8:27:13,  9.01s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  47%|████▋     | 3056/6434 [7:10:15<8:27:13,  9.01s/it, gpt_loss=0.222, loss_mean=0.299][A
+Train step of epoch 0:  48%|████▊     | 3057/6434 [7:10:15<8:19:47,  8.88s/it, gpt_loss=0.222, loss_mean=0.299][A
+Train step of epoch 0:  48%|████▊     | 3057/6434 [7:10:22<8:19:47,  8.88s/it, gpt_loss=0.283, loss_mean=0.297][A
+Train step of epoch 0:  48%|████▊     | 3058/6434 [7:10:22<8:00:59,  8.55s/it, gpt_loss=0.283, loss_mean=0.297][A
+Train step of epoch 0:  48%|████▊     | 3058/6434 [7:10:31<8:00:59,  8.55s/it, gpt_loss=0.407, loss_mean=0.308][A
+Train step of epoch 0:  48%|████▊     | 3059/6434 [7:10:31<7:57:52,  8.50s/it, gpt_loss=0.407, loss_mean=0.308][A
+[LID Router Debug] Step: 3060
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [9, 9, 10, 5, 2, 9, 9, 4, 4, 2]
+Active Experts in Batch: {2, 4, 5, 9, 10}
+
+Train step of epoch 0:  48%|████▊     | 3059/6434 [7:10:39<7:57:52,  8.50s/it, gpt_loss=0.421, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3060/6434 [7:10:39<7:57:35,  8.49s/it, gpt_loss=0.421, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3060/6434 [7:10:48<7:57:35,  8.49s/it, gpt_loss=0.268, loss_mean=0.314][A
+Train step of epoch 0:  48%|████▊     | 3061/6434 [7:10:48<8:02:10,  8.58s/it, gpt_loss=0.268, loss_mean=0.314][A
+Train step of epoch 0:  48%|████▊     | 3061/6434 [7:10:58<8:02:10,  8.58s/it, gpt_loss=0.275, loss_mean=0.31] [A
+Train step of epoch 0:  48%|████▊     | 3062/6434 [7:10:58<8:20:47,  8.91s/it, gpt_loss=0.275, loss_mean=0.31][A
+Train step of epoch 0:  48%|████▊     | 3062/6434 [7:11:06<8:20:47,  8.91s/it, gpt_loss=0.33, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3063/6434 [7:11:06<8:14:19,  8.80s/it, gpt_loss=0.33, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3063/6434 [7:11:15<8:14:19,  8.80s/it, gpt_loss=0.306, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3064/6434 [7:11:15<8:06:29,  8.66s/it, gpt_loss=0.306, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3064/6434 [7:11:22<8:06:29,  8.66s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  48%|████▊     | 3065/6434 [7:11:22<7:54:11,  8.45s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  48%|████▊     | 3065/6434 [7:11:31<7:54:11,  8.45s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3066/6434 [7:11:31<7:54:32,  8.45s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3066/6434 [7:11:39<7:54:32,  8.45s/it, gpt_loss=0.32, loss_mean=0.312] [A
+Train step of epoch 0:  48%|████▊     | 3067/6434 [7:11:39<7:52:47,  8.43s/it, gpt_loss=0.32, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3067/6434 [7:11:48<7:52:47,  8.43s/it, gpt_loss=0.251, loss_mean=0.306][A
+Train step of epoch 0:  48%|████▊     | 3068/6434 [7:11:48<8:02:12,  8.60s/it, gpt_loss=0.251, loss_mean=0.306][A
+Train step of epoch 0:  48%|████▊     | 3068/6434 [7:11:57<8:02:12,  8.60s/it, gpt_loss=0.311, loss_mean=0.307][A
+Train step of epoch 0:  48%|████▊     | 3069/6434 [7:11:57<8:00:32,  8.57s/it, gpt_loss=0.311, loss_mean=0.307][A
+[LID Router Debug] Step: 3070
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [2, 9, 1, 3, 2, 6, 6, 4, 5, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  48%|████▊     | 3069/6434 [7:12:05<8:00:32,  8.57s/it, gpt_loss=0.358, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3070/6434 [7:12:05<7:53:13,  8.44s/it, gpt_loss=0.358, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3070/6434 [7:12:13<7:53:13,  8.44s/it, gpt_loss=0.354, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3071/6434 [7:12:13<7:46:11,  8.32s/it, gpt_loss=0.354, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3071/6434 [7:12:22<7:46:11,  8.32s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3072/6434 [7:12:22<8:02:12,  8.61s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3072/6434 [7:12:30<8:02:12,  8.61s/it, gpt_loss=0.401, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3073/6434 [7:12:30<7:44:28,  8.29s/it, gpt_loss=0.401, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3073/6434 [7:12:38<7:44:28,  8.29s/it, gpt_loss=0.339, loss_mean=0.323][A
+Train step of epoch 0:  48%|████▊     | 3074/6434 [7:12:38<7:48:29,  8.37s/it, gpt_loss=0.339, loss_mean=0.323][A
+Train step of epoch 0:  48%|████▊     | 3074/6434 [7:12:47<7:48:29,  8.37s/it, gpt_loss=0.31, loss_mean=0.321] [A
+Train step of epoch 0:  48%|████▊     | 3075/6434 [7:12:47<7:54:54,  8.48s/it, gpt_loss=0.31, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3075/6434 [7:12:55<7:54:54,  8.48s/it, gpt_loss=0.284, loss_mean=0.318][A
+Train step of epoch 0:  48%|████▊     | 3076/6434 [7:12:55<7:39:25,  8.21s/it, gpt_loss=0.284, loss_mean=0.318][A
+Train step of epoch 0:  48%|████▊     | 3076/6434 [7:13:03<7:39:25,  8.21s/it, gpt_loss=0.24, loss_mean=0.31]  [A
+Train step of epoch 0:  48%|████▊     | 3077/6434 [7:13:03<7:49:02,  8.38s/it, gpt_loss=0.24, loss_mean=0.31][A
+Train step of epoch 0:  48%|████▊     | 3077/6434 [7:13:12<7:49:02,  8.38s/it, gpt_loss=0.249, loss_mean=0.304][A
+Train step of epoch 0:  48%|████▊     | 3078/6434 [7:13:12<7:46:43,  8.34s/it, gpt_loss=0.249, loss_mean=0.304][A
+Train step of epoch 0:  48%|████▊     | 3078/6434 [7:13:20<7:46:43,  8.34s/it, gpt_loss=0.274, loss_mean=0.301][A
+Train step of epoch 0:  48%|████▊     | 3079/6434 [7:13:20<7:45:43,  8.33s/it, gpt_loss=0.274, loss_mean=0.301][A
+[LID Router Debug] Step: 3080
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [9, 5, 0, 2, 9, 9, 5, 6, 9, 1]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9}
+
+Train step of epoch 0:  48%|████▊     | 3079/6434 [7:13:28<7:45:43,  8.33s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  48%|████▊     | 3080/6434 [7:13:28<7:45:28,  8.33s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  48%|████▊     | 3080/6434 [7:13:38<7:45:28,  8.33s/it, gpt_loss=0.382, loss_mean=0.311][A
+Train step of epoch 0:  48%|████▊     | 3081/6434 [7:13:38<8:11:21,  8.79s/it, gpt_loss=0.382, loss_mean=0.311][A
+Train step of epoch 0:  48%|████▊     | 3081/6434 [7:13:46<8:11:21,  8.79s/it, gpt_loss=0.363, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3082/6434 [7:13:46<7:56:34,  8.53s/it, gpt_loss=0.363, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3082/6434 [7:13:54<7:56:34,  8.53s/it, gpt_loss=0.376, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3083/6434 [7:13:54<7:47:18,  8.37s/it, gpt_loss=0.376, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3083/6434 [7:14:03<7:47:18,  8.37s/it, gpt_loss=0.368, loss_mean=0.327][A
+Train step of epoch 0:  48%|████▊     | 3084/6434 [7:14:03<7:59:12,  8.58s/it, gpt_loss=0.368, loss_mean=0.327][A
+Train step of epoch 0:  48%|████▊     | 3084/6434 [7:14:12<7:59:12,  8.58s/it, gpt_loss=0.296, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3085/6434 [7:14:12<7:59:49,  8.60s/it, gpt_loss=0.296, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3085/6434 [7:14:20<7:59:49,  8.60s/it, gpt_loss=0.25, loss_mean=0.316] [A
+Train step of epoch 0:  48%|████▊     | 3086/6434 [7:14:20<7:53:48,  8.49s/it, gpt_loss=0.25, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3086/6434 [7:14:29<7:53:48,  8.49s/it, gpt_loss=0.324, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3087/6434 [7:14:29<7:54:49,  8.51s/it, gpt_loss=0.324, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3087/6434 [7:14:38<7:54:49,  8.51s/it, gpt_loss=0.273, loss_mean=0.313][A
+Train step of epoch 0:  48%|████▊     | 3088/6434 [7:14:38<8:11:44,  8.82s/it, gpt_loss=0.273, loss_mean=0.313][A
+Train step of epoch 0:  48%|████▊     | 3088/6434 [7:14:46<8:11:44,  8.82s/it, gpt_loss=0.391, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3089/6434 [7:14:46<7:57:02,  8.56s/it, gpt_loss=0.391, loss_mean=0.321][A
+[LID Router Debug] Step: 3090
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [6, 0, 5, 4, 5, 10, 3, 0, 3, 5]
+Active Experts in Batch: {0, 3, 4, 5, 6, 10}
+
+Train step of epoch 0:  48%|████▊     | 3089/6434 [7:14:55<7:57:02,  8.56s/it, gpt_loss=0.322, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3090/6434 [7:14:55<7:55:20,  8.53s/it, gpt_loss=0.322, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3090/6434 [7:15:03<7:55:20,  8.53s/it, gpt_loss=0.249, loss_mean=0.313][A
+Train step of epoch 0:  48%|████▊     | 3091/6434 [7:15:03<7:53:52,  8.50s/it, gpt_loss=0.249, loss_mean=0.313][A
+Train step of epoch 0:  48%|████▊     | 3091/6434 [7:15:11<7:53:52,  8.50s/it, gpt_loss=0.34, loss_mean=0.316] [A
+Train step of epoch 0:  48%|████▊     | 3092/6434 [7:15:11<7:47:36,  8.40s/it, gpt_loss=0.34, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3092/6434 [7:15:19<7:47:36,  8.40s/it, gpt_loss=0.327, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3093/6434 [7:15:19<7:41:21,  8.29s/it, gpt_loss=0.327, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3093/6434 [7:15:28<7:41:21,  8.29s/it, gpt_loss=0.354, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3094/6434 [7:15:28<7:51:29,  8.47s/it, gpt_loss=0.354, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3094/6434 [7:15:37<7:51:29,  8.47s/it, gpt_loss=0.306, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3095/6434 [7:15:37<7:55:00,  8.54s/it, gpt_loss=0.306, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3095/6434 [7:15:45<7:55:00,  8.54s/it, gpt_loss=0.283, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3096/6434 [7:15:45<7:41:39,  8.30s/it, gpt_loss=0.283, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3096/6434 [7:15:52<7:41:39,  8.30s/it, gpt_loss=0.316, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3097/6434 [7:15:52<7:34:51,  8.18s/it, gpt_loss=0.316, loss_mean=0.316][A
+Train step of epoch 0:  48%|████▊     | 3097/6434 [7:16:00<7:34:51,  8.18s/it, gpt_loss=0.345, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3098/6434 [7:16:00<7:29:44,  8.09s/it, gpt_loss=0.345, loss_mean=0.319][A
+Train step of epoch 0:  48%|████▊     | 3098/6434 [7:16:09<7:29:44,  8.09s/it, gpt_loss=0.375, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3099/6434 [7:16:09<7:38:12,  8.24s/it, gpt_loss=0.375, loss_mean=0.324][A
+[LID Router Debug] Step: 3100
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [0, 5, 3, 4, 0, 4, 5, 0, 1, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+
+Train step of epoch 0:  48%|████▊     | 3099/6434 [7:16:18<7:38:12,  8.24s/it, gpt_loss=0.365, loss_mean=0.328][A
+Train step of epoch 0:  48%|████▊     | 3100/6434 [7:16:18<7:44:55,  8.37s/it, gpt_loss=0.365, loss_mean=0.328][A
+Train step of epoch 0:  48%|████▊     | 3100/6434 [7:16:26<7:44:55,  8.37s/it, gpt_loss=0.252, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3101/6434 [7:16:26<7:43:37,  8.35s/it, gpt_loss=0.252, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3101/6434 [7:16:34<7:43:37,  8.35s/it, gpt_loss=0.328, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3102/6434 [7:16:34<7:44:14,  8.36s/it, gpt_loss=0.328, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3102/6434 [7:16:42<7:44:14,  8.36s/it, gpt_loss=0.357, loss_mean=0.325][A
+Train step of epoch 0:  48%|████▊     | 3103/6434 [7:16:42<7:27:51,  8.07s/it, gpt_loss=0.357, loss_mean=0.325][A
+Train step of epoch 0:  48%|████▊     | 3103/6434 [7:16:51<7:27:51,  8.07s/it, gpt_loss=0.338, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3104/6434 [7:16:51<7:44:00,  8.36s/it, gpt_loss=0.338, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3104/6434 [7:16:59<7:44:00,  8.36s/it, gpt_loss=0.325, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3105/6434 [7:16:59<7:38:49,  8.27s/it, gpt_loss=0.325, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3105/6434 [7:17:08<7:38:49,  8.27s/it, gpt_loss=0.281, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3106/6434 [7:17:08<7:50:12,  8.48s/it, gpt_loss=0.281, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3106/6434 [7:17:16<7:50:12,  8.48s/it, gpt_loss=0.274, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3107/6434 [7:17:16<7:45:46,  8.40s/it, gpt_loss=0.274, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3107/6434 [7:17:24<7:45:46,  8.40s/it, gpt_loss=0.303, loss_mean=0.315][A
+Train step of epoch 0:  48%|████▊     | 3108/6434 [7:17:24<7:38:16,  8.27s/it, gpt_loss=0.303, loss_mean=0.315][A
+Train step of epoch 0:  48%|████▊     | 3108/6434 [7:17:32<7:38:16,  8.27s/it, gpt_loss=0.341, loss_mean=0.318][A
+Train step of epoch 0:  48%|████▊     | 3109/6434 [7:17:32<7:40:54,  8.32s/it, gpt_loss=0.341, loss_mean=0.318][A
+[LID Router Debug] Step: 3110
+Batch Size: 10
+Audio Batch Size: 76
+LID Assignments: [4, 9, 1, 1, 0, 5, 9, 5, 1, 1]
+Active Experts in Batch: {0, 1, 4, 5, 9}
+
+Train step of epoch 0:  48%|████▊     | 3109/6434 [7:17:42<7:40:54,  8.32s/it, gpt_loss=0.374, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3110/6434 [7:17:42<8:01:02,  8.68s/it, gpt_loss=0.374, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3110/6434 [7:17:51<8:01:02,  8.68s/it, gpt_loss=0.331, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3111/6434 [7:17:51<8:05:40,  8.77s/it, gpt_loss=0.331, loss_mean=0.324][A
+Train step of epoch 0:  48%|████▊     | 3111/6434 [7:17:59<8:05:40,  8.77s/it, gpt_loss=0.346, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3112/6434 [7:17:59<7:55:37,  8.59s/it, gpt_loss=0.346, loss_mean=0.326][A
+Train step of epoch 0:  48%|████▊     | 3112/6434 [7:18:09<7:55:37,  8.59s/it, gpt_loss=0.278, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3113/6434 [7:18:09<8:13:38,  8.92s/it, gpt_loss=0.278, loss_mean=0.322][A
+Train step of epoch 0:  48%|████▊     | 3113/6434 [7:18:19<8:13:38,  8.92s/it, gpt_loss=0.223, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3114/6434 [7:18:19<8:31:44,  9.25s/it, gpt_loss=0.223, loss_mean=0.312][A
+Train step of epoch 0:  48%|████▊     | 3114/6434 [7:18:26<8:31:44,  9.25s/it, gpt_loss=0.27, loss_mean=0.308] [A
+Train step of epoch 0:  48%|████▊     | 3115/6434 [7:18:26<8:07:30,  8.81s/it, gpt_loss=0.27, loss_mean=0.308][A
+Train step of epoch 0:  48%|████▊     | 3115/6434 [7:18:35<8:07:30,  8.81s/it, gpt_loss=0.316, loss_mean=0.308][A
+Train step of epoch 0:  48%|████▊     | 3116/6434 [7:18:35<7:59:19,  8.67s/it, gpt_loss=0.316, loss_mean=0.308][A
+Train step of epoch 0:  48%|████▊     | 3116/6434 [7:18:44<7:59:19,  8.67s/it, gpt_loss=0.291, loss_mean=0.307][A
+Train step of epoch 0:  48%|████▊     | 3117/6434 [7:18:44<8:00:00,  8.68s/it, gpt_loss=0.291, loss_mean=0.307][A
+Train step of epoch 0:  48%|████▊     | 3117/6434 [7:18:52<8:00:00,  8.68s/it, gpt_loss=0.447, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3118/6434 [7:18:52<7:59:27,  8.68s/it, gpt_loss=0.447, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3118/6434 [7:19:01<7:59:27,  8.68s/it, gpt_loss=0.328, loss_mean=0.321][A
+Train step of epoch 0:  48%|████▊     | 3119/6434 [7:19:01<7:57:44,  8.65s/it, gpt_loss=0.328, loss_mean=0.321][A
+[LID Router Debug] Step: 3120
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [3, 0, 2, 9, 0, 2, 2, 0, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  48%|████▊     | 3119/6434 [7:19:09<7:57:44,  8.65s/it, gpt_loss=0.281, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3120/6434 [7:19:09<7:57:27,  8.64s/it, gpt_loss=0.281, loss_mean=0.317][A
+Train step of epoch 0:  48%|████▊     | 3120/6434 [7:19:17<7:57:27,  8.64s/it, gpt_loss=0.349, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▊     | 3121/6434 [7:19:17<7:47:31,  8.47s/it, gpt_loss=0.349, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▊     | 3121/6434 [7:19:27<7:47:31,  8.47s/it, gpt_loss=0.344, loss_mean=0.323][A
+Train step of epoch 0:  49%|████▊     | 3122/6434 [7:19:27<8:08:57,  8.86s/it, gpt_loss=0.344, loss_mean=0.323][A
+Train step of epoch 0:  49%|████▊     | 3122/6434 [7:19:35<8:08:57,  8.86s/it, gpt_loss=0.297, loss_mean=0.32] [A
+Train step of epoch 0:  49%|████▊     | 3123/6434 [7:19:35<7:45:56,  8.44s/it, gpt_loss=0.297, loss_mean=0.32][A
+Train step of epoch 0:  49%|████▊     | 3123/6434 [7:19:42<7:45:56,  8.44s/it, gpt_loss=0.296, loss_mean=0.318][A
+Train step of epoch 0:  49%|████▊     | 3124/6434 [7:19:42<7:30:01,  8.16s/it, gpt_loss=0.296, loss_mean=0.318][A
+Train step of epoch 0:  49%|████▊     | 3124/6434 [7:19:51<7:30:01,  8.16s/it, gpt_loss=0.358, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3125/6434 [7:19:51<7:33:23,  8.22s/it, gpt_loss=0.358, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3125/6434 [7:19:58<7:33:23,  8.22s/it, gpt_loss=0.323, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3126/6434 [7:19:58<7:28:08,  8.13s/it, gpt_loss=0.323, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3126/6434 [7:20:07<7:28:08,  8.13s/it, gpt_loss=0.263, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▊     | 3127/6434 [7:20:07<7:33:22,  8.23s/it, gpt_loss=0.263, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▊     | 3127/6434 [7:20:15<7:33:22,  8.23s/it, gpt_loss=0.352, loss_mean=0.32] [A
+Train step of epoch 0:  49%|████▊     | 3128/6434 [7:20:15<7:25:06,  8.08s/it, gpt_loss=0.352, loss_mean=0.32][A
+Train step of epoch 0:  49%|████▊     | 3128/6434 [7:20:23<7:25:06,  8.08s/it, gpt_loss=0.292, loss_mean=0.317][A
+Train step of epoch 0:  49%|████▊     | 3129/6434 [7:20:23<7:29:28,  8.16s/it, gpt_loss=0.292, loss_mean=0.317][A
+[LID Router Debug] Step: 3130
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [0, 9, 0, 1, 9, 9, 9, 5, 3, 0]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+Train step of epoch 0:  49%|████▊     | 3129/6434 [7:20:31<7:29:28,  8.16s/it, gpt_loss=0.356, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▊     | 3130/6434 [7:20:31<7:32:58,  8.23s/it, gpt_loss=0.356, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▊     | 3130/6434 [7:20:40<7:32:58,  8.23s/it, gpt_loss=0.336, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3131/6434 [7:20:40<7:38:58,  8.34s/it, gpt_loss=0.336, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▊     | 3131/6434 [7:20:48<7:38:58,  8.34s/it, gpt_loss=0.417, loss_mean=0.332][A
+Train step of epoch 0:  49%|████▊     | 3132/6434 [7:20:48<7:34:07,  8.25s/it, gpt_loss=0.417, loss_mean=0.332][A
+Train step of epoch 0:  49%|████▊     | 3132/6434 [7:20:57<7:34:07,  8.25s/it, gpt_loss=0.407, loss_mean=0.339][A
+Train step of epoch 0:  49%|████▊     | 3133/6434 [7:20:57<7:45:15,  8.46s/it, gpt_loss=0.407, loss_mean=0.339][A
+Train step of epoch 0:  49%|████▊     | 3133/6434 [7:21:05<7:45:15,  8.46s/it, gpt_loss=0.29, loss_mean=0.334] [A
+Train step of epoch 0:  49%|████▊     | 3134/6434 [7:21:05<7:42:16,  8.40s/it, gpt_loss=0.29, loss_mean=0.334][A
+Train step of epoch 0:  49%|████▊     | 3134/6434 [7:21:14<7:42:16,  8.40s/it, gpt_loss=0.318, loss_mean=0.333][A
+Train step of epoch 0:  49%|████▊     | 3135/6434 [7:21:14<7:51:44,  8.58s/it, gpt_loss=0.318, loss_mean=0.333][A
+Train step of epoch 0:  49%|████▊     | 3135/6434 [7:21:22<7:51:44,  8.58s/it, gpt_loss=0.275, loss_mean=0.327][A
+Train step of epoch 0:  49%|████▊     | 3136/6434 [7:21:22<7:39:50,  8.37s/it, gpt_loss=0.275, loss_mean=0.327][A
+Train step of epoch 0:  49%|████▊     | 3136/6434 [7:21:31<7:39:50,  8.37s/it, gpt_loss=0.306, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3137/6434 [7:21:31<7:48:22,  8.52s/it, gpt_loss=0.306, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3137/6434 [7:21:39<7:48:22,  8.52s/it, gpt_loss=0.254, loss_mean=0.318][A
+Train step of epoch 0:  49%|████▉     | 3138/6434 [7:21:39<7:33:39,  8.26s/it, gpt_loss=0.254, loss_mean=0.318][A
+Train step of epoch 0:  49%|████▉     | 3138/6434 [7:21:46<7:33:39,  8.26s/it, gpt_loss=0.291, loss_mean=0.315][A
+Train step of epoch 0:  49%|████▉     | 3139/6434 [7:21:46<7:25:29,  8.11s/it, gpt_loss=0.291, loss_mean=0.315][A
+[LID Router Debug] Step: 3140
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [5, 9, 1, 3, 2, 1, 3, 9, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  49%|████▉     | 3139/6434 [7:21:54<7:25:29,  8.11s/it, gpt_loss=0.295, loss_mean=0.313][A
+Train step of epoch 0:  49%|████▉     | 3140/6434 [7:21:54<7:23:05,  8.07s/it, gpt_loss=0.295, loss_mean=0.313][A
+Train step of epoch 0:  49%|████▉     | 3140/6434 [7:22:03<7:23:05,  8.07s/it, gpt_loss=0.338, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▉     | 3141/6434 [7:22:03<7:39:02,  8.36s/it, gpt_loss=0.338, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▉     | 3141/6434 [7:22:11<7:39:02,  8.36s/it, gpt_loss=0.408, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3142/6434 [7:22:11<7:31:22,  8.23s/it, gpt_loss=0.408, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3142/6434 [7:22:20<7:31:22,  8.23s/it, gpt_loss=0.413, loss_mean=0.334][A
+Train step of epoch 0:  49%|████▉     | 3143/6434 [7:22:20<7:39:18,  8.37s/it, gpt_loss=0.413, loss_mean=0.334][A
+Train step of epoch 0:  49%|████▉     | 3143/6434 [7:22:29<7:39:18,  8.37s/it, gpt_loss=0.281, loss_mean=0.328][A
+Train step of epoch 0:  49%|████▉     | 3144/6434 [7:22:29<7:43:13,  8.45s/it, gpt_loss=0.281, loss_mean=0.328][A
+Train step of epoch 0:  49%|████▉     | 3144/6434 [7:22:36<7:43:13,  8.45s/it, gpt_loss=0.296, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3145/6434 [7:22:36<7:24:35,  8.11s/it, gpt_loss=0.296, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3145/6434 [7:22:46<7:24:35,  8.11s/it, gpt_loss=0.392, loss_mean=0.332][A
+Train step of epoch 0:  49%|████▉     | 3146/6434 [7:22:46<7:53:14,  8.64s/it, gpt_loss=0.392, loss_mean=0.332][A
+Train step of epoch 0:  49%|████▉     | 3146/6434 [7:22:55<7:53:14,  8.64s/it, gpt_loss=0.359, loss_mean=0.335][A
+Train step of epoch 0:  49%|████▉     | 3147/6434 [7:22:55<7:59:07,  8.75s/it, gpt_loss=0.359, loss_mean=0.335][A
+Train step of epoch 0:  49%|████▉     | 3147/6434 [7:23:03<7:59:07,  8.75s/it, gpt_loss=0.385, loss_mean=0.34] [A
+Train step of epoch 0:  49%|████▉     | 3148/6434 [7:23:03<7:53:56,  8.65s/it, gpt_loss=0.385, loss_mean=0.34][A
+Train step of epoch 0:  49%|████▉     | 3148/6434 [7:23:11<7:53:56,  8.65s/it, gpt_loss=0.344, loss_mean=0.34][A
+Train step of epoch 0:  49%|████▉     | 3149/6434 [7:23:11<7:35:37,  8.32s/it, gpt_loss=0.344, loss_mean=0.34][A
+[LID Router Debug] Step: 3150
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 2, 5, 3, 5, 1, 0, 0, 1, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  49%|████▉     | 3149/6434 [7:23:19<7:35:37,  8.32s/it, gpt_loss=0.31, loss_mean=0.337][A
+Train step of epoch 0:  49%|████▉     | 3150/6434 [7:23:19<7:32:03,  8.26s/it, gpt_loss=0.31, loss_mean=0.337][A
+Train step of epoch 0:  49%|████▉     | 3150/6434 [7:23:27<7:32:03,  8.26s/it, gpt_loss=0.306, loss_mean=0.334][A
+Train step of epoch 0:  49%|████▉     | 3151/6434 [7:23:27<7:31:02,  8.24s/it, gpt_loss=0.306, loss_mean=0.334][A
+Train step of epoch 0:  49%|████▉     | 3151/6434 [7:23:37<7:31:02,  8.24s/it, gpt_loss=0.29, loss_mean=0.33]  [A
+Train step of epoch 0:  49%|████▉     | 3152/6434 [7:23:37<7:56:22,  8.71s/it, gpt_loss=0.29, loss_mean=0.33][A
+Train step of epoch 0:  49%|████▉     | 3152/6434 [7:23:45<7:56:22,  8.71s/it, gpt_loss=0.34, loss_mean=0.331][A
+Train step of epoch 0:  49%|████▉     | 3153/6434 [7:23:45<7:53:05,  8.65s/it, gpt_loss=0.34, loss_mean=0.331][A
+Train step of epoch 0:  49%|████▉     | 3153/6434 [7:23:53<7:53:05,  8.65s/it, gpt_loss=0.321, loss_mean=0.33][A
+Train step of epoch 0:  49%|████▉     | 3154/6434 [7:23:53<7:32:19,  8.27s/it, gpt_loss=0.321, loss_mean=0.33][A
+Train step of epoch 0:  49%|████▉     | 3154/6434 [7:24:02<7:32:19,  8.27s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  49%|████▉     | 3155/6434 [7:24:02<7:50:37,  8.61s/it, gpt_loss=0.343, loss_mean=0.331][A
+Train step of epoch 0:  49%|████▉     | 3155/6434 [7:24:10<7:50:37,  8.61s/it, gpt_loss=0.283, loss_mean=0.326][A
+Train step of epoch 0:  49%|████▉     | 3156/6434 [7:24:10<7:30:15,  8.24s/it, gpt_loss=0.283, loss_mean=0.326][A
+Train step of epoch 0:  49%|████▉     | 3156/6434 [7:24:18<7:30:15,  8.24s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  49%|████▉     | 3157/6434 [7:24:18<7:31:44,  8.27s/it, gpt_loss=0.333, loss_mean=0.327][A
+Train step of epoch 0:  49%|████▉     | 3157/6434 [7:24:26<7:31:44,  8.27s/it, gpt_loss=0.281, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▉     | 3158/6434 [7:24:26<7:34:08,  8.32s/it, gpt_loss=0.281, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▉     | 3158/6434 [7:24:35<7:34:08,  8.32s/it, gpt_loss=0.345, loss_mean=0.325][A
+Train step of epoch 0:  49%|████▉     | 3159/6434 [7:24:35<7:30:05,  8.25s/it, gpt_loss=0.345, loss_mean=0.325][A
+[LID Router Debug] Step: 3160
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [4, 3, 2, 5, 3, 0, 4, 1, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  49%|████▉     | 3159/6434 [7:24:45<7:30:05,  8.25s/it, gpt_loss=0.302, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▉     | 3160/6434 [7:24:45<8:04:40,  8.88s/it, gpt_loss=0.302, loss_mean=0.322][A
+Train step of epoch 0:  49%|████▉     | 3160/6434 [7:24:53<8:04:40,  8.88s/it, gpt_loss=0.262, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▉     | 3161/6434 [7:24:53<7:51:58,  8.65s/it, gpt_loss=0.262, loss_mean=0.316][A
+Train step of epoch 0:  49%|████▉     | 3161/6434 [7:25:02<7:51:58,  8.65s/it, gpt_loss=0.33, loss_mean=0.318] [A
+Train step of epoch 0:  49%|████▉     | 3162/6434 [7:25:02<7:54:45,  8.71s/it, gpt_loss=0.33, loss_mean=0.318][A
+Train step of epoch 0:  49%|████▉     | 3162/6434 [7:25:10<7:54:45,  8.71s/it, gpt_loss=0.35, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▉     | 3163/6434 [7:25:10<7:39:03,  8.42s/it, gpt_loss=0.35, loss_mean=0.321][A
+Train step of epoch 0:  49%|████▉     | 3163/6434 [7:25:18<7:39:03,  8.42s/it, gpt_loss=0.194, loss_mean=0.308][A
+Train step of epoch 0:  49%|████▉     | 3164/6434 [7:25:18<7:38:03,  8.40s/it, gpt_loss=0.194, loss_mean=0.308][A
+Train step of epoch 0:  49%|████▉     | 3164/6434 [7:25:26<7:38:03,  8.40s/it, gpt_loss=0.349, loss_mean=0.312][A
+Train step of epoch 0:  49%|████▉     | 3165/6434 [7:25:26<7:30:43,  8.27s/it, gpt_loss=0.349, loss_mean=0.312][A
+Train step of epoch 0:  49%|████▉     | 3165/6434 [7:25:34<7:30:43,  8.27s/it, gpt_loss=0.328, loss_mean=0.314][A
+Train step of epoch 0:  49%|████▉     | 3166/6434 [7:25:34<7:24:24,  8.16s/it, gpt_loss=0.328, loss_mean=0.314][A
+Train step of epoch 0:  49%|████▉     | 3166/6434 [7:25:42<7:24:24,  8.16s/it, gpt_loss=0.302, loss_mean=0.313][A
+Train step of epoch 0:  49%|████▉     | 3167/6434 [7:25:42<7:29:29,  8.26s/it, gpt_loss=0.302, loss_mean=0.313][A
+Train step of epoch 0:  49%|████▉     | 3167/6434 [7:25:51<7:29:29,  8.26s/it, gpt_loss=0.341, loss_mean=0.315][A
+Train step of epoch 0:  49%|████▉     | 3168/6434 [7:25:51<7:33:15,  8.33s/it, gpt_loss=0.341, loss_mean=0.315][A
+Train step of epoch 0:  49%|████▉     | 3168/6434 [7:25:58<7:33:15,  8.33s/it, gpt_loss=0.296, loss_mean=0.314][A
+Train step of epoch 0:  49%|████▉     | 3169/6434 [7:25:58<7:22:37,  8.13s/it, gpt_loss=0.296, loss_mean=0.314][A
+[LID Router Debug] Step: 3170
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [5, 6, 2, 5, 1, 0, 5, 5, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  49%|████▉     | 3169/6434 [7:26:07<7:22:37,  8.13s/it, gpt_loss=0.292, loss_mean=0.311][A
+Train step of epoch 0:  49%|████▉     | 3170/6434 [7:26:07<7:34:17,  8.35s/it, gpt_loss=0.292, loss_mean=0.311][A
+Train step of epoch 0:  49%|████▉     | 3170/6434 [7:26:16<7:34:17,  8.35s/it, gpt_loss=0.26, loss_mean=0.306] [A
+Train step of epoch 0:  49%|████▉     | 3171/6434 [7:26:16<7:35:04,  8.37s/it, gpt_loss=0.26, loss_mean=0.306][A
+Train step of epoch 0:  49%|████▉     | 3171/6434 [7:26:25<7:35:04,  8.37s/it, gpt_loss=0.265, loss_mean=0.302][A
+Train step of epoch 0:  49%|████▉     | 3172/6434 [7:26:25<7:51:40,  8.68s/it, gpt_loss=0.265, loss_mean=0.302][A
+Train step of epoch 0:  49%|████▉     | 3172/6434 [7:26:33<7:51:40,  8.68s/it, gpt_loss=0.271, loss_mean=0.299][A
+Train step of epoch 0:  49%|████▉     | 3173/6434 [7:26:33<7:38:18,  8.43s/it, gpt_loss=0.271, loss_mean=0.299][A
+Train step of epoch 0:  49%|████▉     | 3173/6434 [7:26:42<7:38:18,  8.43s/it, gpt_loss=0.289, loss_mean=0.298][A
+Train step of epoch 0:  49%|████▉     | 3174/6434 [7:26:42<7:44:12,  8.54s/it, gpt_loss=0.289, loss_mean=0.298][A
+Train step of epoch 0:  49%|████▉     | 3174/6434 [7:26:50<7:44:12,  8.54s/it, gpt_loss=0.274, loss_mean=0.296][A
+Train step of epoch 0:  49%|████▉     | 3175/6434 [7:26:50<7:43:40,  8.54s/it, gpt_loss=0.274, loss_mean=0.296][A
+Train step of epoch 0:  49%|████▉     | 3175/6434 [7:26:59<7:43:40,  8.54s/it, gpt_loss=0.351, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3176/6434 [7:26:59<7:42:03,  8.51s/it, gpt_loss=0.351, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3176/6434 [7:27:06<7:42:03,  8.51s/it, gpt_loss=0.304, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3177/6434 [7:27:06<7:14:45,  8.01s/it, gpt_loss=0.304, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3177/6434 [7:27:14<7:14:45,  8.01s/it, gpt_loss=0.296, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3178/6434 [7:27:14<7:19:00,  8.09s/it, gpt_loss=0.296, loss_mean=0.301][A
+Train step of epoch 0:  49%|████▉     | 3178/6434 [7:27:22<7:19:00,  8.09s/it, gpt_loss=0.3, loss_mean=0.301]  [A
+Train step of epoch 0:  49%|████▉     | 3179/6434 [7:27:22<7:23:24,  8.17s/it, gpt_loss=0.3, loss_mean=0.301][A
+[LID Router Debug] Step: 3180
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [0, 3, 0, 7, 4, 0, 4, 0, 2, 0]
+Active Experts in Batch: {0, 2, 3, 4, 7}
+
+Train step of epoch 0:  49%|████▉     | 3179/6434 [7:27:30<7:23:24,  8.17s/it, gpt_loss=0.349, loss_mean=0.306][A
+Train step of epoch 0:  49%|████▉     | 3180/6434 [7:27:30<7:22:53,  8.17s/it, gpt_loss=0.349, loss_mean=0.306][A
+Train step of epoch 0:  49%|████▉     | 3180/6434 [7:27:38<7:22:53,  8.17s/it, gpt_loss=0.314, loss_mean=0.306][A
+Train step of epoch 0:  49%|████▉     | 3181/6434 [7:27:38<7:07:46,  7.89s/it, gpt_loss=0.314, loss_mean=0.306][A
+Train step of epoch 0:  49%|████▉     | 3181/6434 [7:27:47<7:07:46,  7.89s/it, gpt_loss=0.357, loss_mean=0.311][A
+Train step of epoch 0:  49%|████▉     | 3182/6434 [7:27:47<7:29:47,  8.30s/it, gpt_loss=0.357, loss_mean=0.311][A
+Train step of epoch 0:  49%|████▉     | 3182/6434 [7:27:55<7:29:47,  8.30s/it, gpt_loss=0.28, loss_mean=0.308] [A
+Train step of epoch 0:  49%|████▉     | 3183/6434 [7:27:55<7:24:47,  8.21s/it, gpt_loss=0.28, loss_mean=0.308][A
+Train step of epoch 0:  49%|████▉     | 3183/6434 [7:28:03<7:24:47,  8.21s/it, gpt_loss=0.341, loss_mean=0.312][A
+Train step of epoch 0:  49%|████▉     | 3184/6434 [7:28:03<7:21:10,  8.14s/it, gpt_loss=0.341, loss_mean=0.312][A
+Train step of epoch 0:  49%|████▉     | 3184/6434 [7:28:12<7:21:10,  8.14s/it, gpt_loss=0.341, loss_mean=0.315][A
+Train step of epoch 0:  50%|████▉     | 3185/6434 [7:28:12<7:30:35,  8.32s/it, gpt_loss=0.341, loss_mean=0.315][A
+Train step of epoch 0:  50%|████▉     | 3185/6434 [7:28:20<7:30:35,  8.32s/it, gpt_loss=0.226, loss_mean=0.306][A
+Train step of epoch 0:  50%|████▉     | 3186/6434 [7:28:20<7:34:49,  8.40s/it, gpt_loss=0.226, loss_mean=0.306][A
+Train step of epoch 0:  50%|████▉     | 3186/6434 [7:28:28<7:34:49,  8.40s/it, gpt_loss=0.39, loss_mean=0.314] [A
+Train step of epoch 0:  50%|████▉     | 3187/6434 [7:28:28<7:22:07,  8.17s/it, gpt_loss=0.39, loss_mean=0.314][A
+Train step of epoch 0:  50%|████▉     | 3187/6434 [7:28:36<7:22:07,  8.17s/it, gpt_loss=0.299, loss_mean=0.313][A
+Train step of epoch 0:  50%|████▉     | 3188/6434 [7:28:36<7:18:09,  8.10s/it, gpt_loss=0.299, loss_mean=0.313][A
+Train step of epoch 0:  50%|████▉     | 3188/6434 [7:28:44<7:18:09,  8.10s/it, gpt_loss=0.344, loss_mean=0.316][A
+Train step of epoch 0:  50%|████▉     | 3189/6434 [7:28:44<7:26:43,  8.26s/it, gpt_loss=0.344, loss_mean=0.316][A
+[LID Router Debug] Step: 3190
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [5, 3, 5, 5, 0, 4, 4, 1, 1, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+
+Train step of epoch 0:  50%|████▉     | 3189/6434 [7:28:52<7:26:43,  8.26s/it, gpt_loss=0.342, loss_mean=0.318][A
+Train step of epoch 0:  50%|████▉     | 3190/6434 [7:28:52<7:19:09,  8.12s/it, gpt_loss=0.342, loss_mean=0.318][A
+Train step of epoch 0:  50%|████▉     | 3190/6434 [7:29:00<7:19:09,  8.12s/it, gpt_loss=0.37, loss_mean=0.323] [A
+Train step of epoch 0:  50%|████▉     | 3191/6434 [7:29:00<7:21:18,  8.16s/it, gpt_loss=0.37, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3191/6434 [7:29:10<7:21:18,  8.16s/it, gpt_loss=0.359, loss_mean=0.327][A
+Train step of epoch 0:  50%|████▉     | 3192/6434 [7:29:10<7:47:34,  8.65s/it, gpt_loss=0.359, loss_mean=0.327][A
+Train step of epoch 0:  50%|████▉     | 3192/6434 [7:29:18<7:47:34,  8.65s/it, gpt_loss=0.373, loss_mean=0.332][A
+Train step of epoch 0:  50%|████▉     | 3193/6434 [7:29:18<7:27:02,  8.28s/it, gpt_loss=0.373, loss_mean=0.332][A
+Train step of epoch 0:  50%|████▉     | 3193/6434 [7:29:25<7:27:02,  8.28s/it, gpt_loss=0.389, loss_mean=0.337][A
+Train step of epoch 0:  50%|████▉     | 3194/6434 [7:29:25<7:10:58,  7.98s/it, gpt_loss=0.389, loss_mean=0.337][A
+Train step of epoch 0:  50%|████▉     | 3194/6434 [7:29:33<7:10:58,  7.98s/it, gpt_loss=0.3, loss_mean=0.334]  [A
+Train step of epoch 0:  50%|████▉     | 3195/6434 [7:29:33<7:11:02,  7.98s/it, gpt_loss=0.3, loss_mean=0.334][A
+Train step of epoch 0:  50%|████▉     | 3195/6434 [7:29:41<7:11:02,  7.98s/it, gpt_loss=0.301, loss_mean=0.33][A
+Train step of epoch 0:  50%|████▉     | 3196/6434 [7:29:41<7:14:52,  8.06s/it, gpt_loss=0.301, loss_mean=0.33][A
+Train step of epoch 0:  50%|████▉     | 3196/6434 [7:29:49<7:14:52,  8.06s/it, gpt_loss=0.344, loss_mean=0.332][A
+Train step of epoch 0:  50%|████▉     | 3197/6434 [7:29:49<7:06:02,  7.90s/it, gpt_loss=0.344, loss_mean=0.332][A
+Train step of epoch 0:  50%|████▉     | 3197/6434 [7:29:57<7:06:02,  7.90s/it, gpt_loss=0.235, loss_mean=0.322][A
+Train step of epoch 0:  50%|████▉     | 3198/6434 [7:29:57<7:20:15,  8.16s/it, gpt_loss=0.235, loss_mean=0.322][A
+Train step of epoch 0:  50%|████▉     | 3198/6434 [7:30:06<7:20:15,  8.16s/it, gpt_loss=0.307, loss_mean=0.32] [A
+Train step of epoch 0:  50%|████▉     | 3199/6434 [7:30:06<7:25:09,  8.26s/it, gpt_loss=0.307, loss_mean=0.32][A
+[LID Router Debug] Step: 3200
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [6, 9, 4, 5, 1, 4, 9, 4, 5, 0]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+[2026-02-06 23:26:18,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=0, lr=[1.8803371854752884e-05, 1.8803371854752884e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 23:26:18,022] [INFO] [timer.py:260:stop] epoch=0/micro_step=3200/global_step=1600, RunningAvgSamplesPerSec=4.749177581100083, CurrSamplesPerSec=4.986431416259362, MemAllocated=12.47GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  50%|████▉     | 3199/6434 [7:30:14<7:25:09,  8.26s/it, gpt_loss=0.327, loss_mean=0.321][A
+Train step of epoch 0:  50%|████▉     | 3200/6434 [7:30:14<7:14:21,  8.06s/it, gpt_loss=0.327, loss_mean=0.321][A
+Train step of epoch 0:  50%|████▉     | 3200/6434 [7:30:22<7:14:21,  8.06s/it, gpt_loss=0.307, loss_mean=0.32] [A
+Train step of epoch 0:  50%|████▉     | 3201/6434 [7:30:22<7:16:56,  8.11s/it, gpt_loss=0.307, loss_mean=0.32][A
+Train step of epoch 0:  50%|████▉     | 3201/6434 [7:30:29<7:16:56,  8.11s/it, gpt_loss=0.349, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3202/6434 [7:30:29<7:04:17,  7.88s/it, gpt_loss=0.349, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3202/6434 [7:30:38<7:04:17,  7.88s/it, gpt_loss=0.446, loss_mean=0.335][A
+Train step of epoch 0:  50%|████▉     | 3203/6434 [7:30:38<7:17:06,  8.12s/it, gpt_loss=0.446, loss_mean=0.335][A
+Train step of epoch 0:  50%|████▉     | 3203/6434 [7:30:46<7:17:06,  8.12s/it, gpt_loss=0.285, loss_mean=0.33] [A
+Train step of epoch 0:  50%|████▉     | 3204/6434 [7:30:46<7:12:29,  8.03s/it, gpt_loss=0.285, loss_mean=0.33][A
+Train step of epoch 0:  50%|████▉     | 3204/6434 [7:30:54<7:12:29,  8.03s/it, gpt_loss=0.309, loss_mean=0.328][A
+Train step of epoch 0:  50%|████▉     | 3205/6434 [7:30:54<7:21:22,  8.20s/it, gpt_loss=0.309, loss_mean=0.328][A
+Train step of epoch 0:  50%|████▉     | 3205/6434 [7:31:03<7:21:22,  8.20s/it, gpt_loss=0.323, loss_mean=0.327][A
+Train step of epoch 0:  50%|████▉     | 3206/6434 [7:31:03<7:23:36,  8.25s/it, gpt_loss=0.323, loss_mean=0.327][A
+Train step of epoch 0:  50%|████▉     | 3206/6434 [7:31:11<7:23:36,  8.25s/it, gpt_loss=0.25, loss_mean=0.32]  [A
+Train step of epoch 0:  50%|████▉     | 3207/6434 [7:31:11<7:22:14,  8.22s/it, gpt_loss=0.25, loss_mean=0.32][A
+Train step of epoch 0:  50%|████▉     | 3207/6434 [7:31:20<7:22:14,  8.22s/it, gpt_loss=0.326, loss_mean=0.32][A
+Train step of epoch 0:  50%|████▉     | 3208/6434 [7:31:20<7:33:45,  8.44s/it, gpt_loss=0.326, loss_mean=0.32][A
+Train step of epoch 0:  50%|████▉     | 3208/6434 [7:31:28<7:33:45,  8.44s/it, gpt_loss=0.344, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3209/6434 [7:31:28<7:32:45,  8.42s/it, gpt_loss=0.344, loss_mean=0.323][A
+[LID Router Debug] Step: 3210
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [4, 1, 0, 9, 0, 2, 1, 2, 4, 2]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+
+Train step of epoch 0:  50%|████▉     | 3209/6434 [7:31:36<7:32:45,  8.42s/it, gpt_loss=0.321, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3210/6434 [7:31:36<7:28:30,  8.35s/it, gpt_loss=0.321, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3210/6434 [7:31:44<7:28:30,  8.35s/it, gpt_loss=0.322, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3211/6434 [7:31:44<7:18:56,  8.17s/it, gpt_loss=0.322, loss_mean=0.323][A
+Train step of epoch 0:  50%|████▉     | 3211/6434 [7:31:53<7:18:56,  8.17s/it, gpt_loss=0.303, loss_mean=0.321][A
+Train step of epoch 0:  50%|████▉     | 3212/6434 [7:31:53<7:34:41,  8.47s/it, gpt_loss=0.303, loss_mean=0.321][A
+Train step of epoch 0:  50%|████▉     | 3212/6434 [7:32:01<7:34:41,  8.47s/it, gpt_loss=0.213, loss_mean=0.31] [A
+Train step of epoch 0:  50%|████▉     | 3213/6434 [7:32:01<7:17:15,  8.15s/it, gpt_loss=0.213, loss_mean=0.31][A
+Train step of epoch 0:  50%|████▉     | 3213/6434 [7:32:09<7:17:15,  8.15s/it, gpt_loss=0.304, loss_mean=0.309][A
+Train step of epoch 0:  50%|████▉     | 3214/6434 [7:32:09<7:25:19,  8.30s/it, gpt_loss=0.304, loss_mean=0.309][A
+Train step of epoch 0:  50%|████▉     | 3214/6434 [7:32:18<7:25:19,  8.30s/it, gpt_loss=0.24, loss_mean=0.302] [A
+Train step of epoch 0:  50%|████▉     | 3215/6434 [7:32:18<7:33:36,  8.45s/it, gpt_loss=0.24, loss_mean=0.302][A
+Train step of epoch 0:  50%|████▉     | 3215/6434 [7:32:25<7:33:36,  8.45s/it, gpt_loss=0.366, loss_mean=0.309][A
+Train step of epoch 0:  50%|████▉     | 3216/6434 [7:32:25<7:13:05,  8.08s/it, gpt_loss=0.366, loss_mean=0.309][A
+Train step of epoch 0:  50%|████▉     | 3216/6434 [7:32:33<7:13:05,  8.08s/it, gpt_loss=0.278, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3217/6434 [7:32:33<7:16:00,  8.13s/it, gpt_loss=0.278, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3217/6434 [7:32:42<7:16:00,  8.13s/it, gpt_loss=0.259, loss_mean=0.301][A
+Train step of epoch 0:  50%|█████     | 3218/6434 [7:32:42<7:18:06,  8.17s/it, gpt_loss=0.259, loss_mean=0.301][A
+Train step of epoch 0:  50%|█████     | 3218/6434 [7:32:50<7:18:06,  8.17s/it, gpt_loss=0.281, loss_mean=0.299][A
+Train step of epoch 0:  50%|█████     | 3219/6434 [7:32:50<7:19:33,  8.20s/it, gpt_loss=0.281, loss_mean=0.299][A
+[LID Router Debug] Step: 3220
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [1, 1, 0, 9, 0, 9, 5, 2, 9, 2]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+Train step of epoch 0:  50%|█████     | 3219/6434 [7:32:59<7:19:33,  8.20s/it, gpt_loss=0.323, loss_mean=0.301][A
+Train step of epoch 0:  50%|█████     | 3220/6434 [7:32:59<7:35:17,  8.50s/it, gpt_loss=0.323, loss_mean=0.301][A
+Train step of epoch 0:  50%|█████     | 3220/6434 [7:33:08<7:35:17,  8.50s/it, gpt_loss=0.238, loss_mean=0.295][A
+Train step of epoch 0:  50%|█████     | 3221/6434 [7:33:08<7:47:18,  8.73s/it, gpt_loss=0.238, loss_mean=0.295][A
+Train step of epoch 0:  50%|█████     | 3221/6434 [7:33:16<7:47:18,  8.73s/it, gpt_loss=0.375, loss_mean=0.303][A
+Train step of epoch 0:  50%|█████     | 3222/6434 [7:33:16<7:28:09,  8.37s/it, gpt_loss=0.375, loss_mean=0.303][A
+Train step of epoch 0:  50%|█████     | 3222/6434 [7:33:24<7:28:09,  8.37s/it, gpt_loss=0.327, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3223/6434 [7:33:24<7:27:58,  8.37s/it, gpt_loss=0.327, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3223/6434 [7:33:32<7:27:58,  8.37s/it, gpt_loss=0.346, loss_mean=0.31] [A
+Train step of epoch 0:  50%|█████     | 3224/6434 [7:33:32<7:18:58,  8.21s/it, gpt_loss=0.346, loss_mean=0.31][A
+Train step of epoch 0:  50%|█████     | 3224/6434 [7:33:40<7:18:58,  8.21s/it, gpt_loss=0.27, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3225/6434 [7:33:40<7:10:25,  8.05s/it, gpt_loss=0.27, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3225/6434 [7:33:49<7:10:25,  8.05s/it, gpt_loss=0.302, loss_mean=0.305][A
+Train step of epoch 0:  50%|█████     | 3226/6434 [7:33:49<7:22:12,  8.27s/it, gpt_loss=0.302, loss_mean=0.305][A
+Train step of epoch 0:  50%|█████     | 3226/6434 [7:33:58<7:22:12,  8.27s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  50%|█████     | 3227/6434 [7:33:58<7:38:08,  8.57s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  50%|█████     | 3227/6434 [7:34:06<7:38:08,  8.57s/it, gpt_loss=0.399, loss_mean=0.316][A
+Train step of epoch 0:  50%|█████     | 3228/6434 [7:34:06<7:22:04,  8.27s/it, gpt_loss=0.399, loss_mean=0.316][A
+Train step of epoch 0:  50%|█████     | 3228/6434 [7:34:14<7:22:04,  8.27s/it, gpt_loss=0.353, loss_mean=0.32] [A
+Train step of epoch 0:  50%|█████     | 3229/6434 [7:34:14<7:28:15,  8.39s/it, gpt_loss=0.353, loss_mean=0.32][A
+[LID Router Debug] Step: 3230
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 4, 0, 4, 5, 4, 4, 0, 2, 2]
+Active Experts in Batch: {0, 2, 4, 5}
+
+Train step of epoch 0:  50%|█████     | 3229/6434 [7:34:24<7:28:15,  8.39s/it, gpt_loss=0.238, loss_mean=0.312][A
+Train step of epoch 0:  50%|█████     | 3230/6434 [7:34:24<7:43:08,  8.67s/it, gpt_loss=0.238, loss_mean=0.312][A
+Train step of epoch 0:  50%|█████     | 3230/6434 [7:34:31<7:43:08,  8.67s/it, gpt_loss=0.354, loss_mean=0.316][A
+Train step of epoch 0:  50%|█████     | 3231/6434 [7:34:31<7:24:54,  8.33s/it, gpt_loss=0.354, loss_mean=0.316][A
+Train step of epoch 0:  50%|█████     | 3231/6434 [7:34:40<7:24:54,  8.33s/it, gpt_loss=0.245, loss_mean=0.309][A
+Train step of epoch 0:  50%|█████     | 3232/6434 [7:34:40<7:40:08,  8.62s/it, gpt_loss=0.245, loss_mean=0.309][A
+Train step of epoch 0:  50%|█████     | 3232/6434 [7:34:49<7:40:08,  8.62s/it, gpt_loss=0.285, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3233/6434 [7:34:49<7:46:06,  8.74s/it, gpt_loss=0.285, loss_mean=0.306][A
+Train step of epoch 0:  50%|█████     | 3233/6434 [7:34:59<7:46:06,  8.74s/it, gpt_loss=0.26, loss_mean=0.302] [A
+Train step of epoch 0:  50%|█████     | 3234/6434 [7:34:59<7:54:54,  8.90s/it, gpt_loss=0.26, loss_mean=0.302][A
+Train step of epoch 0:  50%|█████     | 3234/6434 [7:35:07<7:54:54,  8.90s/it, gpt_loss=0.303, loss_mean=0.302][A
+Train step of epoch 0:  50%|█████     | 3235/6434 [7:35:07<7:40:28,  8.64s/it, gpt_loss=0.303, loss_mean=0.302][A
+Train step of epoch 0:  50%|█████     | 3235/6434 [7:35:15<7:40:28,  8.64s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  50%|█████     | 3236/6434 [7:35:15<7:41:03,  8.65s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  50%|█████     | 3236/6434 [7:35:24<7:41:03,  8.65s/it, gpt_loss=0.366, loss_mean=0.305][A
+Train step of epoch 0:  50%|█████     | 3237/6434 [7:35:24<7:43:42,  8.70s/it, gpt_loss=0.366, loss_mean=0.305][A
+Train step of epoch 0:  50%|█████     | 3237/6434 [7:35:33<7:43:42,  8.70s/it, gpt_loss=0.386, loss_mean=0.313][A
+Train step of epoch 0:  50%|█████     | 3238/6434 [7:35:33<7:46:12,  8.75s/it, gpt_loss=0.386, loss_mean=0.313][A
+Train step of epoch 0:  50%|█████     | 3238/6434 [7:35:42<7:46:12,  8.75s/it, gpt_loss=0.29, loss_mean=0.311] [A
+Train step of epoch 0:  50%|█████     | 3239/6434 [7:35:42<7:56:54,  8.96s/it, gpt_loss=0.29, loss_mean=0.311][A
+[LID Router Debug] Step: 3240
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [3, 4, 5, 4, 1, 9, 9, 0, 0, 6]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  50%|█████     | 3239/6434 [7:35:51<7:56:54,  8.96s/it, gpt_loss=0.333, loss_mean=0.313][A
+Train step of epoch 0:  50%|█████     | 3240/6434 [7:35:51<7:45:08,  8.74s/it, gpt_loss=0.333, loss_mean=0.313][A
+Train step of epoch 0:  50%|█████     | 3240/6434 [7:35:59<7:45:08,  8.74s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  50%|█████     | 3241/6434 [7:35:59<7:37:01,  8.59s/it, gpt_loss=0.293, loss_mean=0.311][A
+Train step of epoch 0:  50%|█████     | 3241/6434 [7:36:08<7:37:01,  8.59s/it, gpt_loss=0.351, loss_mean=0.315][A
+Train step of epoch 0:  50%|█████     | 3242/6434 [7:36:08<7:47:32,  8.79s/it, gpt_loss=0.351, loss_mean=0.315][A
+Train step of epoch 0:  50%|█████     | 3242/6434 [7:36:16<7:47:32,  8.79s/it, gpt_loss=0.29, loss_mean=0.313] [A
+Train step of epoch 0:  50%|█████     | 3243/6434 [7:36:16<7:35:15,  8.56s/it, gpt_loss=0.29, loss_mean=0.313][A
+Train step of epoch 0:  50%|█████     | 3243/6434 [7:36:27<7:35:15,  8.56s/it, gpt_loss=0.311, loss_mean=0.312][A
+Train step of epoch 0:  50%|█████     | 3244/6434 [7:36:27<8:03:01,  9.09s/it, gpt_loss=0.311, loss_mean=0.312][A
+Train step of epoch 0:  50%|█████     | 3244/6434 [7:36:35<8:03:01,  9.09s/it, gpt_loss=0.28, loss_mean=0.309] [A
+Train step of epoch 0:  50%|█████     | 3245/6434 [7:36:35<7:59:20,  9.02s/it, gpt_loss=0.28, loss_mean=0.309][A
+Train step of epoch 0:  50%|█████     | 3245/6434 [7:36:44<7:59:20,  9.02s/it, gpt_loss=0.318, loss_mean=0.31][A
+Train step of epoch 0:  50%|█████     | 3246/6434 [7:36:44<7:52:59,  8.90s/it, gpt_loss=0.318, loss_mean=0.31][A
+Train step of epoch 0:  50%|█████     | 3246/6434 [7:36:52<7:52:59,  8.90s/it, gpt_loss=0.297, loss_mean=0.309][A
+Train step of epoch 0:  50%|█████     | 3247/6434 [7:36:52<7:45:35,  8.77s/it, gpt_loss=0.297, loss_mean=0.309][A
+Train step of epoch 0:  50%|█████     | 3247/6434 [7:37:01<7:45:35,  8.77s/it, gpt_loss=0.331, loss_mean=0.311][A
+Train step of epoch 0:  50%|█████     | 3248/6434 [7:37:01<7:36:57,  8.61s/it, gpt_loss=0.331, loss_mean=0.311][A
+Train step of epoch 0:  50%|█████     | 3248/6434 [7:37:10<7:36:57,  8.61s/it, gpt_loss=0.278, loss_mean=0.308][A
+Train step of epoch 0:  50%|█████     | 3249/6434 [7:37:10<7:41:58,  8.70s/it, gpt_loss=0.278, loss_mean=0.308][A
+[LID Router Debug] Step: 3250
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 1, 5, 9, 4, 9, 9, 3, 5, 9]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  50%|█████     | 3249/6434 [7:37:18<7:41:58,  8.70s/it, gpt_loss=0.362, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3250/6434 [7:37:18<7:35:26,  8.58s/it, gpt_loss=0.362, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3250/6434 [7:37:27<7:35:26,  8.58s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3251/6434 [7:37:27<7:41:16,  8.69s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3251/6434 [7:37:35<7:41:16,  8.69s/it, gpt_loss=0.267, loss_mean=0.308][A
+Train step of epoch 0:  51%|█████     | 3252/6434 [7:37:35<7:34:32,  8.57s/it, gpt_loss=0.267, loss_mean=0.308][A
+Train step of epoch 0:  51%|█████     | 3252/6434 [7:37:44<7:34:32,  8.57s/it, gpt_loss=0.219, loss_mean=0.299][A
+Train step of epoch 0:  51%|█████     | 3253/6434 [7:37:44<7:43:30,  8.74s/it, gpt_loss=0.219, loss_mean=0.299][A
+Train step of epoch 0:  51%|█████     | 3253/6434 [7:37:53<7:43:30,  8.74s/it, gpt_loss=0.33, loss_mean=0.302] [A
+Train step of epoch 0:  51%|█████     | 3254/6434 [7:37:53<7:38:57,  8.66s/it, gpt_loss=0.33, loss_mean=0.302][A
+Train step of epoch 0:  51%|█████     | 3254/6434 [7:38:01<7:38:57,  8.66s/it, gpt_loss=0.233, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3255/6434 [7:38:01<7:27:47,  8.45s/it, gpt_loss=0.233, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3255/6434 [7:38:09<7:27:47,  8.45s/it, gpt_loss=0.266, loss_mean=0.292][A
+Train step of epoch 0:  51%|█████     | 3256/6434 [7:38:09<7:25:05,  8.40s/it, gpt_loss=0.266, loss_mean=0.292][A
+Train step of epoch 0:  51%|█████     | 3256/6434 [7:38:17<7:25:05,  8.40s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  51%|█████     | 3257/6434 [7:38:17<7:24:29,  8.39s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  51%|█████     | 3257/6434 [7:38:25<7:24:29,  8.39s/it, gpt_loss=0.252, loss_mean=0.29] [A
+Train step of epoch 0:  51%|█████     | 3258/6434 [7:38:25<7:16:33,  8.25s/it, gpt_loss=0.252, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3258/6434 [7:38:34<7:16:33,  8.25s/it, gpt_loss=0.298, loss_mean=0.291][A
+Train step of epoch 0:  51%|█████     | 3259/6434 [7:38:34<7:22:19,  8.36s/it, gpt_loss=0.298, loss_mean=0.291][A
+[LID Router Debug] Step: 3260
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [0, 2, 4, 2, 4, 9, 5, 6, 5, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  51%|█████     | 3259/6434 [7:38:42<7:22:19,  8.36s/it, gpt_loss=0.319, loss_mean=0.294][A
+Train step of epoch 0:  51%|█████     | 3260/6434 [7:38:42<7:17:43,  8.27s/it, gpt_loss=0.319, loss_mean=0.294][A
+Train step of epoch 0:  51%|█████     | 3260/6434 [7:38:51<7:17:43,  8.27s/it, gpt_loss=0.242, loss_mean=0.288][A
+Train step of epoch 0:  51%|█████     | 3261/6434 [7:38:51<7:27:26,  8.46s/it, gpt_loss=0.242, loss_mean=0.288][A
+Train step of epoch 0:  51%|█████     | 3261/6434 [7:39:01<7:27:26,  8.46s/it, gpt_loss=0.332, loss_mean=0.293][A
+Train step of epoch 0:  51%|█████     | 3262/6434 [7:39:01<7:53:55,  8.96s/it, gpt_loss=0.332, loss_mean=0.293][A
+Train step of epoch 0:  51%|█████     | 3262/6434 [7:39:10<7:53:55,  8.96s/it, gpt_loss=0.314, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3263/6434 [7:39:10<7:51:50,  8.93s/it, gpt_loss=0.314, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3263/6434 [7:39:17<7:51:50,  8.93s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  51%|█████     | 3264/6434 [7:39:17<7:24:34,  8.41s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  51%|█████     | 3264/6434 [7:39:25<7:24:34,  8.41s/it, gpt_loss=0.236, loss_mean=0.29] [A
+Train step of epoch 0:  51%|█████     | 3265/6434 [7:39:25<7:20:20,  8.34s/it, gpt_loss=0.236, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3265/6434 [7:39:33<7:20:20,  8.34s/it, gpt_loss=0.245, loss_mean=0.286][A
+Train step of epoch 0:  51%|█████     | 3266/6434 [7:39:33<7:05:48,  8.06s/it, gpt_loss=0.245, loss_mean=0.286][A
+Train step of epoch 0:  51%|█████     | 3266/6434 [7:39:42<7:05:48,  8.06s/it, gpt_loss=0.267, loss_mean=0.284][A
+Train step of epoch 0:  51%|█████     | 3267/6434 [7:39:42<7:20:50,  8.35s/it, gpt_loss=0.267, loss_mean=0.284][A
+Train step of epoch 0:  51%|█████     | 3267/6434 [7:39:51<7:20:50,  8.35s/it, gpt_loss=0.284, loss_mean=0.284][A
+Train step of epoch 0:  51%|█████     | 3268/6434 [7:39:51<7:30:00,  8.53s/it, gpt_loss=0.284, loss_mean=0.284][A
+Train step of epoch 0:  51%|█████     | 3268/6434 [7:39:59<7:30:00,  8.53s/it, gpt_loss=0.338, loss_mean=0.289][A
+Train step of epoch 0:  51%|█████     | 3269/6434 [7:39:59<7:21:05,  8.36s/it, gpt_loss=0.338, loss_mean=0.289][A
+[LID Router Debug] Step: 3270
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [5, 5, 0, 1, 5, 3, 2, 4, 6, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  51%|█████     | 3269/6434 [7:40:09<7:21:05,  8.36s/it, gpt_loss=0.467, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████     | 3270/6434 [7:40:09<7:55:59,  9.03s/it, gpt_loss=0.467, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████     | 3270/6434 [7:40:17<7:55:59,  9.03s/it, gpt_loss=0.353, loss_mean=0.312][A
+Train step of epoch 0:  51%|█████     | 3271/6434 [7:40:17<7:39:14,  8.71s/it, gpt_loss=0.353, loss_mean=0.312][A
+Train step of epoch 0:  51%|█████     | 3271/6434 [7:40:25<7:39:14,  8.71s/it, gpt_loss=0.358, loss_mean=0.316][A
+Train step of epoch 0:  51%|█████     | 3272/6434 [7:40:25<7:22:53,  8.40s/it, gpt_loss=0.358, loss_mean=0.316][A
+Train step of epoch 0:  51%|█████     | 3272/6434 [7:40:33<7:22:53,  8.40s/it, gpt_loss=0.286, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3273/6434 [7:40:33<7:19:31,  8.34s/it, gpt_loss=0.286, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3273/6434 [7:40:41<7:19:31,  8.34s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3274/6434 [7:40:41<7:09:13,  8.15s/it, gpt_loss=0.309, loss_mean=0.313][A
+Train step of epoch 0:  51%|█████     | 3274/6434 [7:40:50<7:09:13,  8.15s/it, gpt_loss=0.278, loss_mean=0.309][A
+Train step of epoch 0:  51%|█████     | 3275/6434 [7:40:50<7:32:00,  8.59s/it, gpt_loss=0.278, loss_mean=0.309][A
+Train step of epoch 0:  51%|█████     | 3275/6434 [7:40:59<7:32:00,  8.59s/it, gpt_loss=0.29, loss_mean=0.307] [A
+Train step of epoch 0:  51%|█████     | 3276/6434 [7:40:59<7:30:00,  8.55s/it, gpt_loss=0.29, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████     | 3276/6434 [7:41:07<7:30:00,  8.55s/it, gpt_loss=0.233, loss_mean=0.3] [A
+Train step of epoch 0:  51%|█████     | 3277/6434 [7:41:07<7:30:38,  8.56s/it, gpt_loss=0.233, loss_mean=0.3][A
+Train step of epoch 0:  51%|█████     | 3277/6434 [7:41:15<7:30:38,  8.56s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  51%|█████     | 3278/6434 [7:41:15<7:09:55,  8.17s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  51%|█████     | 3278/6434 [7:41:24<7:09:55,  8.17s/it, gpt_loss=0.27, loss_mean=0.297][A
+Train step of epoch 0:  51%|█████     | 3279/6434 [7:41:24<7:22:32,  8.42s/it, gpt_loss=0.27, loss_mean=0.297][A
+[LID Router Debug] Step: 3280
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [0, 3, 0, 6, 4, 4, 5, 4, 6, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  51%|█████     | 3279/6434 [7:41:32<7:22:32,  8.42s/it, gpt_loss=0.363, loss_mean=0.303][A
+Train step of epoch 0:  51%|█████     | 3280/6434 [7:41:32<7:14:55,  8.27s/it, gpt_loss=0.363, loss_mean=0.303][A
+Train step of epoch 0:  51%|█████     | 3280/6434 [7:41:40<7:14:55,  8.27s/it, gpt_loss=0.22, loss_mean=0.295] [A
+Train step of epoch 0:  51%|█████     | 3281/6434 [7:41:40<7:20:52,  8.39s/it, gpt_loss=0.22, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3281/6434 [7:41:49<7:20:52,  8.39s/it, gpt_loss=0.29, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3282/6434 [7:41:49<7:27:39,  8.52s/it, gpt_loss=0.29, loss_mean=0.295][A
+Train step of epoch 0:  51%|█████     | 3282/6434 [7:41:58<7:27:39,  8.52s/it, gpt_loss=0.32, loss_mean=0.297][A
+Train step of epoch 0:  51%|█████     | 3283/6434 [7:41:58<7:29:53,  8.57s/it, gpt_loss=0.32, loss_mean=0.297][A
+Train step of epoch 0:  51%|█████     | 3283/6434 [7:42:06<7:29:53,  8.57s/it, gpt_loss=0.222, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3284/6434 [7:42:06<7:31:06,  8.59s/it, gpt_loss=0.222, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3284/6434 [7:42:14<7:31:06,  8.59s/it, gpt_loss=0.297, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3285/6434 [7:42:14<7:19:02,  8.37s/it, gpt_loss=0.297, loss_mean=0.29][A
+Train step of epoch 0:  51%|█████     | 3285/6434 [7:42:22<7:19:02,  8.37s/it, gpt_loss=0.315, loss_mean=0.293][A
+Train step of epoch 0:  51%|█████     | 3286/6434 [7:42:22<7:10:11,  8.20s/it, gpt_loss=0.315, loss_mean=0.293][A
+Train step of epoch 0:  51%|█████     | 3286/6434 [7:42:30<7:10:11,  8.20s/it, gpt_loss=0.247, loss_mean=0.288][A
+Train step of epoch 0:  51%|█████     | 3287/6434 [7:42:30<7:10:13,  8.20s/it, gpt_loss=0.247, loss_mean=0.288][A
+Train step of epoch 0:  51%|█████     | 3287/6434 [7:42:39<7:10:13,  8.20s/it, gpt_loss=0.26, loss_mean=0.285] [A
+Train step of epoch 0:  51%|█████     | 3288/6434 [7:42:39<7:16:12,  8.32s/it, gpt_loss=0.26, loss_mean=0.285][A
+Train step of epoch 0:  51%|█████     | 3288/6434 [7:42:48<7:16:12,  8.32s/it, gpt_loss=0.253, loss_mean=0.282][A
+Train step of epoch 0:  51%|█████     | 3289/6434 [7:42:48<7:33:35,  8.65s/it, gpt_loss=0.253, loss_mean=0.282][A
+[LID Router Debug] Step: 3290
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [0, 9, 3, 6, 4, 4, 2, 3, 3, 5]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  51%|█████     | 3289/6434 [7:42:57<7:33:35,  8.65s/it, gpt_loss=0.263, loss_mean=0.28] [A
+Train step of epoch 0:  51%|█████     | 3290/6434 [7:42:57<7:32:45,  8.64s/it, gpt_loss=0.263, loss_mean=0.28][A
+Train step of epoch 0:  51%|█████     | 3290/6434 [7:43:06<7:32:45,  8.64s/it, gpt_loss=0.25, loss_mean=0.277][A
+Train step of epoch 0:  51%|█████     | 3291/6434 [7:43:06<7:38:53,  8.76s/it, gpt_loss=0.25, loss_mean=0.277][A
+Train step of epoch 0:  51%|█████     | 3291/6434 [7:43:14<7:38:53,  8.76s/it, gpt_loss=0.353, loss_mean=0.285][A
+Train step of epoch 0:  51%|█████     | 3292/6434 [7:43:14<7:20:59,  8.42s/it, gpt_loss=0.353, loss_mean=0.285][A
+Train step of epoch 0:  51%|█████     | 3292/6434 [7:43:23<7:20:59,  8.42s/it, gpt_loss=0.257, loss_mean=0.282][A
+Train step of epoch 0:  51%|█████     | 3293/6434 [7:43:23<7:33:14,  8.66s/it, gpt_loss=0.257, loss_mean=0.282][A
+Train step of epoch 0:  51%|█████     | 3293/6434 [7:43:31<7:33:14,  8.66s/it, gpt_loss=0.429, loss_mean=0.297][A
+Train step of epoch 0:  51%|█████     | 3294/6434 [7:43:31<7:27:15,  8.55s/it, gpt_loss=0.429, loss_mean=0.297][A
+Train step of epoch 0:  51%|█████     | 3294/6434 [7:43:39<7:27:15,  8.55s/it, gpt_loss=0.293, loss_mean=0.296][A
+Train step of epoch 0:  51%|█████     | 3295/6434 [7:43:39<7:19:44,  8.41s/it, gpt_loss=0.293, loss_mean=0.296][A
+Train step of epoch 0:  51%|█████     | 3295/6434 [7:43:48<7:19:44,  8.41s/it, gpt_loss=0.244, loss_mean=0.291][A
+Train step of epoch 0:  51%|█████     | 3296/6434 [7:43:48<7:19:00,  8.39s/it, gpt_loss=0.244, loss_mean=0.291][A
+Train step of epoch 0:  51%|█████     | 3296/6434 [7:43:55<7:19:00,  8.39s/it, gpt_loss=0.295, loss_mean=0.291][A
+Train step of epoch 0:  51%|█████     | 3297/6434 [7:43:55<7:05:35,  8.14s/it, gpt_loss=0.295, loss_mean=0.291][A
+Train step of epoch 0:  51%|█████     | 3297/6434 [7:44:04<7:05:35,  8.14s/it, gpt_loss=0.422, loss_mean=0.304][A
+Train step of epoch 0:  51%|█████▏    | 3298/6434 [7:44:04<7:15:14,  8.33s/it, gpt_loss=0.422, loss_mean=0.304][A
+Train step of epoch 0:  51%|█████▏    | 3298/6434 [7:44:12<7:15:14,  8.33s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  51%|█████▏    | 3299/6434 [7:44:12<7:15:13,  8.33s/it, gpt_loss=0.278, loss_mean=0.302][A
+[LID Router Debug] Step: 3300
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [0, 4, 5, 6, 6, 9, 9, 6, 2, 4]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  51%|█████▏    | 3299/6434 [7:44:20<7:15:13,  8.33s/it, gpt_loss=0.298, loss_mean=0.301][A
+Train step of epoch 0:  51%|█████▏    | 3300/6434 [7:44:20<7:13:17,  8.30s/it, gpt_loss=0.298, loss_mean=0.301][A
+Train step of epoch 0:  51%|█████▏    | 3300/6434 [7:44:30<7:13:17,  8.30s/it, gpt_loss=0.357, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████▏    | 3301/6434 [7:44:30<7:34:46,  8.71s/it, gpt_loss=0.357, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████▏    | 3301/6434 [7:44:38<7:34:46,  8.71s/it, gpt_loss=0.338, loss_mean=0.31] [A
+Train step of epoch 0:  51%|█████▏    | 3302/6434 [7:44:38<7:29:31,  8.61s/it, gpt_loss=0.338, loss_mean=0.31][A
+Train step of epoch 0:  51%|█████▏    | 3302/6434 [7:44:47<7:29:31,  8.61s/it, gpt_loss=0.26, loss_mean=0.305][A
+Train step of epoch 0:  51%|█████▏    | 3303/6434 [7:44:47<7:34:07,  8.70s/it, gpt_loss=0.26, loss_mean=0.305][A
+Train step of epoch 0:  51%|█████▏    | 3303/6434 [7:44:55<7:34:07,  8.70s/it, gpt_loss=0.289, loss_mean=0.303][A
+Train step of epoch 0:  51%|█████▏    | 3304/6434 [7:44:55<7:19:25,  8.42s/it, gpt_loss=0.289, loss_mean=0.303][A
+Train step of epoch 0:  51%|█████▏    | 3304/6434 [7:45:04<7:19:25,  8.42s/it, gpt_loss=0.331, loss_mean=0.306][A
+Train step of epoch 0:  51%|█████▏    | 3305/6434 [7:45:04<7:22:00,  8.48s/it, gpt_loss=0.331, loss_mean=0.306][A
+Train step of epoch 0:  51%|█████▏    | 3305/6434 [7:45:12<7:22:00,  8.48s/it, gpt_loss=0.358, loss_mean=0.311][A
+Train step of epoch 0:  51%|█████▏    | 3306/6434 [7:45:12<7:12:35,  8.30s/it, gpt_loss=0.358, loss_mean=0.311][A
+Train step of epoch 0:  51%|█████▏    | 3306/6434 [7:45:20<7:12:35,  8.30s/it, gpt_loss=0.279, loss_mean=0.308][A
+Train step of epoch 0:  51%|█████▏    | 3307/6434 [7:45:20<7:18:23,  8.41s/it, gpt_loss=0.279, loss_mean=0.308][A
+Train step of epoch 0:  51%|█████▏    | 3307/6434 [7:45:30<7:18:23,  8.41s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████▏    | 3308/6434 [7:45:30<7:41:41,  8.86s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  51%|█████▏    | 3308/6434 [7:45:38<7:41:41,  8.86s/it, gpt_loss=0.408, loss_mean=0.317][A
+Train step of epoch 0:  51%|█████▏    | 3309/6434 [7:45:38<7:18:47,  8.42s/it, gpt_loss=0.408, loss_mean=0.317][A
+[LID Router Debug] Step: 3310
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [3, 2, 5, 2, 2, 9, 2, 2, 1, 5]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  51%|█████▏    | 3309/6434 [7:45:46<7:18:47,  8.42s/it, gpt_loss=0.301, loss_mean=0.316][A
+Train step of epoch 0:  51%|█████▏    | 3310/6434 [7:45:46<7:12:53,  8.31s/it, gpt_loss=0.301, loss_mean=0.316][A
+Train step of epoch 0:  51%|█████▏    | 3310/6434 [7:45:55<7:12:53,  8.31s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  51%|█████▏    | 3311/6434 [7:45:55<7:23:13,  8.52s/it, gpt_loss=0.312, loss_mean=0.315][A
+Train step of epoch 0:  51%|█████▏    | 3311/6434 [7:46:03<7:23:13,  8.52s/it, gpt_loss=0.298, loss_mean=0.314][A
+Train step of epoch 0:  51%|█████▏    | 3312/6434 [7:46:03<7:23:16,  8.52s/it, gpt_loss=0.298, loss_mean=0.314][A
+Train step of epoch 0:  51%|█████▏    | 3312/6434 [7:46:11<7:23:16,  8.52s/it, gpt_loss=0.296, loss_mean=0.312][A
+Train step of epoch 0:  51%|█████▏    | 3313/6434 [7:46:11<7:18:25,  8.43s/it, gpt_loss=0.296, loss_mean=0.312][A
+Train step of epoch 0:  51%|█████▏    | 3313/6434 [7:46:20<7:18:25,  8.43s/it, gpt_loss=0.303, loss_mean=0.311][A
+Train step of epoch 0:  52%|█████▏    | 3314/6434 [7:46:20<7:14:08,  8.35s/it, gpt_loss=0.303, loss_mean=0.311][A
+Train step of epoch 0:  52%|█████▏    | 3314/6434 [7:46:28<7:14:08,  8.35s/it, gpt_loss=0.403, loss_mean=0.32] [A
+Train step of epoch 0:  52%|█████▏    | 3315/6434 [7:46:28<7:14:40,  8.36s/it, gpt_loss=0.403, loss_mean=0.32][A
+Train step of epoch 0:  52%|█████▏    | 3315/6434 [7:46:36<7:14:40,  8.36s/it, gpt_loss=0.29, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3316/6434 [7:46:36<7:06:11,  8.20s/it, gpt_loss=0.29, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3316/6434 [7:46:43<7:06:11,  8.20s/it, gpt_loss=0.393, loss_mean=0.325][A
+Train step of epoch 0:  52%|█████▏    | 3317/6434 [7:46:43<6:54:00,  7.97s/it, gpt_loss=0.393, loss_mean=0.325][A
+Train step of epoch 0:  52%|█████▏    | 3317/6434 [7:46:51<6:54:00,  7.97s/it, gpt_loss=0.215, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3318/6434 [7:46:51<6:49:50,  7.89s/it, gpt_loss=0.215, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3318/6434 [7:47:00<6:49:50,  7.89s/it, gpt_loss=0.332, loss_mean=0.316][A
+Train step of epoch 0:  52%|█████▏    | 3319/6434 [7:47:00<7:01:22,  8.12s/it, gpt_loss=0.332, loss_mean=0.316][A
+[LID Router Debug] Step: 3320
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [3, 2, 5, 1, 2, 5, 5, 2, 2, 5]
+Active Experts in Batch: {1, 2, 3, 5}
+
+Train step of epoch 0:  52%|█████▏    | 3319/6434 [7:47:08<7:01:22,  8.12s/it, gpt_loss=0.314, loss_mean=0.315][A
+Train step of epoch 0:  52%|█████▏    | 3320/6434 [7:47:08<7:07:44,  8.24s/it, gpt_loss=0.314, loss_mean=0.315][A
+Train step of epoch 0:  52%|█████▏    | 3320/6434 [7:47:16<7:07:44,  8.24s/it, gpt_loss=0.364, loss_mean=0.32] [A
+Train step of epoch 0:  52%|█████▏    | 3321/6434 [7:47:16<7:04:14,  8.18s/it, gpt_loss=0.364, loss_mean=0.32][A
+Train step of epoch 0:  52%|█████▏    | 3321/6434 [7:47:26<7:04:14,  8.18s/it, gpt_loss=0.353, loss_mean=0.324][A
+Train step of epoch 0:  52%|█████▏    | 3322/6434 [7:47:26<7:28:50,  8.65s/it, gpt_loss=0.353, loss_mean=0.324][A
+Train step of epoch 0:  52%|█████▏    | 3322/6434 [7:47:35<7:28:50,  8.65s/it, gpt_loss=0.313, loss_mean=0.323][A
+Train step of epoch 0:  52%|█████▏    | 3323/6434 [7:47:35<7:34:42,  8.77s/it, gpt_loss=0.313, loss_mean=0.323][A
+Train step of epoch 0:  52%|█████▏    | 3323/6434 [7:47:44<7:34:42,  8.77s/it, gpt_loss=0.301, loss_mean=0.32] [A
+Train step of epoch 0:  52%|█████▏    | 3324/6434 [7:47:44<7:33:18,  8.75s/it, gpt_loss=0.301, loss_mean=0.32][A
+Train step of epoch 0:  52%|█████▏    | 3324/6434 [7:47:52<7:33:18,  8.75s/it, gpt_loss=0.301, loss_mean=0.318][A
+Train step of epoch 0:  52%|█████▏    | 3325/6434 [7:47:52<7:32:50,  8.74s/it, gpt_loss=0.301, loss_mean=0.318][A
+Train step of epoch 0:  52%|█████▏    | 3325/6434 [7:48:00<7:32:50,  8.74s/it, gpt_loss=0.28, loss_mean=0.315] [A
+Train step of epoch 0:  52%|█████▏    | 3326/6434 [7:48:00<7:15:27,  8.41s/it, gpt_loss=0.28, loss_mean=0.315][A
+Train step of epoch 0:  52%|█████▏    | 3326/6434 [7:48:10<7:15:27,  8.41s/it, gpt_loss=0.258, loss_mean=0.309][A
+Train step of epoch 0:  52%|█████▏    | 3327/6434 [7:48:10<7:32:57,  8.75s/it, gpt_loss=0.258, loss_mean=0.309][A
+Train step of epoch 0:  52%|█████▏    | 3327/6434 [7:48:18<7:32:57,  8.75s/it, gpt_loss=0.219, loss_mean=0.3]  [A
+Train step of epoch 0:  52%|█████▏    | 3328/6434 [7:48:18<7:34:22,  8.78s/it, gpt_loss=0.219, loss_mean=0.3][A
+Train step of epoch 0:  52%|█████▏    | 3328/6434 [7:48:27<7:34:22,  8.78s/it, gpt_loss=0.401, loss_mean=0.31][A
+Train step of epoch 0:  52%|█████▏    | 3329/6434 [7:48:27<7:28:03,  8.66s/it, gpt_loss=0.401, loss_mean=0.31][A
+[LID Router Debug] Step: 3330
+Batch Size: 10
+Audio Batch Size: 153
+LID Assignments: [1, 9, 3, 2, 3, 3, 2, 0, 3, 9]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  52%|█████▏    | 3329/6434 [7:48:36<7:28:03,  8.66s/it, gpt_loss=0.252, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3330/6434 [7:48:36<7:43:34,  8.96s/it, gpt_loss=0.252, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3330/6434 [7:48:46<7:43:34,  8.96s/it, gpt_loss=0.296, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3331/6434 [7:48:46<7:46:07,  9.01s/it, gpt_loss=0.296, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3331/6434 [7:48:54<7:46:07,  9.01s/it, gpt_loss=0.263, loss_mean=0.299][A
+Train step of epoch 0:  52%|█████▏    | 3332/6434 [7:48:54<7:40:51,  8.91s/it, gpt_loss=0.263, loss_mean=0.299][A
+Train step of epoch 0:  52%|█████▏    | 3332/6434 [7:49:03<7:40:51,  8.91s/it, gpt_loss=0.293, loss_mean=0.299][A
+Train step of epoch 0:  52%|█████▏    | 3333/6434 [7:49:03<7:32:29,  8.76s/it, gpt_loss=0.293, loss_mean=0.299][A
+Train step of epoch 0:  52%|█████▏    | 3333/6434 [7:49:11<7:32:29,  8.76s/it, gpt_loss=0.257, loss_mean=0.295][A
+Train step of epoch 0:  52%|█████▏    | 3334/6434 [7:49:11<7:25:30,  8.62s/it, gpt_loss=0.257, loss_mean=0.295][A
+Train step of epoch 0:  52%|█████▏    | 3334/6434 [7:49:20<7:25:30,  8.62s/it, gpt_loss=0.279, loss_mean=0.293][A
+Train step of epoch 0:  52%|█████▏    | 3335/6434 [7:49:20<7:24:45,  8.61s/it, gpt_loss=0.279, loss_mean=0.293][A
+Train step of epoch 0:  52%|█████▏    | 3335/6434 [7:49:28<7:24:45,  8.61s/it, gpt_loss=0.322, loss_mean=0.296][A
+Train step of epoch 0:  52%|█████▏    | 3336/6434 [7:49:28<7:30:06,  8.72s/it, gpt_loss=0.322, loss_mean=0.296][A
+Train step of epoch 0:  52%|█████▏    | 3336/6434 [7:49:37<7:30:06,  8.72s/it, gpt_loss=0.332, loss_mean=0.3]  [A
+Train step of epoch 0:  52%|█████▏    | 3337/6434 [7:49:37<7:21:10,  8.55s/it, gpt_loss=0.332, loss_mean=0.3][A
+Train step of epoch 0:  52%|█████▏    | 3337/6434 [7:49:47<7:21:10,  8.55s/it, gpt_loss=0.333, loss_mean=0.303][A
+Train step of epoch 0:  52%|█████▏    | 3338/6434 [7:49:47<7:43:55,  8.99s/it, gpt_loss=0.333, loss_mean=0.303][A
+Train step of epoch 0:  52%|█████▏    | 3338/6434 [7:49:56<7:43:55,  8.99s/it, gpt_loss=0.254, loss_mean=0.298][A
+Train step of epoch 0:  52%|█████▏    | 3339/6434 [7:49:56<7:44:01,  9.00s/it, gpt_loss=0.254, loss_mean=0.298][A
+[LID Router Debug] Step: 3340
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [9, 1, 9, 2, 1, 6, 3, 9, 4, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  52%|█████▏    | 3339/6434 [7:50:04<7:44:01,  9.00s/it, gpt_loss=0.232, loss_mean=0.291][A
+Train step of epoch 0:  52%|█████▏    | 3340/6434 [7:50:04<7:36:41,  8.86s/it, gpt_loss=0.232, loss_mean=0.291][A
+Train step of epoch 0:  52%|█████▏    | 3340/6434 [7:50:13<7:36:41,  8.86s/it, gpt_loss=0.273, loss_mean=0.29] [A
+Train step of epoch 0:  52%|█████▏    | 3341/6434 [7:50:13<7:28:57,  8.71s/it, gpt_loss=0.273, loss_mean=0.29][A
+Train step of epoch 0:  52%|█████▏    | 3341/6434 [7:50:21<7:28:57,  8.71s/it, gpt_loss=0.258, loss_mean=0.286][A
+Train step of epoch 0:  52%|█████▏    | 3342/6434 [7:50:21<7:23:30,  8.61s/it, gpt_loss=0.258, loss_mean=0.286][A
+Train step of epoch 0:  52%|█████▏    | 3342/6434 [7:50:30<7:23:30,  8.61s/it, gpt_loss=0.335, loss_mean=0.291][A
+Train step of epoch 0:  52%|█████▏    | 3343/6434 [7:50:30<7:33:22,  8.80s/it, gpt_loss=0.335, loss_mean=0.291][A
+Train step of epoch 0:  52%|█████▏    | 3343/6434 [7:50:39<7:33:22,  8.80s/it, gpt_loss=0.223, loss_mean=0.284][A
+Train step of epoch 0:  52%|█████▏    | 3344/6434 [7:50:39<7:36:48,  8.87s/it, gpt_loss=0.223, loss_mean=0.284][A
+Train step of epoch 0:  52%|█████▏    | 3344/6434 [7:50:48<7:36:48,  8.87s/it, gpt_loss=0.252, loss_mean=0.281][A
+Train step of epoch 0:  52%|█████▏    | 3345/6434 [7:50:48<7:30:33,  8.75s/it, gpt_loss=0.252, loss_mean=0.281][A
+Train step of epoch 0:  52%|█████▏    | 3345/6434 [7:50:56<7:30:33,  8.75s/it, gpt_loss=0.269, loss_mean=0.28] [A
+Train step of epoch 0:  52%|█████▏    | 3346/6434 [7:50:56<7:20:49,  8.57s/it, gpt_loss=0.269, loss_mean=0.28][A
+Train step of epoch 0:  52%|█████▏    | 3346/6434 [7:51:04<7:20:49,  8.57s/it, gpt_loss=0.296, loss_mean=0.282][A
+Train step of epoch 0:  52%|█████▏    | 3347/6434 [7:51:04<7:20:37,  8.56s/it, gpt_loss=0.296, loss_mean=0.282][A
+Train step of epoch 0:  52%|█████▏    | 3347/6434 [7:51:13<7:20:37,  8.56s/it, gpt_loss=0.236, loss_mean=0.277][A
+Train step of epoch 0:  52%|█████▏    | 3348/6434 [7:51:13<7:15:58,  8.48s/it, gpt_loss=0.236, loss_mean=0.277][A
+Train step of epoch 0:  52%|█████▏    | 3348/6434 [7:51:21<7:15:58,  8.48s/it, gpt_loss=0.325, loss_mean=0.282][A
+Train step of epoch 0:  52%|█████▏    | 3349/6434 [7:51:21<7:17:08,  8.50s/it, gpt_loss=0.325, loss_mean=0.282][A
+[LID Router Debug] Step: 3350
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [1, 4, 1, 0, 2, 0, 9, 0, 9, 1]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+
+Train step of epoch 0:  52%|█████▏    | 3349/6434 [7:51:30<7:17:08,  8.50s/it, gpt_loss=0.26, loss_mean=0.28]  [A
+Train step of epoch 0:  52%|█████▏    | 3350/6434 [7:51:30<7:15:43,  8.48s/it, gpt_loss=0.26, loss_mean=0.28][A
+Train step of epoch 0:  52%|█████▏    | 3350/6434 [7:51:37<7:15:43,  8.48s/it, gpt_loss=0.325, loss_mean=0.284][A
+Train step of epoch 0:  52%|█████▏    | 3351/6434 [7:51:37<6:55:14,  8.08s/it, gpt_loss=0.325, loss_mean=0.284][A
+Train step of epoch 0:  52%|█████▏    | 3351/6434 [7:51:45<6:55:14,  8.08s/it, gpt_loss=0.292, loss_mean=0.285][A
+Train step of epoch 0:  52%|█████▏    | 3352/6434 [7:51:45<6:51:26,  8.01s/it, gpt_loss=0.292, loss_mean=0.285][A
+Train step of epoch 0:  52%|█████▏    | 3352/6434 [7:51:54<6:51:26,  8.01s/it, gpt_loss=0.309, loss_mean=0.287][A
+Train step of epoch 0:  52%|█████▏    | 3353/6434 [7:51:54<7:09:08,  8.36s/it, gpt_loss=0.309, loss_mean=0.287][A
+Train step of epoch 0:  52%|█████▏    | 3353/6434 [7:52:02<7:09:08,  8.36s/it, gpt_loss=0.415, loss_mean=0.3]  [A
+Train step of epoch 0:  52%|█████▏    | 3354/6434 [7:52:02<7:09:12,  8.36s/it, gpt_loss=0.415, loss_mean=0.3][A
+Train step of epoch 0:  52%|█████▏    | 3354/6434 [7:52:10<7:09:12,  8.36s/it, gpt_loss=0.341, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3355/6434 [7:52:10<6:59:56,  8.18s/it, gpt_loss=0.341, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3355/6434 [7:52:18<6:59:56,  8.18s/it, gpt_loss=0.277, loss_mean=0.301][A
+Train step of epoch 0:  52%|█████▏    | 3356/6434 [7:52:18<6:55:55,  8.11s/it, gpt_loss=0.277, loss_mean=0.301][A
+Train step of epoch 0:  52%|█████▏    | 3356/6434 [7:52:26<6:55:55,  8.11s/it, gpt_loss=0.342, loss_mean=0.305][A
+Train step of epoch 0:  52%|█████▏    | 3357/6434 [7:52:26<6:49:09,  7.98s/it, gpt_loss=0.342, loss_mean=0.305][A
+Train step of epoch 0:  52%|█████▏    | 3357/6434 [7:52:35<6:49:09,  7.98s/it, gpt_loss=0.362, loss_mean=0.311][A
+Train step of epoch 0:  52%|█████▏    | 3358/6434 [7:52:35<7:16:27,  8.51s/it, gpt_loss=0.362, loss_mean=0.311][A
+Train step of epoch 0:  52%|█████▏    | 3358/6434 [7:52:43<7:16:27,  8.51s/it, gpt_loss=0.282, loss_mean=0.308][A
+Train step of epoch 0:  52%|█████▏    | 3359/6434 [7:52:43<7:10:01,  8.39s/it, gpt_loss=0.282, loss_mean=0.308][A
+[LID Router Debug] Step: 3360
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [1, 0, 5, 9, 9, 9, 6, 3, 0, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  52%|█████▏    | 3359/6434 [7:52:53<7:10:01,  8.39s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  52%|█████▏    | 3360/6434 [7:52:53<7:25:57,  8.70s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  52%|█████▏    | 3360/6434 [7:53:01<7:25:57,  8.70s/it, gpt_loss=0.267, loss_mean=0.303][A
+Train step of epoch 0:  52%|█████▏    | 3361/6434 [7:53:01<7:20:56,  8.61s/it, gpt_loss=0.267, loss_mean=0.303][A
+Train step of epoch 0:  52%|█████▏    | 3361/6434 [7:53:10<7:20:56,  8.61s/it, gpt_loss=0.325, loss_mean=0.305][A
+Train step of epoch 0:  52%|█████▏    | 3362/6434 [7:53:10<7:26:06,  8.71s/it, gpt_loss=0.325, loss_mean=0.305][A
+Train step of epoch 0:  52%|█████▏    | 3362/6434 [7:53:18<7:26:06,  8.71s/it, gpt_loss=0.237, loss_mean=0.298][A
+Train step of epoch 0:  52%|█████▏    | 3363/6434 [7:53:18<7:15:50,  8.52s/it, gpt_loss=0.237, loss_mean=0.298][A
+Train step of epoch 0:  52%|█████▏    | 3363/6434 [7:53:26<7:15:50,  8.52s/it, gpt_loss=0.3, loss_mean=0.299]  [A
+Train step of epoch 0:  52%|█████▏    | 3364/6434 [7:53:26<7:03:38,  8.28s/it, gpt_loss=0.3, loss_mean=0.299][A
+Train step of epoch 0:  52%|█████▏    | 3364/6434 [7:53:35<7:03:38,  8.28s/it, gpt_loss=0.263, loss_mean=0.295][A
+Train step of epoch 0:  52%|█████▏    | 3365/6434 [7:53:35<7:10:18,  8.41s/it, gpt_loss=0.263, loss_mean=0.295][A
+Train step of epoch 0:  52%|█████▏    | 3365/6434 [7:53:44<7:10:18,  8.41s/it, gpt_loss=0.325, loss_mean=0.298][A
+Train step of epoch 0:  52%|█████▏    | 3366/6434 [7:53:44<7:20:16,  8.61s/it, gpt_loss=0.325, loss_mean=0.298][A
+Train step of epoch 0:  52%|█████▏    | 3366/6434 [7:53:52<7:20:16,  8.61s/it, gpt_loss=0.37, loss_mean=0.305] [A
+Train step of epoch 0:  52%|█████▏    | 3367/6434 [7:53:52<7:13:27,  8.48s/it, gpt_loss=0.37, loss_mean=0.305][A
+Train step of epoch 0:  52%|█████▏    | 3367/6434 [7:54:01<7:13:27,  8.48s/it, gpt_loss=0.29, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3368/6434 [7:54:01<7:15:23,  8.52s/it, gpt_loss=0.29, loss_mean=0.304][A
+Train step of epoch 0:  52%|█████▏    | 3368/6434 [7:54:09<7:15:23,  8.52s/it, gpt_loss=0.387, loss_mean=0.312][A
+Train step of epoch 0:  52%|█████▏    | 3369/6434 [7:54:09<7:09:19,  8.40s/it, gpt_loss=0.387, loss_mean=0.312][A
+[LID Router Debug] Step: 3370
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [6, 9, 9, 9, 6, 1, 0, 1, 2, 2]
+Active Experts in Batch: {0, 1, 2, 6, 9}
+
+Train step of epoch 0:  52%|█████▏    | 3369/6434 [7:54:16<7:09:19,  8.40s/it, gpt_loss=0.349, loss_mean=0.316][A
+Train step of epoch 0:  52%|█████▏    | 3370/6434 [7:54:16<6:45:23,  7.94s/it, gpt_loss=0.349, loss_mean=0.316][A
+Train step of epoch 0:  52%|█████▏    | 3370/6434 [7:54:24<6:45:23,  7.94s/it, gpt_loss=0.332, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3371/6434 [7:54:24<6:45:57,  7.95s/it, gpt_loss=0.332, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3371/6434 [7:54:31<6:45:57,  7.95s/it, gpt_loss=0.316, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3372/6434 [7:54:31<6:42:49,  7.89s/it, gpt_loss=0.316, loss_mean=0.317][A
+Train step of epoch 0:  52%|█████▏    | 3372/6434 [7:54:40<6:42:49,  7.89s/it, gpt_loss=0.323, loss_mean=0.318][A
+Train step of epoch 0:  52%|█████▏    | 3373/6434 [7:54:40<6:47:47,  7.99s/it, gpt_loss=0.323, loss_mean=0.318][A
+Train step of epoch 0:  52%|█████▏    | 3373/6434 [7:54:48<6:47:47,  7.99s/it, gpt_loss=0.32, loss_mean=0.318] [A
+Train step of epoch 0:  52%|█████▏    | 3374/6434 [7:54:48<6:55:33,  8.15s/it, gpt_loss=0.32, loss_mean=0.318][A
+Train step of epoch 0:  52%|█████▏    | 3374/6434 [7:54:58<6:55:33,  8.15s/it, gpt_loss=0.273, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3375/6434 [7:54:58<7:21:11,  8.65s/it, gpt_loss=0.273, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3375/6434 [7:55:06<7:21:11,  8.65s/it, gpt_loss=0.329, loss_mean=0.315][A
+Train step of epoch 0:  52%|█████▏    | 3376/6434 [7:55:06<7:14:29,  8.52s/it, gpt_loss=0.329, loss_mean=0.315][A
+Train step of epoch 0:  52%|█████▏    | 3376/6434 [7:55:15<7:14:29,  8.52s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3377/6434 [7:55:15<7:24:46,  8.73s/it, gpt_loss=0.306, loss_mean=0.314][A
+Train step of epoch 0:  52%|█████▏    | 3377/6434 [7:55:24<7:24:46,  8.73s/it, gpt_loss=0.3, loss_mean=0.313]  [A
+Train step of epoch 0:  53%|█████▎    | 3378/6434 [7:55:24<7:28:53,  8.81s/it, gpt_loss=0.3, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3378/6434 [7:55:33<7:28:53,  8.81s/it, gpt_loss=0.241, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3379/6434 [7:55:33<7:27:52,  8.80s/it, gpt_loss=0.241, loss_mean=0.306][A
+[LID Router Debug] Step: 3380
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [3, 1, 0, 9, 0, 9, 2, 9, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  53%|█████▎    | 3379/6434 [7:55:42<7:27:52,  8.80s/it, gpt_loss=0.327, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3380/6434 [7:55:42<7:30:02,  8.84s/it, gpt_loss=0.327, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3380/6434 [7:55:51<7:30:02,  8.84s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3381/6434 [7:55:51<7:24:52,  8.74s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3381/6434 [7:55:59<7:24:52,  8.74s/it, gpt_loss=0.33, loss_mean=0.309] [A
+Train step of epoch 0:  53%|█████▎    | 3382/6434 [7:55:59<7:23:16,  8.71s/it, gpt_loss=0.33, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3382/6434 [7:56:06<7:23:16,  8.71s/it, gpt_loss=0.283, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3383/6434 [7:56:06<6:55:45,  8.18s/it, gpt_loss=0.283, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3383/6434 [7:56:16<6:55:45,  8.18s/it, gpt_loss=0.226, loss_mean=0.298][A
+Train step of epoch 0:  53%|█████▎    | 3384/6434 [7:56:16<7:14:46,  8.55s/it, gpt_loss=0.226, loss_mean=0.298][A
+Train step of epoch 0:  53%|█████▎    | 3384/6434 [7:56:24<7:14:46,  8.55s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0:  53%|█████▎    | 3385/6434 [7:56:24<7:19:17,  8.64s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0:  53%|█████▎    | 3385/6434 [7:56:32<7:19:17,  8.64s/it, gpt_loss=0.346, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3386/6434 [7:56:32<7:10:21,  8.47s/it, gpt_loss=0.346, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3386/6434 [7:56:40<7:10:21,  8.47s/it, gpt_loss=0.416, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3387/6434 [7:56:40<7:02:01,  8.31s/it, gpt_loss=0.416, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3387/6434 [7:56:48<7:02:01,  8.31s/it, gpt_loss=0.251, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3388/6434 [7:56:48<6:50:55,  8.09s/it, gpt_loss=0.251, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3388/6434 [7:56:56<6:50:55,  8.09s/it, gpt_loss=0.286, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3389/6434 [7:56:56<6:46:37,  8.01s/it, gpt_loss=0.286, loss_mean=0.303][A
+[LID Router Debug] Step: 3390
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [0, 2, 2, 1, 4, 1, 5, 2, 4, 6]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  53%|█████▎    | 3389/6434 [7:57:04<6:46:37,  8.01s/it, gpt_loss=0.296, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3390/6434 [7:57:04<6:44:17,  7.97s/it, gpt_loss=0.296, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3390/6434 [7:57:13<6:44:17,  7.97s/it, gpt_loss=0.273, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3391/6434 [7:57:13<7:04:31,  8.37s/it, gpt_loss=0.273, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3391/6434 [7:57:21<7:04:31,  8.37s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3392/6434 [7:57:21<7:04:13,  8.37s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3392/6434 [7:57:29<7:04:13,  8.37s/it, gpt_loss=0.275, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3393/6434 [7:57:29<7:00:33,  8.30s/it, gpt_loss=0.275, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3393/6434 [7:57:38<7:00:33,  8.30s/it, gpt_loss=0.329, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3394/6434 [7:57:38<6:56:57,  8.23s/it, gpt_loss=0.329, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3394/6434 [7:57:44<6:56:57,  8.23s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3395/6434 [7:57:44<6:37:07,  7.84s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  53%|█████▎    | 3395/6434 [7:57:53<6:37:07,  7.84s/it, gpt_loss=0.279, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3396/6434 [7:57:53<6:40:08,  7.90s/it, gpt_loss=0.279, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3396/6434 [7:58:02<6:40:08,  7.90s/it, gpt_loss=0.292, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3397/6434 [7:58:02<6:59:13,  8.28s/it, gpt_loss=0.292, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3397/6434 [7:58:10<6:59:13,  8.28s/it, gpt_loss=0.363, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3398/6434 [7:58:10<7:07:02,  8.44s/it, gpt_loss=0.363, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3398/6434 [7:58:18<7:07:02,  8.44s/it, gpt_loss=0.344, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3399/6434 [7:58:18<6:50:46,  8.12s/it, gpt_loss=0.344, loss_mean=0.312][A
+[LID Router Debug] Step: 3400
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [1, 8, 2, 5, 3, 5, 4, 0, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 8, 9}
+[2026-02-06 23:54:30,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=0, lr=[1.8642663477747764e-05, 1.8642663477747764e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-06 23:54:30,785] [INFO] [timer.py:260:stop] epoch=0/micro_step=3400/global_step=1700, RunningAvgSamplesPerSec=4.748307569883452, CurrSamplesPerSec=5.065422270138025, MemAllocated=12.77GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  53%|█████▎    | 3399/6434 [7:58:26<6:50:46,  8.12s/it, gpt_loss=0.316, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3400/6434 [7:58:26<6:55:34,  8.22s/it, gpt_loss=0.316, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3400/6434 [7:58:35<6:55:34,  8.22s/it, gpt_loss=0.359, loss_mean=0.317][A
+Train step of epoch 0:  53%|█████▎    | 3401/6434 [7:58:35<7:08:28,  8.48s/it, gpt_loss=0.359, loss_mean=0.317][A
+Train step of epoch 0:  53%|█████▎    | 3401/6434 [7:58:44<7:08:28,  8.48s/it, gpt_loss=0.302, loss_mean=0.315][A
+Train step of epoch 0:  53%|█████▎    | 3402/6434 [7:58:44<7:05:50,  8.43s/it, gpt_loss=0.302, loss_mean=0.315][A
+Train step of epoch 0:  53%|█████▎    | 3402/6434 [7:58:52<7:05:50,  8.43s/it, gpt_loss=0.28, loss_mean=0.312] [A
+Train step of epoch 0:  53%|█████▎    | 3403/6434 [7:58:52<7:06:27,  8.44s/it, gpt_loss=0.28, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3403/6434 [7:59:01<7:06:27,  8.44s/it, gpt_loss=0.27, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3404/6434 [7:59:01<7:04:58,  8.42s/it, gpt_loss=0.27, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3404/6434 [7:59:09<7:04:58,  8.42s/it, gpt_loss=0.361, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3405/6434 [7:59:09<7:10:31,  8.53s/it, gpt_loss=0.361, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3405/6434 [7:59:17<7:10:31,  8.53s/it, gpt_loss=0.28, loss_mean=0.31]  [A
+Train step of epoch 0:  53%|█████▎    | 3406/6434 [7:59:17<7:02:30,  8.37s/it, gpt_loss=0.28, loss_mean=0.31][A
+Train step of epoch 0:  53%|█████▎    | 3406/6434 [7:59:26<7:02:30,  8.37s/it, gpt_loss=0.324, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3407/6434 [7:59:26<7:01:01,  8.35s/it, gpt_loss=0.324, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3407/6434 [7:59:34<7:01:01,  8.35s/it, gpt_loss=0.296, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3408/6434 [7:59:34<7:01:04,  8.35s/it, gpt_loss=0.296, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3408/6434 [7:59:42<7:01:04,  8.35s/it, gpt_loss=0.263, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3409/6434 [7:59:42<6:50:55,  8.15s/it, gpt_loss=0.263, loss_mean=0.305][A
+[LID Router Debug] Step: 3410
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [0, 6, 2, 4, 2, 1, 0, 0, 5, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  53%|█████▎    | 3409/6434 [7:59:50<6:50:55,  8.15s/it, gpt_loss=0.376, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3410/6434 [7:59:50<6:54:21,  8.22s/it, gpt_loss=0.376, loss_mean=0.312][A
+Train step of epoch 0:  53%|█████▎    | 3410/6434 [7:59:59<6:54:21,  8.22s/it, gpt_loss=0.298, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3411/6434 [7:59:59<6:58:28,  8.31s/it, gpt_loss=0.298, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3411/6434 [8:00:07<6:58:28,  8.31s/it, gpt_loss=0.278, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3412/6434 [8:00:07<6:58:06,  8.30s/it, gpt_loss=0.278, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3412/6434 [8:00:14<6:58:06,  8.30s/it, gpt_loss=0.41, loss_mean=0.318] [A
+Train step of epoch 0:  53%|█████▎    | 3413/6434 [8:00:14<6:47:36,  8.10s/it, gpt_loss=0.41, loss_mean=0.318][A
+Train step of epoch 0:  53%|█████▎    | 3413/6434 [8:00:22<6:47:36,  8.10s/it, gpt_loss=0.21, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3414/6434 [8:00:22<6:39:20,  7.93s/it, gpt_loss=0.21, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3414/6434 [8:00:30<6:39:20,  7.93s/it, gpt_loss=0.377, loss_mean=0.314][A
+Train step of epoch 0:  53%|█████▎    | 3415/6434 [8:00:30<6:40:27,  7.96s/it, gpt_loss=0.377, loss_mean=0.314][A
+Train step of epoch 0:  53%|█████▎    | 3415/6434 [8:00:38<6:40:27,  7.96s/it, gpt_loss=0.302, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3416/6434 [8:00:38<6:41:50,  7.99s/it, gpt_loss=0.302, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3416/6434 [8:00:47<6:41:50,  7.99s/it, gpt_loss=0.316, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3417/6434 [8:00:47<6:53:52,  8.23s/it, gpt_loss=0.316, loss_mean=0.313][A
+Train step of epoch 0:  53%|█████▎    | 3417/6434 [8:00:55<6:53:52,  8.23s/it, gpt_loss=0.349, loss_mean=0.317][A
+Train step of epoch 0:  53%|█████▎    | 3418/6434 [8:00:55<6:46:12,  8.08s/it, gpt_loss=0.349, loss_mean=0.317][A
+Train step of epoch 0:  53%|█████▎    | 3418/6434 [8:01:03<6:46:12,  8.08s/it, gpt_loss=0.259, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3419/6434 [8:01:03<6:46:12,  8.08s/it, gpt_loss=0.259, loss_mean=0.311][A
+[LID Router Debug] Step: 3420
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [3, 1, 5, 1, 3, 9, 5, 0, 4, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  53%|█████▎    | 3419/6434 [8:01:12<6:46:12,  8.08s/it, gpt_loss=0.236, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3420/6434 [8:01:12<7:04:42,  8.45s/it, gpt_loss=0.236, loss_mean=0.303][A
+Train step of epoch 0:  53%|█████▎    | 3420/6434 [8:01:21<7:04:42,  8.45s/it, gpt_loss=0.276, loss_mean=0.301][A
+Train step of epoch 0:  53%|█████▎    | 3421/6434 [8:01:21<7:06:50,  8.50s/it, gpt_loss=0.276, loss_mean=0.301][A
+Train step of epoch 0:  53%|█████▎    | 3421/6434 [8:01:29<7:06:50,  8.50s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3422/6434 [8:01:29<6:58:26,  8.34s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3422/6434 [8:01:38<6:58:26,  8.34s/it, gpt_loss=0.338, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3423/6434 [8:01:38<7:09:01,  8.55s/it, gpt_loss=0.338, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3423/6434 [8:01:46<7:09:01,  8.55s/it, gpt_loss=0.334, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3424/6434 [8:01:46<7:02:32,  8.42s/it, gpt_loss=0.334, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3424/6434 [8:01:53<7:02:32,  8.42s/it, gpt_loss=0.327, loss_mean=0.31] [A
+Train step of epoch 0:  53%|█████▎    | 3425/6434 [8:01:53<6:51:25,  8.20s/it, gpt_loss=0.327, loss_mean=0.31][A
+Train step of epoch 0:  53%|█████▎    | 3425/6434 [8:02:03<6:51:25,  8.20s/it, gpt_loss=0.284, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3426/6434 [8:02:03<7:06:17,  8.50s/it, gpt_loss=0.284, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3426/6434 [8:02:12<7:06:17,  8.50s/it, gpt_loss=0.374, loss_mean=0.314][A
+Train step of epoch 0:  53%|█████▎    | 3427/6434 [8:02:12<7:20:46,  8.79s/it, gpt_loss=0.374, loss_mean=0.314][A
+Train step of epoch 0:  53%|█████▎    | 3427/6434 [8:02:21<7:20:46,  8.79s/it, gpt_loss=0.242, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3428/6434 [8:02:21<7:27:40,  8.94s/it, gpt_loss=0.242, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3428/6434 [8:02:30<7:27:40,  8.94s/it, gpt_loss=0.324, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3429/6434 [8:02:30<7:26:48,  8.92s/it, gpt_loss=0.324, loss_mean=0.309][A
+[LID Router Debug] Step: 3430
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [5, 4, 4, 5, 2, 9, 5, 6, 1, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  53%|█████▎    | 3429/6434 [8:02:39<7:26:48,  8.92s/it, gpt_loss=0.323, loss_mean=0.31] [A
+Train step of epoch 0:  53%|█████▎    | 3430/6434 [8:02:39<7:18:43,  8.76s/it, gpt_loss=0.323, loss_mean=0.31][A
+Train step of epoch 0:  53%|█████▎    | 3430/6434 [8:02:48<7:18:43,  8.76s/it, gpt_loss=0.258, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3431/6434 [8:02:48<7:30:25,  9.00s/it, gpt_loss=0.258, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3431/6434 [8:02:57<7:30:25,  9.00s/it, gpt_loss=0.311, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3432/6434 [8:02:57<7:23:54,  8.87s/it, gpt_loss=0.311, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3432/6434 [8:03:05<7:23:54,  8.87s/it, gpt_loss=0.319, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3433/6434 [8:03:05<7:17:52,  8.75s/it, gpt_loss=0.319, loss_mean=0.307][A
+Train step of epoch 0:  53%|█████▎    | 3433/6434 [8:03:14<7:17:52,  8.75s/it, gpt_loss=0.222, loss_mean=0.298][A
+Train step of epoch 0:  53%|█████▎    | 3434/6434 [8:03:14<7:10:27,  8.61s/it, gpt_loss=0.222, loss_mean=0.298][A
+Train step of epoch 0:  53%|█████▎    | 3434/6434 [8:03:22<7:10:27,  8.61s/it, gpt_loss=0.306, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3435/6434 [8:03:22<7:04:34,  8.49s/it, gpt_loss=0.306, loss_mean=0.299][A
+Train step of epoch 0:  53%|█████▎    | 3435/6434 [8:03:31<7:04:34,  8.49s/it, gpt_loss=0.422, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3436/6434 [8:03:31<7:13:32,  8.68s/it, gpt_loss=0.422, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3436/6434 [8:03:39<7:13:32,  8.68s/it, gpt_loss=0.306, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3437/6434 [8:03:39<7:05:42,  8.52s/it, gpt_loss=0.306, loss_mean=0.311][A
+Train step of epoch 0:  53%|█████▎    | 3437/6434 [8:03:48<7:05:42,  8.52s/it, gpt_loss=0.283, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3438/6434 [8:03:48<7:15:21,  8.72s/it, gpt_loss=0.283, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3438/6434 [8:03:57<7:15:21,  8.72s/it, gpt_loss=0.276, loss_mean=0.305][A
+Train step of epoch 0:  53%|█████▎    | 3439/6434 [8:03:57<7:15:42,  8.73s/it, gpt_loss=0.276, loss_mean=0.305][A
+[LID Router Debug] Step: 3440
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [5, 3, 2, 1, 5, 0, 0, 2, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  53%|█████▎    | 3439/6434 [8:04:05<7:15:42,  8.73s/it, gpt_loss=0.271, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3440/6434 [8:04:05<7:08:08,  8.58s/it, gpt_loss=0.271, loss_mean=0.302][A
+Train step of epoch 0:  53%|█████▎    | 3440/6434 [8:04:13<7:08:08,  8.58s/it, gpt_loss=0.362, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3441/6434 [8:04:13<6:59:51,  8.42s/it, gpt_loss=0.362, loss_mean=0.308][A
+Train step of epoch 0:  53%|█████▎    | 3441/6434 [8:04:22<6:59:51,  8.42s/it, gpt_loss=0.322, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3442/6434 [8:04:22<6:58:36,  8.39s/it, gpt_loss=0.322, loss_mean=0.309][A
+Train step of epoch 0:  53%|█████▎    | 3442/6434 [8:04:29<6:58:36,  8.39s/it, gpt_loss=0.265, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3443/6434 [8:04:29<6:46:26,  8.15s/it, gpt_loss=0.265, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3443/6434 [8:04:37<6:46:26,  8.15s/it, gpt_loss=0.292, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▎    | 3444/6434 [8:04:37<6:36:02,  7.95s/it, gpt_loss=0.292, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▎    | 3444/6434 [8:04:45<6:36:02,  7.95s/it, gpt_loss=0.264, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▎    | 3445/6434 [8:04:45<6:39:41,  8.02s/it, gpt_loss=0.264, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▎    | 3445/6434 [8:04:53<6:39:41,  8.02s/it, gpt_loss=0.353, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3446/6434 [8:04:53<6:45:22,  8.14s/it, gpt_loss=0.353, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3446/6434 [8:05:01<6:45:22,  8.14s/it, gpt_loss=0.317, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▎    | 3447/6434 [8:05:01<6:34:05,  7.92s/it, gpt_loss=0.317, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▎    | 3447/6434 [8:05:09<6:34:05,  7.92s/it, gpt_loss=0.31, loss_mean=0.306] [A
+Train step of epoch 0:  54%|█████▎    | 3448/6434 [8:05:09<6:33:39,  7.91s/it, gpt_loss=0.31, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▎    | 3448/6434 [8:05:18<6:33:39,  7.91s/it, gpt_loss=0.331, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3449/6434 [8:05:18<6:53:25,  8.31s/it, gpt_loss=0.331, loss_mean=0.309][A
+[LID Router Debug] Step: 3450
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [3, 3, 2, 5, 3, 0, 5, 1, 6, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  54%|█████▎    | 3449/6434 [8:05:26<6:53:25,  8.31s/it, gpt_loss=0.332, loss_mean=0.311][A
+Train step of epoch 0:  54%|█████▎    | 3450/6434 [8:05:26<6:49:19,  8.23s/it, gpt_loss=0.332, loss_mean=0.311][A
+Train step of epoch 0:  54%|█████▎    | 3450/6434 [8:05:34<6:49:19,  8.23s/it, gpt_loss=0.291, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3451/6434 [8:05:34<6:54:21,  8.33s/it, gpt_loss=0.291, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3451/6434 [8:05:43<6:54:21,  8.33s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▎    | 3452/6434 [8:05:43<7:00:26,  8.46s/it, gpt_loss=0.279, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▎    | 3452/6434 [8:05:51<7:00:26,  8.46s/it, gpt_loss=0.284, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▎    | 3453/6434 [8:05:51<6:55:24,  8.36s/it, gpt_loss=0.284, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▎    | 3453/6434 [8:06:00<6:55:24,  8.36s/it, gpt_loss=0.319, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3454/6434 [8:06:00<6:53:03,  8.32s/it, gpt_loss=0.319, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3454/6434 [8:06:08<6:53:03,  8.32s/it, gpt_loss=0.304, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3455/6434 [8:06:08<6:49:56,  8.26s/it, gpt_loss=0.304, loss_mean=0.305][A
+Train step of epoch 0:  54%|█████▎    | 3455/6434 [8:06:15<6:49:56,  8.26s/it, gpt_loss=0.341, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3456/6434 [8:06:15<6:40:38,  8.07s/it, gpt_loss=0.341, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3456/6434 [8:06:23<6:40:38,  8.07s/it, gpt_loss=0.306, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3457/6434 [8:06:23<6:28:07,  7.82s/it, gpt_loss=0.306, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▎    | 3457/6434 [8:06:30<6:28:07,  7.82s/it, gpt_loss=0.245, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▎    | 3458/6434 [8:06:30<6:23:08,  7.72s/it, gpt_loss=0.245, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▎    | 3458/6434 [8:06:38<6:23:08,  7.72s/it, gpt_loss=0.267, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3459/6434 [8:06:38<6:25:42,  7.78s/it, gpt_loss=0.267, loss_mean=0.299][A
+[LID Router Debug] Step: 3460
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [6, 4, 5, 1, 1, 0, 5, 1, 4, 5]
+Active Experts in Batch: {0, 1, 4, 5, 6}
+
+Train step of epoch 0:  54%|█████▍    | 3459/6434 [8:06:45<6:25:42,  7.78s/it, gpt_loss=0.344, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3460/6434 [8:06:45<6:19:32,  7.66s/it, gpt_loss=0.344, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3460/6434 [8:06:53<6:19:32,  7.66s/it, gpt_loss=0.257, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3461/6434 [8:06:53<6:18:40,  7.64s/it, gpt_loss=0.257, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3461/6434 [8:07:00<6:18:40,  7.64s/it, gpt_loss=0.287, loss_mean=0.297][A
+Train step of epoch 0:  54%|█████▍    | 3462/6434 [8:07:00<6:16:57,  7.61s/it, gpt_loss=0.287, loss_mean=0.297][A
+Train step of epoch 0:  54%|█████▍    | 3462/6434 [8:07:08<6:16:57,  7.61s/it, gpt_loss=0.35, loss_mean=0.303] [A
+Train step of epoch 0:  54%|█████▍    | 3463/6434 [8:07:08<6:20:03,  7.68s/it, gpt_loss=0.35, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3463/6434 [8:07:16<6:20:03,  7.68s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3464/6434 [8:07:16<6:25:12,  7.78s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3464/6434 [8:07:24<6:25:12,  7.78s/it, gpt_loss=0.343, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3465/6434 [8:07:24<6:30:17,  7.89s/it, gpt_loss=0.343, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3465/6434 [8:07:32<6:30:17,  7.89s/it, gpt_loss=0.27, loss_mean=0.3]   [A
+Train step of epoch 0:  54%|█████▍    | 3466/6434 [8:07:32<6:24:43,  7.78s/it, gpt_loss=0.27, loss_mean=0.3][A
+Train step of epoch 0:  54%|█████▍    | 3466/6434 [8:07:41<6:24:43,  7.78s/it, gpt_loss=0.321, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3467/6434 [8:07:41<6:38:43,  8.06s/it, gpt_loss=0.321, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3467/6434 [8:07:48<6:38:43,  8.06s/it, gpt_loss=0.319, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3468/6434 [8:07:48<6:34:34,  7.98s/it, gpt_loss=0.319, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3468/6434 [8:07:57<6:34:34,  7.98s/it, gpt_loss=0.387, loss_mean=0.312][A
+Train step of epoch 0:  54%|█████▍    | 3469/6434 [8:07:57<6:42:17,  8.14s/it, gpt_loss=0.387, loss_mean=0.312][A
+[LID Router Debug] Step: 3470
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [5, 5, 9, 2, 4, 5, 4, 3, 4, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  54%|█████▍    | 3469/6434 [8:08:05<6:42:17,  8.14s/it, gpt_loss=0.28, loss_mean=0.309] [A
+Train step of epoch 0:  54%|█████▍    | 3470/6434 [8:08:05<6:33:45,  7.97s/it, gpt_loss=0.28, loss_mean=0.309][A
+Train step of epoch 0:  54%|█████▍    | 3470/6434 [8:08:13<6:33:45,  7.97s/it, gpt_loss=0.297, loss_mean=0.308][A
+Train step of epoch 0:  54%|█████▍    | 3471/6434 [8:08:13<6:37:49,  8.06s/it, gpt_loss=0.297, loss_mean=0.308][A
+Train step of epoch 0:  54%|█████▍    | 3471/6434 [8:08:21<6:37:49,  8.06s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3472/6434 [8:08:21<6:34:42,  8.00s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3472/6434 [8:08:29<6:34:42,  8.00s/it, gpt_loss=0.274, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3473/6434 [8:08:29<6:33:59,  7.98s/it, gpt_loss=0.274, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3473/6434 [8:08:37<6:33:59,  7.98s/it, gpt_loss=0.268, loss_mean=0.3]  [A
+Train step of epoch 0:  54%|█████▍    | 3474/6434 [8:08:37<6:42:08,  8.15s/it, gpt_loss=0.268, loss_mean=0.3][A
+Train step of epoch 0:  54%|█████▍    | 3474/6434 [8:08:45<6:42:08,  8.15s/it, gpt_loss=0.322, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3475/6434 [8:08:45<6:36:10,  8.03s/it, gpt_loss=0.322, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3475/6434 [8:08:54<6:36:10,  8.03s/it, gpt_loss=0.341, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3476/6434 [8:08:54<6:56:49,  8.45s/it, gpt_loss=0.341, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3476/6434 [8:09:03<6:56:49,  8.45s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3477/6434 [8:09:03<6:58:12,  8.49s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  54%|█████▍    | 3477/6434 [8:09:12<6:58:12,  8.49s/it, gpt_loss=0.284, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3478/6434 [8:09:12<7:02:15,  8.57s/it, gpt_loss=0.284, loss_mean=0.303][A
+Train step of epoch 0:  54%|█████▍    | 3478/6434 [8:09:20<7:02:15,  8.57s/it, gpt_loss=0.34, loss_mean=0.307] [A
+Train step of epoch 0:  54%|█████▍    | 3479/6434 [8:09:20<7:00:10,  8.53s/it, gpt_loss=0.34, loss_mean=0.307][A
+[LID Router Debug] Step: 3480
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [2, 4, 2, 2, 2, 9, 3, 1, 2, 4]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+
+Train step of epoch 0:  54%|█████▍    | 3479/6434 [8:09:28<7:00:10,  8.53s/it, gpt_loss=0.371, loss_mean=0.313][A
+Train step of epoch 0:  54%|█████▍    | 3480/6434 [8:09:28<6:54:59,  8.43s/it, gpt_loss=0.371, loss_mean=0.313][A
+Train step of epoch 0:  54%|█████▍    | 3480/6434 [8:09:38<6:54:59,  8.43s/it, gpt_loss=0.352, loss_mean=0.317][A
+Train step of epoch 0:  54%|█████▍    | 3481/6434 [8:09:38<7:08:33,  8.71s/it, gpt_loss=0.352, loss_mean=0.317][A
+Train step of epoch 0:  54%|█████▍    | 3481/6434 [8:09:46<7:08:33,  8.71s/it, gpt_loss=0.327, loss_mean=0.318][A
+Train step of epoch 0:  54%|█████▍    | 3482/6434 [8:09:46<6:58:25,  8.50s/it, gpt_loss=0.327, loss_mean=0.318][A
+Train step of epoch 0:  54%|█████▍    | 3482/6434 [8:09:54<6:58:25,  8.50s/it, gpt_loss=0.237, loss_mean=0.31] [A
+Train step of epoch 0:  54%|█████▍    | 3483/6434 [8:09:54<7:01:29,  8.57s/it, gpt_loss=0.237, loss_mean=0.31][A
+Train step of epoch 0:  54%|█████▍    | 3483/6434 [8:10:02<7:01:29,  8.57s/it, gpt_loss=0.307, loss_mean=0.31][A
+Train step of epoch 0:  54%|█████▍    | 3484/6434 [8:10:02<6:44:55,  8.24s/it, gpt_loss=0.307, loss_mean=0.31][A
+Train step of epoch 0:  54%|█████▍    | 3484/6434 [8:10:10<6:44:55,  8.24s/it, gpt_loss=0.254, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3485/6434 [8:10:10<6:37:30,  8.09s/it, gpt_loss=0.254, loss_mean=0.304][A
+Train step of epoch 0:  54%|█████▍    | 3485/6434 [8:10:18<6:37:30,  8.09s/it, gpt_loss=0.285, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3486/6434 [8:10:18<6:40:37,  8.15s/it, gpt_loss=0.285, loss_mean=0.302][A
+Train step of epoch 0:  54%|█████▍    | 3486/6434 [8:10:27<6:40:37,  8.15s/it, gpt_loss=0.286, loss_mean=0.301][A
+Train step of epoch 0:  54%|█████▍    | 3487/6434 [8:10:27<6:47:20,  8.29s/it, gpt_loss=0.286, loss_mean=0.301][A
+Train step of epoch 0:  54%|█████▍    | 3487/6434 [8:10:35<6:47:20,  8.29s/it, gpt_loss=0.258, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3488/6434 [8:10:35<6:45:06,  8.25s/it, gpt_loss=0.258, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3488/6434 [8:10:43<6:45:06,  8.25s/it, gpt_loss=0.295, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3489/6434 [8:10:43<6:49:44,  8.35s/it, gpt_loss=0.295, loss_mean=0.296][A
+[LID Router Debug] Step: 3490
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [6, 9, 9, 2, 5, 3, 5, 1, 6, 3]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  54%|█████▍    | 3489/6434 [8:10:52<6:49:44,  8.35s/it, gpt_loss=0.326, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3490/6434 [8:10:52<6:53:29,  8.43s/it, gpt_loss=0.326, loss_mean=0.299][A
+Train step of epoch 0:  54%|█████▍    | 3490/6434 [8:11:00<6:53:29,  8.43s/it, gpt_loss=0.269, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3491/6434 [8:11:00<6:52:49,  8.42s/it, gpt_loss=0.269, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3491/6434 [8:11:09<6:52:49,  8.42s/it, gpt_loss=0.281, loss_mean=0.295][A
+Train step of epoch 0:  54%|█████▍    | 3492/6434 [8:11:09<6:59:16,  8.55s/it, gpt_loss=0.281, loss_mean=0.295][A
+Train step of epoch 0:  54%|█████▍    | 3492/6434 [8:11:17<6:59:16,  8.55s/it, gpt_loss=0.321, loss_mean=0.297][A
+Train step of epoch 0:  54%|█████▍    | 3493/6434 [8:11:17<6:49:25,  8.35s/it, gpt_loss=0.321, loss_mean=0.297][A
+Train step of epoch 0:  54%|█████▍    | 3493/6434 [8:11:25<6:49:25,  8.35s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  54%|█████▍    | 3494/6434 [8:11:25<6:43:13,  8.23s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  54%|█████▍    | 3494/6434 [8:11:33<6:43:13,  8.23s/it, gpt_loss=0.26, loss_mean=0.292] [A
+Train step of epoch 0:  54%|█████▍    | 3495/6434 [8:11:33<6:42:28,  8.22s/it, gpt_loss=0.26, loss_mean=0.292][A
+Train step of epoch 0:  54%|█████▍    | 3495/6434 [8:11:42<6:42:28,  8.22s/it, gpt_loss=0.279, loss_mean=0.291][A
+Train step of epoch 0:  54%|█████▍    | 3496/6434 [8:11:42<6:50:13,  8.38s/it, gpt_loss=0.279, loss_mean=0.291][A
+Train step of epoch 0:  54%|█████▍    | 3496/6434 [8:11:51<6:50:13,  8.38s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  54%|█████▍    | 3497/6434 [8:11:51<6:55:30,  8.49s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  54%|█████▍    | 3497/6434 [8:11:59<6:55:30,  8.49s/it, gpt_loss=0.297, loss_mean=0.29] [A
+Train step of epoch 0:  54%|█████▍    | 3498/6434 [8:11:59<6:47:51,  8.33s/it, gpt_loss=0.297, loss_mean=0.29][A
+Train step of epoch 0:  54%|█████▍    | 3498/6434 [8:12:06<6:47:51,  8.33s/it, gpt_loss=0.353, loss_mean=0.296][A
+Train step of epoch 0:  54%|█████▍    | 3499/6434 [8:12:06<6:39:13,  8.16s/it, gpt_loss=0.353, loss_mean=0.296][A
+[LID Router Debug] Step: 3500
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [1, 9, 2, 3, 0, 4, 2, 1, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  54%|█████▍    | 3499/6434 [8:12:14<6:39:13,  8.16s/it, gpt_loss=0.249, loss_mean=0.291][A
+Train step of epoch 0:  54%|█████▍    | 3500/6434 [8:12:14<6:32:30,  8.03s/it, gpt_loss=0.249, loss_mean=0.291][A
+Train step of epoch 0:  54%|█████▍    | 3500/6434 [8:12:23<6:32:30,  8.03s/it, gpt_loss=0.216, loss_mean=0.284][A
+Train step of epoch 0:  54%|█████▍    | 3501/6434 [8:12:23<6:47:25,  8.33s/it, gpt_loss=0.216, loss_mean=0.284][A
+Train step of epoch 0:  54%|█████▍    | 3501/6434 [8:12:32<6:47:25,  8.33s/it, gpt_loss=0.261, loss_mean=0.282][A
+Train step of epoch 0:  54%|█████▍    | 3502/6434 [8:12:32<6:51:48,  8.43s/it, gpt_loss=0.261, loss_mean=0.282][A
+Train step of epoch 0:  54%|█████▍    | 3502/6434 [8:12:40<6:51:48,  8.43s/it, gpt_loss=0.254, loss_mean=0.279][A
+Train step of epoch 0:  54%|█████▍    | 3503/6434 [8:12:40<6:43:35,  8.26s/it, gpt_loss=0.254, loss_mean=0.279][A
+Train step of epoch 0:  54%|█████▍    | 3503/6434 [8:12:48<6:43:35,  8.26s/it, gpt_loss=0.313, loss_mean=0.282][A
+Train step of epoch 0:  54%|█████▍    | 3504/6434 [8:12:48<6:39:04,  8.17s/it, gpt_loss=0.313, loss_mean=0.282][A
+Train step of epoch 0:  54%|█████▍    | 3504/6434 [8:12:56<6:39:04,  8.17s/it, gpt_loss=0.324, loss_mean=0.287][A
+Train step of epoch 0:  54%|█████▍    | 3505/6434 [8:12:56<6:46:43,  8.33s/it, gpt_loss=0.324, loss_mean=0.287][A
+Train step of epoch 0:  54%|█████▍    | 3505/6434 [8:13:04<6:46:43,  8.33s/it, gpt_loss=0.249, loss_mean=0.283][A
+Train step of epoch 0:  54%|█████▍    | 3506/6434 [8:13:04<6:29:56,  7.99s/it, gpt_loss=0.249, loss_mean=0.283][A
+Train step of epoch 0:  54%|█████▍    | 3506/6434 [8:13:11<6:29:56,  7.99s/it, gpt_loss=0.291, loss_mean=0.284][A
+Train step of epoch 0:  55%|█████▍    | 3507/6434 [8:13:11<6:27:03,  7.93s/it, gpt_loss=0.291, loss_mean=0.284][A
+Train step of epoch 0:  55%|█████▍    | 3507/6434 [8:13:20<6:27:03,  7.93s/it, gpt_loss=0.323, loss_mean=0.288][A
+Train step of epoch 0:  55%|█████▍    | 3508/6434 [8:13:20<6:38:42,  8.18s/it, gpt_loss=0.323, loss_mean=0.288][A
+Train step of epoch 0:  55%|█████▍    | 3508/6434 [8:13:28<6:38:42,  8.18s/it, gpt_loss=0.202, loss_mean=0.279][A
+Train step of epoch 0:  55%|█████▍    | 3509/6434 [8:13:28<6:33:21,  8.07s/it, gpt_loss=0.202, loss_mean=0.279][A
+[LID Router Debug] Step: 3510
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [2, 2, 0, 9, 6, 0, 6, 3, 4, 4]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  55%|█████▍    | 3509/6434 [8:13:36<6:33:21,  8.07s/it, gpt_loss=0.303, loss_mean=0.281][A
+Train step of epoch 0:  55%|█████▍    | 3510/6434 [8:13:36<6:35:26,  8.11s/it, gpt_loss=0.303, loss_mean=0.281][A
+Train step of epoch 0:  55%|█████▍    | 3510/6434 [8:13:44<6:35:26,  8.11s/it, gpt_loss=0.328, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3511/6434 [8:13:44<6:26:46,  7.94s/it, gpt_loss=0.328, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3511/6434 [8:13:52<6:26:46,  7.94s/it, gpt_loss=0.282, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3512/6434 [8:13:52<6:38:34,  8.18s/it, gpt_loss=0.282, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3512/6434 [8:13:59<6:38:34,  8.18s/it, gpt_loss=0.307, loss_mean=0.288][A
+Train step of epoch 0:  55%|█████▍    | 3513/6434 [8:13:59<6:17:31,  7.75s/it, gpt_loss=0.307, loss_mean=0.288][A
+Train step of epoch 0:  55%|█████▍    | 3513/6434 [8:14:08<6:17:31,  7.75s/it, gpt_loss=0.248, loss_mean=0.284][A
+Train step of epoch 0:  55%|█████▍    | 3514/6434 [8:14:08<6:40:08,  8.22s/it, gpt_loss=0.248, loss_mean=0.284][A
+Train step of epoch 0:  55%|█████▍    | 3514/6434 [8:14:17<6:40:08,  8.22s/it, gpt_loss=0.29, loss_mean=0.284] [A
+Train step of epoch 0:  55%|█████▍    | 3515/6434 [8:14:17<6:46:28,  8.35s/it, gpt_loss=0.29, loss_mean=0.284][A
+Train step of epoch 0:  55%|█████▍    | 3515/6434 [8:14:25<6:46:28,  8.35s/it, gpt_loss=0.244, loss_mean=0.28][A
+Train step of epoch 0:  55%|█████▍    | 3516/6434 [8:14:25<6:41:33,  8.26s/it, gpt_loss=0.244, loss_mean=0.28][A
+Train step of epoch 0:  55%|█████▍    | 3516/6434 [8:14:34<6:41:33,  8.26s/it, gpt_loss=0.329, loss_mean=0.285][A
+Train step of epoch 0:  55%|█████▍    | 3517/6434 [8:14:34<6:54:25,  8.52s/it, gpt_loss=0.329, loss_mean=0.285][A
+Train step of epoch 0:  55%|█████▍    | 3517/6434 [8:14:42<6:54:25,  8.52s/it, gpt_loss=0.251, loss_mean=0.282][A
+Train step of epoch 0:  55%|█████▍    | 3518/6434 [8:14:42<6:37:03,  8.17s/it, gpt_loss=0.251, loss_mean=0.282][A
+Train step of epoch 0:  55%|█████▍    | 3518/6434 [8:14:50<6:37:03,  8.17s/it, gpt_loss=0.344, loss_mean=0.288][A
+Train step of epoch 0:  55%|█████▍    | 3519/6434 [8:14:50<6:37:17,  8.18s/it, gpt_loss=0.344, loss_mean=0.288][A
+[LID Router Debug] Step: 3520
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [1, 9, 4, 5, 6, 0, 6, 0, 9, 4]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+
+Train step of epoch 0:  55%|█████▍    | 3519/6434 [8:14:58<6:37:17,  8.18s/it, gpt_loss=0.264, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3520/6434 [8:14:58<6:30:35,  8.04s/it, gpt_loss=0.264, loss_mean=0.286][A
+Train step of epoch 0:  55%|█████▍    | 3520/6434 [8:15:06<6:30:35,  8.04s/it, gpt_loss=0.347, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▍    | 3521/6434 [8:15:06<6:32:02,  8.07s/it, gpt_loss=0.347, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▍    | 3521/6434 [8:15:15<6:32:02,  8.07s/it, gpt_loss=0.326, loss_mean=0.295][A
+Train step of epoch 0:  55%|█████▍    | 3522/6434 [8:15:15<6:43:28,  8.31s/it, gpt_loss=0.326, loss_mean=0.295][A
+Train step of epoch 0:  55%|█████▍    | 3522/6434 [8:15:24<6:43:28,  8.31s/it, gpt_loss=0.239, loss_mean=0.29] [A
+Train step of epoch 0:  55%|█████▍    | 3523/6434 [8:15:24<7:00:39,  8.67s/it, gpt_loss=0.239, loss_mean=0.29][A
+Train step of epoch 0:  55%|█████▍    | 3523/6434 [8:15:32<7:00:39,  8.67s/it, gpt_loss=0.312, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▍    | 3524/6434 [8:15:32<6:45:49,  8.37s/it, gpt_loss=0.312, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▍    | 3524/6434 [8:15:39<6:45:49,  8.37s/it, gpt_loss=0.308, loss_mean=0.293][A
+Train step of epoch 0:  55%|█████▍    | 3525/6434 [8:15:39<6:35:58,  8.17s/it, gpt_loss=0.308, loss_mean=0.293][A
+Train step of epoch 0:  55%|█████▍    | 3525/6434 [8:15:48<6:35:58,  8.17s/it, gpt_loss=0.332, loss_mean=0.297][A
+Train step of epoch 0:  55%|█████▍    | 3526/6434 [8:15:48<6:35:51,  8.17s/it, gpt_loss=0.332, loss_mean=0.297][A
+Train step of epoch 0:  55%|█████▍    | 3526/6434 [8:15:56<6:35:51,  8.17s/it, gpt_loss=0.263, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▍    | 3527/6434 [8:15:56<6:36:21,  8.18s/it, gpt_loss=0.263, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▍    | 3527/6434 [8:16:04<6:36:21,  8.18s/it, gpt_loss=0.281, loss_mean=0.293][A
+Train step of epoch 0:  55%|█████▍    | 3528/6434 [8:16:04<6:42:47,  8.32s/it, gpt_loss=0.281, loss_mean=0.293][A
+Train step of epoch 0:  55%|█████▍    | 3528/6434 [8:16:12<6:42:47,  8.32s/it, gpt_loss=0.344, loss_mean=0.298][A
+Train step of epoch 0:  55%|█████▍    | 3529/6434 [8:16:12<6:37:42,  8.21s/it, gpt_loss=0.344, loss_mean=0.298][A
+[LID Router Debug] Step: 3530
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [2, 3, 3, 4, 9, 1, 3, 5, 3, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  55%|█████▍    | 3529/6434 [8:16:21<6:37:42,  8.21s/it, gpt_loss=0.219, loss_mean=0.29] [A
+Train step of epoch 0:  55%|█████▍    | 3530/6434 [8:16:21<6:49:33,  8.46s/it, gpt_loss=0.219, loss_mean=0.29][A
+Train step of epoch 0:  55%|█████▍    | 3530/6434 [8:16:29<6:49:33,  8.46s/it, gpt_loss=0.335, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▍    | 3531/6434 [8:16:29<6:42:46,  8.32s/it, gpt_loss=0.335, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▍    | 3531/6434 [8:16:39<6:42:46,  8.32s/it, gpt_loss=0.354, loss_mean=0.3]  [A
+Train step of epoch 0:  55%|█████▍    | 3532/6434 [8:16:39<6:56:30,  8.61s/it, gpt_loss=0.354, loss_mean=0.3][A
+Train step of epoch 0:  55%|█████▍    | 3532/6434 [8:16:48<6:56:30,  8.61s/it, gpt_loss=0.31, loss_mean=0.301][A
+Train step of epoch 0:  55%|█████▍    | 3533/6434 [8:16:48<6:59:34,  8.68s/it, gpt_loss=0.31, loss_mean=0.301][A
+Train step of epoch 0:  55%|█████▍    | 3533/6434 [8:16:57<6:59:34,  8.68s/it, gpt_loss=0.321, loss_mean=0.303][A
+Train step of epoch 0:  55%|█████▍    | 3534/6434 [8:16:57<7:02:57,  8.75s/it, gpt_loss=0.321, loss_mean=0.303][A
+Train step of epoch 0:  55%|█████▍    | 3534/6434 [8:17:05<7:02:57,  8.75s/it, gpt_loss=0.359, loss_mean=0.309][A
+Train step of epoch 0:  55%|█████▍    | 3535/6434 [8:17:05<7:00:06,  8.69s/it, gpt_loss=0.359, loss_mean=0.309][A
+Train step of epoch 0:  55%|█████▍    | 3535/6434 [8:17:14<7:00:06,  8.69s/it, gpt_loss=0.242, loss_mean=0.302][A
+Train step of epoch 0:  55%|█████▍    | 3536/6434 [8:17:14<7:10:08,  8.91s/it, gpt_loss=0.242, loss_mean=0.302][A
+Train step of epoch 0:  55%|█████▍    | 3536/6434 [8:17:23<7:10:08,  8.91s/it, gpt_loss=0.351, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▍    | 3537/6434 [8:17:23<7:03:49,  8.78s/it, gpt_loss=0.351, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▍    | 3537/6434 [8:17:31<7:03:49,  8.78s/it, gpt_loss=0.33, loss_mean=0.309] [A
+Train step of epoch 0:  55%|█████▍    | 3538/6434 [8:17:31<6:52:48,  8.55s/it, gpt_loss=0.33, loss_mean=0.309][A
+Train step of epoch 0:  55%|█████▍    | 3538/6434 [8:17:39<6:52:48,  8.55s/it, gpt_loss=0.339, loss_mean=0.312][A
+Train step of epoch 0:  55%|█████▌    | 3539/6434 [8:17:39<6:48:08,  8.46s/it, gpt_loss=0.339, loss_mean=0.312][A
+[LID Router Debug] Step: 3540
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 2, 1, 0, 9, 3, 9, 4, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  55%|█████▌    | 3539/6434 [8:17:47<6:48:08,  8.46s/it, gpt_loss=0.265, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▌    | 3540/6434 [8:17:47<6:44:32,  8.39s/it, gpt_loss=0.265, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▌    | 3540/6434 [8:17:56<6:44:32,  8.39s/it, gpt_loss=0.245, loss_mean=0.301][A
+Train step of epoch 0:  55%|█████▌    | 3541/6434 [8:17:56<6:40:43,  8.31s/it, gpt_loss=0.245, loss_mean=0.301][A
+Train step of epoch 0:  55%|█████▌    | 3541/6434 [8:18:06<6:40:43,  8.31s/it, gpt_loss=0.231, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3542/6434 [8:18:06<7:04:06,  8.80s/it, gpt_loss=0.231, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3542/6434 [8:18:14<7:04:06,  8.80s/it, gpt_loss=0.289, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3543/6434 [8:18:14<7:06:11,  8.85s/it, gpt_loss=0.289, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3543/6434 [8:18:22<7:06:11,  8.85s/it, gpt_loss=0.254, loss_mean=0.29] [A
+Train step of epoch 0:  55%|█████▌    | 3544/6434 [8:18:22<6:50:48,  8.53s/it, gpt_loss=0.254, loss_mean=0.29][A
+Train step of epoch 0:  55%|█████▌    | 3544/6434 [8:18:31<6:50:48,  8.53s/it, gpt_loss=0.306, loss_mean=0.291][A
+Train step of epoch 0:  55%|█████▌    | 3545/6434 [8:18:31<6:53:05,  8.58s/it, gpt_loss=0.306, loss_mean=0.291][A
+Train step of epoch 0:  55%|█████▌    | 3545/6434 [8:18:39<6:53:05,  8.58s/it, gpt_loss=0.321, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3546/6434 [8:18:39<6:46:15,  8.44s/it, gpt_loss=0.321, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3546/6434 [8:18:47<6:46:15,  8.44s/it, gpt_loss=0.263, loss_mean=0.291][A
+Train step of epoch 0:  55%|█████▌    | 3547/6434 [8:18:47<6:45:16,  8.42s/it, gpt_loss=0.263, loss_mean=0.291][A
+Train step of epoch 0:  55%|█████▌    | 3547/6434 [8:18:56<6:45:16,  8.42s/it, gpt_loss=0.368, loss_mean=0.299][A
+Train step of epoch 0:  55%|█████▌    | 3548/6434 [8:18:56<6:45:43,  8.43s/it, gpt_loss=0.368, loss_mean=0.299][A
+Train step of epoch 0:  55%|█████▌    | 3548/6434 [8:19:04<6:45:43,  8.43s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  55%|█████▌    | 3549/6434 [8:19:04<6:37:22,  8.26s/it, gpt_loss=0.271, loss_mean=0.296][A
+[LID Router Debug] Step: 3550
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [5, 5, 4, 4, 4, 2, 3, 9, 1, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  55%|█████▌    | 3549/6434 [8:19:13<6:37:22,  8.26s/it, gpt_loss=0.312, loss_mean=0.298][A
+Train step of epoch 0:  55%|█████▌    | 3550/6434 [8:19:13<6:46:46,  8.46s/it, gpt_loss=0.312, loss_mean=0.298][A
+Train step of epoch 0:  55%|█████▌    | 3550/6434 [8:19:22<6:46:46,  8.46s/it, gpt_loss=0.293, loss_mean=0.297][A
+Train step of epoch 0:  55%|█████▌    | 3551/6434 [8:19:22<6:53:15,  8.60s/it, gpt_loss=0.293, loss_mean=0.297][A
+Train step of epoch 0:  55%|█████▌    | 3551/6434 [8:19:30<6:53:15,  8.60s/it, gpt_loss=0.242, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▌    | 3552/6434 [8:19:30<6:43:26,  8.40s/it, gpt_loss=0.242, loss_mean=0.292][A
+Train step of epoch 0:  55%|█████▌    | 3552/6434 [8:19:37<6:43:26,  8.40s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3553/6434 [8:19:37<6:33:44,  8.20s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  55%|█████▌    | 3553/6434 [8:19:47<6:33:44,  8.20s/it, gpt_loss=0.377, loss_mean=0.302][A
+Train step of epoch 0:  55%|█████▌    | 3554/6434 [8:19:47<6:52:22,  8.59s/it, gpt_loss=0.377, loss_mean=0.302][A
+Train step of epoch 0:  55%|█████▌    | 3554/6434 [8:19:55<6:52:22,  8.59s/it, gpt_loss=0.314, loss_mean=0.303][A
+Train step of epoch 0:  55%|█████▌    | 3555/6434 [8:19:55<6:48:11,  8.51s/it, gpt_loss=0.314, loss_mean=0.303][A
+Train step of epoch 0:  55%|█████▌    | 3555/6434 [8:20:03<6:48:11,  8.51s/it, gpt_loss=0.422, loss_mean=0.315][A
+Train step of epoch 0:  55%|█████▌    | 3556/6434 [8:20:03<6:38:21,  8.30s/it, gpt_loss=0.422, loss_mean=0.315][A
+Train step of epoch 0:  55%|█████▌    | 3556/6434 [8:20:11<6:38:21,  8.30s/it, gpt_loss=0.298, loss_mean=0.313][A
+Train step of epoch 0:  55%|█████▌    | 3557/6434 [8:20:11<6:41:25,  8.37s/it, gpt_loss=0.298, loss_mean=0.313][A
+Train step of epoch 0:  55%|█████▌    | 3557/6434 [8:20:20<6:41:25,  8.37s/it, gpt_loss=0.359, loss_mean=0.318][A
+Train step of epoch 0:  55%|█████▌    | 3558/6434 [8:20:20<6:44:27,  8.44s/it, gpt_loss=0.359, loss_mean=0.318][A
+Train step of epoch 0:  55%|█████▌    | 3558/6434 [8:20:29<6:44:27,  8.44s/it, gpt_loss=0.292, loss_mean=0.315][A
+Train step of epoch 0:  55%|█████▌    | 3559/6434 [8:20:29<6:50:50,  8.57s/it, gpt_loss=0.292, loss_mean=0.315][A
+[LID Router Debug] Step: 3560
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [1, 2, 2, 1, 5, 3, 2, 4, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  55%|█████▌    | 3559/6434 [8:20:38<6:50:50,  8.57s/it, gpt_loss=0.333, loss_mean=0.317][A
+Train step of epoch 0:  55%|█████▌    | 3560/6434 [8:20:38<6:51:53,  8.60s/it, gpt_loss=0.333, loss_mean=0.317][A
+Train step of epoch 0:  55%|█████▌    | 3560/6434 [8:20:45<6:51:53,  8.60s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  55%|█████▌    | 3561/6434 [8:20:45<6:35:57,  8.27s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  55%|█████▌    | 3561/6434 [8:20:53<6:35:57,  8.27s/it, gpt_loss=0.441, loss_mean=0.33] [A
+Train step of epoch 0:  55%|█████▌    | 3562/6434 [8:20:53<6:33:49,  8.23s/it, gpt_loss=0.441, loss_mean=0.33][A
+Train step of epoch 0:  55%|█████▌    | 3562/6434 [8:21:01<6:33:49,  8.23s/it, gpt_loss=0.268, loss_mean=0.324][A
+Train step of epoch 0:  55%|█████▌    | 3563/6434 [8:21:01<6:29:37,  8.14s/it, gpt_loss=0.268, loss_mean=0.324][A
+Train step of epoch 0:  55%|█████▌    | 3563/6434 [8:21:09<6:29:37,  8.14s/it, gpt_loss=0.348, loss_mean=0.326][A
+Train step of epoch 0:  55%|█████▌    | 3564/6434 [8:21:09<6:28:59,  8.13s/it, gpt_loss=0.348, loss_mean=0.326][A
+Train step of epoch 0:  55%|█████▌    | 3564/6434 [8:21:17<6:28:59,  8.13s/it, gpt_loss=0.255, loss_mean=0.319][A
+Train step of epoch 0:  55%|█████▌    | 3565/6434 [8:21:17<6:16:17,  7.87s/it, gpt_loss=0.255, loss_mean=0.319][A
+Train step of epoch 0:  55%|█████▌    | 3565/6434 [8:21:26<6:16:17,  7.87s/it, gpt_loss=0.267, loss_mean=0.314][A
+Train step of epoch 0:  55%|█████▌    | 3566/6434 [8:21:26<6:36:26,  8.29s/it, gpt_loss=0.267, loss_mean=0.314][A
+Train step of epoch 0:  55%|█████▌    | 3566/6434 [8:21:34<6:36:26,  8.29s/it, gpt_loss=0.245, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▌    | 3567/6434 [8:21:34<6:37:16,  8.31s/it, gpt_loss=0.245, loss_mean=0.307][A
+Train step of epoch 0:  55%|█████▌    | 3567/6434 [8:21:44<6:37:16,  8.31s/it, gpt_loss=0.343, loss_mean=0.311][A
+Train step of epoch 0:  55%|█████▌    | 3568/6434 [8:21:44<6:53:12,  8.65s/it, gpt_loss=0.343, loss_mean=0.311][A
+Train step of epoch 0:  55%|█████▌    | 3568/6434 [8:21:53<6:53:12,  8.65s/it, gpt_loss=0.371, loss_mean=0.317][A
+Train step of epoch 0:  55%|█████▌    | 3569/6434 [8:21:53<7:01:57,  8.84s/it, gpt_loss=0.371, loss_mean=0.317][A
+[LID Router Debug] Step: 3570
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [0, 4, 6, 4, 0, 9, 6, 4, 9, 2]
+Active Experts in Batch: {0, 2, 4, 6, 9}
+
+Train step of epoch 0:  55%|█████▌    | 3569/6434 [8:22:02<7:01:57,  8.84s/it, gpt_loss=0.303, loss_mean=0.315][A
+Train step of epoch 0:  55%|█████▌    | 3570/6434 [8:22:02<7:02:13,  8.85s/it, gpt_loss=0.303, loss_mean=0.315][A
+Train step of epoch 0:  55%|█████▌    | 3570/6434 [8:22:12<7:02:13,  8.85s/it, gpt_loss=0.424, loss_mean=0.326][A
+Train step of epoch 0:  56%|█████▌    | 3571/6434 [8:22:12<7:16:56,  9.16s/it, gpt_loss=0.424, loss_mean=0.326][A
+Train step of epoch 0:  56%|█████▌    | 3571/6434 [8:22:21<7:16:56,  9.16s/it, gpt_loss=0.428, loss_mean=0.336][A
+Train step of epoch 0:  56%|█████▌    | 3572/6434 [8:22:21<7:26:13,  9.35s/it, gpt_loss=0.428, loss_mean=0.336][A
+Train step of epoch 0:  56%|█████▌    | 3572/6434 [8:22:30<7:26:13,  9.35s/it, gpt_loss=0.299, loss_mean=0.333][A
+Train step of epoch 0:  56%|█████▌    | 3573/6434 [8:22:30<7:12:42,  9.07s/it, gpt_loss=0.299, loss_mean=0.333][A
+Train step of epoch 0:  56%|█████▌    | 3573/6434 [8:22:38<7:12:42,  9.07s/it, gpt_loss=0.329, loss_mean=0.332][A
+Train step of epoch 0:  56%|█████▌    | 3574/6434 [8:22:38<7:00:29,  8.82s/it, gpt_loss=0.329, loss_mean=0.332][A
+Train step of epoch 0:  56%|█████▌    | 3574/6434 [8:22:45<7:00:29,  8.82s/it, gpt_loss=0.283, loss_mean=0.327][A
+Train step of epoch 0:  56%|█████▌    | 3575/6434 [8:22:45<6:39:06,  8.38s/it, gpt_loss=0.283, loss_mean=0.327][A
+Train step of epoch 0:  56%|█████▌    | 3575/6434 [8:22:53<6:39:06,  8.38s/it, gpt_loss=0.283, loss_mean=0.323][A
+Train step of epoch 0:  56%|█████▌    | 3576/6434 [8:22:53<6:33:25,  8.26s/it, gpt_loss=0.283, loss_mean=0.323][A
+Train step of epoch 0:  56%|█████▌    | 3576/6434 [8:23:01<6:33:25,  8.26s/it, gpt_loss=0.253, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3577/6434 [8:23:01<6:27:23,  8.14s/it, gpt_loss=0.253, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3577/6434 [8:23:10<6:27:23,  8.14s/it, gpt_loss=0.321, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3578/6434 [8:23:10<6:28:26,  8.16s/it, gpt_loss=0.321, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3578/6434 [8:23:18<6:28:26,  8.16s/it, gpt_loss=0.288, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▌    | 3579/6434 [8:23:18<6:38:12,  8.37s/it, gpt_loss=0.288, loss_mean=0.314][A
+[LID Router Debug] Step: 3580
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [1, 0, 2, 2, 2, 3, 3, 1, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3}
+
+Train step of epoch 0:  56%|█████▌    | 3579/6434 [8:23:28<6:38:12,  8.37s/it, gpt_loss=0.241, loss_mean=0.306][A
+Train step of epoch 0:  56%|█████▌    | 3580/6434 [8:23:28<6:51:44,  8.66s/it, gpt_loss=0.241, loss_mean=0.306][A
+Train step of epoch 0:  56%|█████▌    | 3580/6434 [8:23:37<6:51:44,  8.66s/it, gpt_loss=0.285, loss_mean=0.304][A
+Train step of epoch 0:  56%|█████▌    | 3581/6434 [8:23:37<6:56:27,  8.76s/it, gpt_loss=0.285, loss_mean=0.304][A
+Train step of epoch 0:  56%|█████▌    | 3581/6434 [8:23:47<6:56:27,  8.76s/it, gpt_loss=0.386, loss_mean=0.312][A
+Train step of epoch 0:  56%|█████▌    | 3582/6434 [8:23:47<7:17:45,  9.21s/it, gpt_loss=0.386, loss_mean=0.312][A
+Train step of epoch 0:  56%|█████▌    | 3582/6434 [8:23:55<7:17:45,  9.21s/it, gpt_loss=0.385, loss_mean=0.32] [A
+Train step of epoch 0:  56%|█████▌    | 3583/6434 [8:23:55<6:59:25,  8.83s/it, gpt_loss=0.385, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▌    | 3583/6434 [8:24:03<6:59:25,  8.83s/it, gpt_loss=0.282, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3584/6434 [8:24:03<6:54:17,  8.72s/it, gpt_loss=0.282, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3584/6434 [8:24:11<6:54:17,  8.72s/it, gpt_loss=0.353, loss_mean=0.32] [A
+Train step of epoch 0:  56%|█████▌    | 3585/6434 [8:24:11<6:45:12,  8.53s/it, gpt_loss=0.353, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▌    | 3585/6434 [8:24:19<6:45:12,  8.53s/it, gpt_loss=0.386, loss_mean=0.326][A
+Train step of epoch 0:  56%|█████▌    | 3586/6434 [8:24:19<6:31:29,  8.25s/it, gpt_loss=0.386, loss_mean=0.326][A
+Train step of epoch 0:  56%|█████▌    | 3586/6434 [8:24:27<6:31:29,  8.25s/it, gpt_loss=0.313, loss_mean=0.325][A
+Train step of epoch 0:  56%|█████▌    | 3587/6434 [8:24:27<6:26:47,  8.15s/it, gpt_loss=0.313, loss_mean=0.325][A
+Train step of epoch 0:  56%|█████▌    | 3587/6434 [8:24:36<6:26:47,  8.15s/it, gpt_loss=0.282, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3588/6434 [8:24:36<6:33:55,  8.30s/it, gpt_loss=0.282, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3588/6434 [8:24:45<6:33:55,  8.30s/it, gpt_loss=0.289, loss_mean=0.318][A
+Train step of epoch 0:  56%|█████▌    | 3589/6434 [8:24:45<6:42:47,  8.49s/it, gpt_loss=0.289, loss_mean=0.318][A
+[LID Router Debug] Step: 3590
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [0, 3, 3, 9, 9, 2, 4, 4, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  56%|█████▌    | 3589/6434 [8:24:53<6:42:47,  8.49s/it, gpt_loss=0.324, loss_mean=0.318][A
+Train step of epoch 0:  56%|█████▌    | 3590/6434 [8:24:53<6:39:34,  8.43s/it, gpt_loss=0.324, loss_mean=0.318][A
+Train step of epoch 0:  56%|█████▌    | 3590/6434 [8:25:02<6:39:34,  8.43s/it, gpt_loss=0.322, loss_mean=0.319][A
+Train step of epoch 0:  56%|█████▌    | 3591/6434 [8:25:02<6:45:30,  8.56s/it, gpt_loss=0.322, loss_mean=0.319][A
+Train step of epoch 0:  56%|█████▌    | 3591/6434 [8:25:11<6:45:30,  8.56s/it, gpt_loss=0.43, loss_mean=0.33]  [A
+Train step of epoch 0:  56%|█████▌    | 3592/6434 [8:25:11<6:55:56,  8.78s/it, gpt_loss=0.43, loss_mean=0.33][A
+Train step of epoch 0:  56%|█████▌    | 3592/6434 [8:25:19<6:55:56,  8.78s/it, gpt_loss=0.285, loss_mean=0.325][A
+Train step of epoch 0:  56%|█████▌    | 3593/6434 [8:25:19<6:50:40,  8.67s/it, gpt_loss=0.285, loss_mean=0.325][A
+Train step of epoch 0:  56%|█████▌    | 3593/6434 [8:25:29<6:50:40,  8.67s/it, gpt_loss=0.233, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3594/6434 [8:25:29<7:07:24,  9.03s/it, gpt_loss=0.233, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3594/6434 [8:25:38<7:07:24,  9.03s/it, gpt_loss=0.231, loss_mean=0.308][A
+Train step of epoch 0:  56%|█████▌    | 3595/6434 [8:25:38<6:56:20,  8.80s/it, gpt_loss=0.231, loss_mean=0.308][A
+Train step of epoch 0:  56%|█████▌    | 3595/6434 [8:25:46<6:56:20,  8.80s/it, gpt_loss=0.389, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3596/6434 [8:25:46<6:55:54,  8.79s/it, gpt_loss=0.389, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3596/6434 [8:25:55<6:55:54,  8.79s/it, gpt_loss=0.271, loss_mean=0.311][A
+Train step of epoch 0:  56%|█████▌    | 3597/6434 [8:25:55<6:57:04,  8.82s/it, gpt_loss=0.271, loss_mean=0.311][A
+Train step of epoch 0:  56%|█████▌    | 3597/6434 [8:26:04<6:57:04,  8.82s/it, gpt_loss=0.293, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3598/6434 [8:26:04<6:57:04,  8.82s/it, gpt_loss=0.293, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3598/6434 [8:26:12<6:57:04,  8.82s/it, gpt_loss=0.304, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3599/6434 [8:26:12<6:48:39,  8.65s/it, gpt_loss=0.304, loss_mean=0.309][A
+[LID Router Debug] Step: 3600
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [0, 4, 1, 1, 9, 0, 0, 3, 6, 3]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+[2026-02-07 00:22:24,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=0, lr=[1.8472610995049858e-05, 1.8472610995049858e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 00:22:24,264] [INFO] [timer.py:260:stop] epoch=0/micro_step=3600/global_step=1800, RunningAvgSamplesPerSec=4.750560367283726, CurrSamplesPerSec=5.090208347096951, MemAllocated=12.56GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  56%|█████▌    | 3599/6434 [8:26:20<6:48:39,  8.65s/it, gpt_loss=0.307, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3600/6434 [8:26:20<6:32:17,  8.31s/it, gpt_loss=0.307, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3600/6434 [8:26:27<6:32:17,  8.31s/it, gpt_loss=0.255, loss_mean=0.303][A
+Train step of epoch 0:  56%|█████▌    | 3601/6434 [8:26:27<6:22:52,  8.11s/it, gpt_loss=0.255, loss_mean=0.303][A
+Train step of epoch 0:  56%|█████▌    | 3601/6434 [8:26:37<6:22:52,  8.11s/it, gpt_loss=0.374, loss_mean=0.31] [A
+Train step of epoch 0:  56%|█████▌    | 3602/6434 [8:26:37<6:49:14,  8.67s/it, gpt_loss=0.374, loss_mean=0.31][A
+Train step of epoch 0:  56%|█████▌    | 3602/6434 [8:26:45<6:49:14,  8.67s/it, gpt_loss=0.243, loss_mean=0.304][A
+Train step of epoch 0:  56%|█████▌    | 3603/6434 [8:26:45<6:38:15,  8.44s/it, gpt_loss=0.243, loss_mean=0.304][A
+Train step of epoch 0:  56%|█████▌    | 3603/6434 [8:26:53<6:38:15,  8.44s/it, gpt_loss=0.318, loss_mean=0.305][A
+Train step of epoch 0:  56%|█████▌    | 3604/6434 [8:26:53<6:27:55,  8.22s/it, gpt_loss=0.318, loss_mean=0.305][A
+Train step of epoch 0:  56%|█████▌    | 3604/6434 [8:27:03<6:27:55,  8.22s/it, gpt_loss=0.326, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3605/6434 [8:27:03<6:45:36,  8.60s/it, gpt_loss=0.326, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3605/6434 [8:27:11<6:45:36,  8.60s/it, gpt_loss=0.372, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▌    | 3606/6434 [8:27:11<6:48:19,  8.66s/it, gpt_loss=0.372, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▌    | 3606/6434 [8:27:20<6:48:19,  8.66s/it, gpt_loss=0.254, loss_mean=0.308][A
+Train step of epoch 0:  56%|█████▌    | 3607/6434 [8:27:20<6:54:46,  8.80s/it, gpt_loss=0.254, loss_mean=0.308][A
+Train step of epoch 0:  56%|█████▌    | 3607/6434 [8:27:29<6:54:46,  8.80s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3608/6434 [8:27:29<6:47:23,  8.65s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3608/6434 [8:27:38<6:47:23,  8.65s/it, gpt_loss=0.331, loss_mean=0.309][A
+Train step of epoch 0:  56%|█████▌    | 3609/6434 [8:27:38<7:01:36,  8.95s/it, gpt_loss=0.331, loss_mean=0.309][A
+[LID Router Debug] Step: 3610
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 2, 4, 2, 1, 6, 7, 5, 4, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 7}
+
+Train step of epoch 0:  56%|█████▌    | 3609/6434 [8:27:46<7:01:36,  8.95s/it, gpt_loss=0.291, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3610/6434 [8:27:46<6:39:43,  8.49s/it, gpt_loss=0.291, loss_mean=0.307][A
+Train step of epoch 0:  56%|█████▌    | 3610/6434 [8:27:54<6:39:43,  8.49s/it, gpt_loss=0.376, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▌    | 3611/6434 [8:27:54<6:35:21,  8.40s/it, gpt_loss=0.376, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▌    | 3611/6434 [8:28:02<6:35:21,  8.40s/it, gpt_loss=0.329, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3612/6434 [8:28:02<6:35:05,  8.40s/it, gpt_loss=0.329, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▌    | 3612/6434 [8:28:11<6:35:05,  8.40s/it, gpt_loss=0.371, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3613/6434 [8:28:11<6:43:01,  8.57s/it, gpt_loss=0.371, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3613/6434 [8:28:20<6:43:01,  8.57s/it, gpt_loss=0.391, loss_mean=0.328][A
+Train step of epoch 0:  56%|█████▌    | 3614/6434 [8:28:20<6:41:02,  8.53s/it, gpt_loss=0.391, loss_mean=0.328][A
+Train step of epoch 0:  56%|█████▌    | 3614/6434 [8:28:29<6:41:02,  8.53s/it, gpt_loss=0.316, loss_mean=0.327][A
+Train step of epoch 0:  56%|█████▌    | 3615/6434 [8:28:29<6:43:19,  8.58s/it, gpt_loss=0.316, loss_mean=0.327][A
+Train step of epoch 0:  56%|█████▌    | 3615/6434 [8:28:38<6:43:19,  8.58s/it, gpt_loss=0.268, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3616/6434 [8:28:38<6:51:27,  8.76s/it, gpt_loss=0.268, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▌    | 3616/6434 [8:28:47<6:51:27,  8.76s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  56%|█████▌    | 3617/6434 [8:28:47<6:56:28,  8.87s/it, gpt_loss=0.354, loss_mean=0.324][A
+Train step of epoch 0:  56%|█████▌    | 3617/6434 [8:28:56<6:56:28,  8.87s/it, gpt_loss=0.282, loss_mean=0.32] [A
+Train step of epoch 0:  56%|█████▌    | 3618/6434 [8:28:56<6:57:57,  8.91s/it, gpt_loss=0.282, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▌    | 3618/6434 [8:29:04<6:57:57,  8.91s/it, gpt_loss=0.314, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▌    | 3619/6434 [8:29:04<6:43:31,  8.60s/it, gpt_loss=0.314, loss_mean=0.32][A
+[LID Router Debug] Step: 3620
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [5, 0, 6, 0, 0, 4, 10, 5, 1, 1]
+Active Experts in Batch: {0, 1, 4, 5, 6, 10}
+
+Train step of epoch 0:  56%|█████▌    | 3619/6434 [8:29:13<6:43:31,  8.60s/it, gpt_loss=0.281, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▋    | 3620/6434 [8:29:13<6:59:13,  8.94s/it, gpt_loss=0.281, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▋    | 3620/6434 [8:29:21<6:59:13,  8.94s/it, gpt_loss=0.307, loss_mean=0.315][A
+Train step of epoch 0:  56%|█████▋    | 3621/6434 [8:29:21<6:43:14,  8.60s/it, gpt_loss=0.307, loss_mean=0.315][A
+Train step of epoch 0:  56%|█████▋    | 3621/6434 [8:29:28<6:43:14,  8.60s/it, gpt_loss=0.332, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▋    | 3622/6434 [8:29:28<6:23:34,  8.18s/it, gpt_loss=0.332, loss_mean=0.316][A
+Train step of epoch 0:  56%|█████▋    | 3622/6434 [8:29:37<6:23:34,  8.18s/it, gpt_loss=0.322, loss_mean=0.317][A
+Train step of epoch 0:  56%|█████▋    | 3623/6434 [8:29:37<6:28:31,  8.29s/it, gpt_loss=0.322, loss_mean=0.317][A
+Train step of epoch 0:  56%|█████▋    | 3623/6434 [8:29:46<6:28:31,  8.29s/it, gpt_loss=0.244, loss_mean=0.31] [A
+Train step of epoch 0:  56%|█████▋    | 3624/6434 [8:29:46<6:34:25,  8.42s/it, gpt_loss=0.244, loss_mean=0.31][A
+Train step of epoch 0:  56%|█████▋    | 3624/6434 [8:29:54<6:34:25,  8.42s/it, gpt_loss=0.411, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▋    | 3625/6434 [8:29:54<6:25:56,  8.24s/it, gpt_loss=0.411, loss_mean=0.32][A
+Train step of epoch 0:  56%|█████▋    | 3625/6434 [8:30:02<6:25:56,  8.24s/it, gpt_loss=0.346, loss_mean=0.322][A
+Train step of epoch 0:  56%|█████▋    | 3626/6434 [8:30:02<6:30:26,  8.34s/it, gpt_loss=0.346, loss_mean=0.322][A
+Train step of epoch 0:  56%|█████▋    | 3626/6434 [8:30:11<6:30:26,  8.34s/it, gpt_loss=0.284, loss_mean=0.319][A
+Train step of epoch 0:  56%|█████▋    | 3627/6434 [8:30:11<6:35:19,  8.45s/it, gpt_loss=0.284, loss_mean=0.319][A
+Train step of epoch 0:  56%|█████▋    | 3627/6434 [8:30:21<6:35:19,  8.45s/it, gpt_loss=0.315, loss_mean=0.318][A
+Train step of epoch 0:  56%|█████▋    | 3628/6434 [8:30:21<6:58:05,  8.94s/it, gpt_loss=0.315, loss_mean=0.318][A
+Train step of epoch 0:  56%|█████▋    | 3628/6434 [8:30:30<6:58:05,  8.94s/it, gpt_loss=0.362, loss_mean=0.323][A
+Train step of epoch 0:  56%|█████▋    | 3629/6434 [8:30:30<6:55:21,  8.88s/it, gpt_loss=0.362, loss_mean=0.323][A
+[LID Router Debug] Step: 3630
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 4, 3, 6, 5, 5, 2, 0, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  56%|█████▋    | 3629/6434 [8:30:38<6:55:21,  8.88s/it, gpt_loss=0.307, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▋    | 3630/6434 [8:30:38<6:41:24,  8.59s/it, gpt_loss=0.307, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▋    | 3630/6434 [8:30:46<6:41:24,  8.59s/it, gpt_loss=0.345, loss_mean=0.323][A
+Train step of epoch 0:  56%|█████▋    | 3631/6434 [8:30:46<6:36:22,  8.48s/it, gpt_loss=0.345, loss_mean=0.323][A
+Train step of epoch 0:  56%|█████▋    | 3631/6434 [8:30:54<6:36:22,  8.48s/it, gpt_loss=0.263, loss_mean=0.317][A
+Train step of epoch 0:  56%|█████▋    | 3632/6434 [8:30:54<6:32:13,  8.40s/it, gpt_loss=0.263, loss_mean=0.317][A
+Train step of epoch 0:  56%|█████▋    | 3632/6434 [8:31:02<6:32:13,  8.40s/it, gpt_loss=0.353, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▋    | 3633/6434 [8:31:02<6:23:11,  8.21s/it, gpt_loss=0.353, loss_mean=0.321][A
+Train step of epoch 0:  56%|█████▋    | 3633/6434 [8:31:10<6:23:11,  8.21s/it, gpt_loss=0.252, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▋    | 3634/6434 [8:31:10<6:26:06,  8.27s/it, gpt_loss=0.252, loss_mean=0.314][A
+Train step of epoch 0:  56%|█████▋    | 3634/6434 [8:31:18<6:26:06,  8.27s/it, gpt_loss=0.287, loss_mean=0.311][A
+Train step of epoch 0:  56%|█████▋    | 3635/6434 [8:31:18<6:23:04,  8.21s/it, gpt_loss=0.287, loss_mean=0.311][A
+Train step of epoch 0:  56%|█████▋    | 3635/6434 [8:31:26<6:23:04,  8.21s/it, gpt_loss=0.302, loss_mean=0.31] [A
+Train step of epoch 0:  57%|█████▋    | 3636/6434 [8:31:26<6:17:02,  8.09s/it, gpt_loss=0.302, loss_mean=0.31][A
+Train step of epoch 0:  57%|█████▋    | 3636/6434 [8:31:35<6:17:02,  8.09s/it, gpt_loss=0.277, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3637/6434 [8:31:35<6:23:39,  8.23s/it, gpt_loss=0.277, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3637/6434 [8:31:43<6:23:39,  8.23s/it, gpt_loss=0.304, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3638/6434 [8:31:43<6:28:08,  8.33s/it, gpt_loss=0.304, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3638/6434 [8:31:51<6:28:08,  8.33s/it, gpt_loss=0.359, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3639/6434 [8:31:51<6:17:12,  8.10s/it, gpt_loss=0.359, loss_mean=0.312][A
+[LID Router Debug] Step: 3640
+Batch Size: 10
+Audio Batch Size: 146
+LID Assignments: [4, 2, 9, 6, 1, 4, 3, 1, 3, 3]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  57%|█████▋    | 3639/6434 [8:32:00<6:17:12,  8.10s/it, gpt_loss=0.335, loss_mean=0.314][A
+Train step of epoch 0:  57%|█████▋    | 3640/6434 [8:32:00<6:33:04,  8.44s/it, gpt_loss=0.335, loss_mean=0.314][A
+Train step of epoch 0:  57%|█████▋    | 3640/6434 [8:32:08<6:33:04,  8.44s/it, gpt_loss=0.285, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3641/6434 [8:32:08<6:25:12,  8.28s/it, gpt_loss=0.285, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3641/6434 [8:32:15<6:25:12,  8.28s/it, gpt_loss=0.254, loss_mean=0.306][A
+Train step of epoch 0:  57%|█████▋    | 3642/6434 [8:32:15<6:13:58,  8.04s/it, gpt_loss=0.254, loss_mean=0.306][A
+Train step of epoch 0:  57%|█████▋    | 3642/6434 [8:32:24<6:13:58,  8.04s/it, gpt_loss=0.231, loss_mean=0.298][A
+Train step of epoch 0:  57%|█████▋    | 3643/6434 [8:32:24<6:21:20,  8.20s/it, gpt_loss=0.231, loss_mean=0.298][A
+Train step of epoch 0:  57%|█████▋    | 3643/6434 [8:32:32<6:21:20,  8.20s/it, gpt_loss=0.344, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3644/6434 [8:32:32<6:24:03,  8.26s/it, gpt_loss=0.344, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3644/6434 [8:32:41<6:24:03,  8.26s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  57%|█████▋    | 3645/6434 [8:32:41<6:35:25,  8.51s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  57%|█████▋    | 3645/6434 [8:32:50<6:35:25,  8.51s/it, gpt_loss=0.332, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3646/6434 [8:32:50<6:35:00,  8.50s/it, gpt_loss=0.332, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3646/6434 [8:32:59<6:35:00,  8.50s/it, gpt_loss=0.299, loss_mean=0.302][A
+Train step of epoch 0:  57%|█████▋    | 3647/6434 [8:32:59<6:42:21,  8.66s/it, gpt_loss=0.299, loss_mean=0.302][A
+Train step of epoch 0:  57%|█████▋    | 3647/6434 [8:33:06<6:42:21,  8.66s/it, gpt_loss=0.373, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3648/6434 [8:33:06<6:23:23,  8.26s/it, gpt_loss=0.373, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3648/6434 [8:33:14<6:23:23,  8.26s/it, gpt_loss=0.264, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3649/6434 [8:33:14<6:13:06,  8.04s/it, gpt_loss=0.264, loss_mean=0.305][A
+[LID Router Debug] Step: 3650
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [5, 3, 6, 6, 0, 0, 2, 4, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  57%|█████▋    | 3649/6434 [8:33:22<6:13:06,  8.04s/it, gpt_loss=0.359, loss_mean=0.31] [A
+Train step of epoch 0:  57%|█████▋    | 3650/6434 [8:33:22<6:17:19,  8.13s/it, gpt_loss=0.359, loss_mean=0.31][A
+Train step of epoch 0:  57%|█████▋    | 3650/6434 [8:33:31<6:17:19,  8.13s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3651/6434 [8:33:31<6:20:58,  8.21s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3651/6434 [8:33:39<6:20:58,  8.21s/it, gpt_loss=0.271, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3652/6434 [8:33:39<6:19:33,  8.19s/it, gpt_loss=0.271, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3652/6434 [8:33:47<6:19:33,  8.19s/it, gpt_loss=0.311, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3653/6434 [8:33:47<6:18:26,  8.16s/it, gpt_loss=0.311, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3653/6434 [8:33:56<6:18:26,  8.16s/it, gpt_loss=0.404, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3654/6434 [8:33:56<6:36:01,  8.55s/it, gpt_loss=0.404, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3654/6434 [8:34:04<6:36:01,  8.55s/it, gpt_loss=0.285, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3655/6434 [8:34:04<6:18:34,  8.17s/it, gpt_loss=0.285, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3655/6434 [8:34:12<6:18:34,  8.17s/it, gpt_loss=0.239, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3656/6434 [8:34:12<6:24:41,  8.31s/it, gpt_loss=0.239, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3656/6434 [8:34:20<6:24:41,  8.31s/it, gpt_loss=0.245, loss_mean=0.298][A
+Train step of epoch 0:  57%|█████▋    | 3657/6434 [8:34:20<6:24:51,  8.32s/it, gpt_loss=0.245, loss_mean=0.298][A
+Train step of epoch 0:  57%|█████▋    | 3657/6434 [8:34:29<6:24:51,  8.32s/it, gpt_loss=0.338, loss_mean=0.302][A
+Train step of epoch 0:  57%|█████▋    | 3658/6434 [8:34:29<6:23:35,  8.29s/it, gpt_loss=0.338, loss_mean=0.302][A
+Train step of epoch 0:  57%|█████▋    | 3658/6434 [8:34:37<6:23:35,  8.29s/it, gpt_loss=0.322, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3659/6434 [8:34:37<6:19:01,  8.20s/it, gpt_loss=0.322, loss_mean=0.304][A
+[LID Router Debug] Step: 3660
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [9, 9, 4, 3, 3, 9, 9, 0, 4, 0]
+Active Experts in Batch: {0, 9, 3, 4}
+
+Train step of epoch 0:  57%|█████▋    | 3659/6434 [8:34:45<6:19:01,  8.20s/it, gpt_loss=0.272, loss_mean=0.301][A
+Train step of epoch 0:  57%|█████▋    | 3660/6434 [8:34:45<6:25:02,  8.33s/it, gpt_loss=0.272, loss_mean=0.301][A
+Train step of epoch 0:  57%|█████▋    | 3660/6434 [8:34:54<6:25:02,  8.33s/it, gpt_loss=0.337, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3661/6434 [8:34:54<6:28:47,  8.41s/it, gpt_loss=0.337, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3661/6434 [8:35:03<6:28:47,  8.41s/it, gpt_loss=0.365, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3662/6434 [8:35:03<6:31:05,  8.47s/it, gpt_loss=0.365, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3662/6434 [8:35:10<6:31:05,  8.47s/it, gpt_loss=0.275, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3663/6434 [8:35:10<6:16:51,  8.16s/it, gpt_loss=0.275, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3663/6434 [8:35:19<6:16:51,  8.16s/it, gpt_loss=0.263, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3664/6434 [8:35:19<6:25:02,  8.34s/it, gpt_loss=0.263, loss_mean=0.303][A
+Train step of epoch 0:  57%|█████▋    | 3664/6434 [8:35:26<6:25:02,  8.34s/it, gpt_loss=0.342, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3665/6434 [8:35:26<6:10:29,  8.03s/it, gpt_loss=0.342, loss_mean=0.307][A
+Train step of epoch 0:  57%|█████▋    | 3665/6434 [8:35:35<6:10:29,  8.03s/it, gpt_loss=0.299, loss_mean=0.306][A
+Train step of epoch 0:  57%|█████▋    | 3666/6434 [8:35:35<6:23:21,  8.31s/it, gpt_loss=0.299, loss_mean=0.306][A
+Train step of epoch 0:  57%|█████▋    | 3666/6434 [8:35:44<6:23:21,  8.31s/it, gpt_loss=0.349, loss_mean=0.31] [A
+Train step of epoch 0:  57%|█████▋    | 3667/6434 [8:35:44<6:27:12,  8.40s/it, gpt_loss=0.349, loss_mean=0.31][A
+Train step of epoch 0:  57%|█████▋    | 3667/6434 [8:35:52<6:27:12,  8.40s/it, gpt_loss=0.29, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3668/6434 [8:35:52<6:31:14,  8.49s/it, gpt_loss=0.29, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3668/6434 [8:36:01<6:31:14,  8.49s/it, gpt_loss=0.281, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3669/6434 [8:36:01<6:29:54,  8.46s/it, gpt_loss=0.281, loss_mean=0.305][A
+[LID Router Debug] Step: 3670
+Batch Size: 10
+Audio Batch Size: 75
+LID Assignments: [5, 9, 1, 2, 6, 6, 2, 1, 9, 9]
+Active Experts in Batch: {1, 2, 5, 6, 9}
+
+Train step of epoch 0:  57%|█████▋    | 3669/6434 [8:36:09<6:29:54,  8.46s/it, gpt_loss=0.338, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3670/6434 [8:36:09<6:28:23,  8.43s/it, gpt_loss=0.338, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3670/6434 [8:36:17<6:28:23,  8.43s/it, gpt_loss=0.272, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3671/6434 [8:36:17<6:22:17,  8.30s/it, gpt_loss=0.272, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3671/6434 [8:36:24<6:22:17,  8.30s/it, gpt_loss=0.364, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3672/6434 [8:36:24<6:07:48,  7.99s/it, gpt_loss=0.364, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3672/6434 [8:36:33<6:07:48,  7.99s/it, gpt_loss=0.324, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3673/6434 [8:36:33<6:14:03,  8.13s/it, gpt_loss=0.324, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3673/6434 [8:36:42<6:14:03,  8.13s/it, gpt_loss=0.271, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3674/6434 [8:36:42<6:25:00,  8.37s/it, gpt_loss=0.271, loss_mean=0.308][A
+Train step of epoch 0:  57%|█████▋    | 3674/6434 [8:36:50<6:25:00,  8.37s/it, gpt_loss=0.379, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3675/6434 [8:36:50<6:25:22,  8.38s/it, gpt_loss=0.379, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3675/6434 [8:36:58<6:25:22,  8.38s/it, gpt_loss=0.317, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3676/6434 [8:36:58<6:21:14,  8.29s/it, gpt_loss=0.317, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3676/6434 [8:37:06<6:21:14,  8.29s/it, gpt_loss=0.324, loss_mean=0.316][A
+Train step of epoch 0:  57%|█████▋    | 3677/6434 [8:37:06<6:18:43,  8.24s/it, gpt_loss=0.324, loss_mean=0.316][A
+Train step of epoch 0:  57%|█████▋    | 3677/6434 [8:37:15<6:18:43,  8.24s/it, gpt_loss=0.382, loss_mean=0.323][A
+Train step of epoch 0:  57%|█████▋    | 3678/6434 [8:37:15<6:23:09,  8.34s/it, gpt_loss=0.382, loss_mean=0.323][A
+Train step of epoch 0:  57%|█████▋    | 3678/6434 [8:37:23<6:23:09,  8.34s/it, gpt_loss=0.325, loss_mean=0.323][A
+Train step of epoch 0:  57%|█████▋    | 3679/6434 [8:37:23<6:23:43,  8.36s/it, gpt_loss=0.325, loss_mean=0.323][A
+[LID Router Debug] Step: 3680
+Batch Size: 10
+Audio Batch Size: 76
+LID Assignments: [6, 1, 5, 1, 4, 2, 4, 6, 2, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  57%|█████▋    | 3679/6434 [8:37:30<6:23:43,  8.36s/it, gpt_loss=0.281, loss_mean=0.319][A
+Train step of epoch 0:  57%|█████▋    | 3680/6434 [8:37:30<6:03:36,  7.92s/it, gpt_loss=0.281, loss_mean=0.319][A
+Train step of epoch 0:  57%|█████▋    | 3680/6434 [8:37:39<6:03:36,  7.92s/it, gpt_loss=0.319, loss_mean=0.319][A
+Train step of epoch 0:  57%|█████▋    | 3681/6434 [8:37:39<6:17:22,  8.22s/it, gpt_loss=0.319, loss_mean=0.319][A
+Train step of epoch 0:  57%|█████▋    | 3681/6434 [8:37:49<6:17:22,  8.22s/it, gpt_loss=0.282, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3682/6434 [8:37:49<6:36:19,  8.64s/it, gpt_loss=0.282, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3682/6434 [8:37:57<6:36:19,  8.64s/it, gpt_loss=0.285, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3683/6434 [8:37:57<6:26:17,  8.43s/it, gpt_loss=0.285, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3683/6434 [8:38:07<6:26:17,  8.43s/it, gpt_loss=0.346, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3684/6434 [8:38:07<6:48:24,  8.91s/it, gpt_loss=0.346, loss_mean=0.315][A
+Train step of epoch 0:  57%|█████▋    | 3684/6434 [8:38:15<6:48:24,  8.91s/it, gpt_loss=0.281, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3685/6434 [8:38:15<6:45:56,  8.86s/it, gpt_loss=0.281, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3685/6434 [8:38:23<6:45:56,  8.86s/it, gpt_loss=0.279, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3686/6434 [8:38:23<6:25:11,  8.41s/it, gpt_loss=0.279, loss_mean=0.309][A
+Train step of epoch 0:  57%|█████▋    | 3686/6434 [8:38:32<6:25:11,  8.41s/it, gpt_loss=0.33, loss_mean=0.311] [A
+Train step of epoch 0:  57%|█████▋    | 3687/6434 [8:38:32<6:35:47,  8.64s/it, gpt_loss=0.33, loss_mean=0.311][A
+Train step of epoch 0:  57%|█████▋    | 3687/6434 [8:38:40<6:35:47,  8.64s/it, gpt_loss=0.419, loss_mean=0.322][A
+Train step of epoch 0:  57%|█████▋    | 3688/6434 [8:38:40<6:33:32,  8.60s/it, gpt_loss=0.419, loss_mean=0.322][A
+Train step of epoch 0:  57%|█████▋    | 3688/6434 [8:38:49<6:33:32,  8.60s/it, gpt_loss=0.35, loss_mean=0.325] [A
+Train step of epoch 0:  57%|█████▋    | 3689/6434 [8:38:49<6:33:08,  8.59s/it, gpt_loss=0.35, loss_mean=0.325][A
+[LID Router Debug] Step: 3690
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [1, 1, 2, 0, 8, 0, 9, 1, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 8, 9}
+
+Train step of epoch 0:  57%|█████▋    | 3689/6434 [8:38:57<6:33:08,  8.59s/it, gpt_loss=0.295, loss_mean=0.322][A
+Train step of epoch 0:  57%|█████▋    | 3690/6434 [8:38:57<6:27:27,  8.47s/it, gpt_loss=0.295, loss_mean=0.322][A
+Train step of epoch 0:  57%|█████▋    | 3690/6434 [8:39:06<6:27:27,  8.47s/it, gpt_loss=0.384, loss_mean=0.328][A
+Train step of epoch 0:  57%|█████▋    | 3691/6434 [8:39:06<6:24:56,  8.42s/it, gpt_loss=0.384, loss_mean=0.328][A
+Train step of epoch 0:  57%|█████▋    | 3691/6434 [8:39:15<6:24:56,  8.42s/it, gpt_loss=0.285, loss_mean=0.324][A
+Train step of epoch 0:  57%|█████▋    | 3692/6434 [8:39:15<6:34:45,  8.64s/it, gpt_loss=0.285, loss_mean=0.324][A
+Train step of epoch 0:  57%|█████▋    | 3692/6434 [8:39:23<6:34:45,  8.64s/it, gpt_loss=0.248, loss_mean=0.316][A
+Train step of epoch 0:  57%|█████▋    | 3693/6434 [8:39:23<6:28:24,  8.50s/it, gpt_loss=0.248, loss_mean=0.316][A
+Train step of epoch 0:  57%|█████▋    | 3693/6434 [8:39:30<6:28:24,  8.50s/it, gpt_loss=0.275, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3694/6434 [8:39:30<6:15:12,  8.22s/it, gpt_loss=0.275, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3694/6434 [8:39:38<6:15:12,  8.22s/it, gpt_loss=0.376, loss_mean=0.318][A
+Train step of epoch 0:  57%|█████▋    | 3695/6434 [8:39:38<6:12:09,  8.15s/it, gpt_loss=0.376, loss_mean=0.318][A
+Train step of epoch 0:  57%|█████▋    | 3695/6434 [8:39:47<6:12:09,  8.15s/it, gpt_loss=0.254, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3696/6434 [8:39:47<6:15:23,  8.23s/it, gpt_loss=0.254, loss_mean=0.312][A
+Train step of epoch 0:  57%|█████▋    | 3696/6434 [8:39:55<6:15:23,  8.23s/it, gpt_loss=0.241, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3697/6434 [8:39:55<6:14:20,  8.21s/it, gpt_loss=0.241, loss_mean=0.305][A
+Train step of epoch 0:  57%|█████▋    | 3697/6434 [8:40:03<6:14:20,  8.21s/it, gpt_loss=0.3, loss_mean=0.304]  [A
+Train step of epoch 0:  57%|█████▋    | 3698/6434 [8:40:03<6:17:20,  8.27s/it, gpt_loss=0.3, loss_mean=0.304][A
+Train step of epoch 0:  57%|█████▋    | 3698/6434 [8:40:11<6:17:20,  8.27s/it, gpt_loss=0.271, loss_mean=0.301][A
+Train step of epoch 0:  57%|█████▋    | 3699/6434 [8:40:11<6:12:21,  8.17s/it, gpt_loss=0.271, loss_mean=0.301][A
+[LID Router Debug] Step: 3700
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [8, 1, 3, 0, 2, 2, 1, 4, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 8, 9}
+
+Train step of epoch 0:  57%|█████▋    | 3699/6434 [8:40:20<6:12:21,  8.17s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  58%|█████▊    | 3700/6434 [8:40:20<6:12:25,  8.17s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  58%|█████▊    | 3700/6434 [8:40:29<6:12:25,  8.17s/it, gpt_loss=0.396, loss_mean=0.306][A
+Train step of epoch 0:  58%|█████▊    | 3701/6434 [8:40:29<6:34:46,  8.67s/it, gpt_loss=0.396, loss_mean=0.306][A
+Train step of epoch 0:  58%|█████▊    | 3701/6434 [8:40:38<6:34:46,  8.67s/it, gpt_loss=0.363, loss_mean=0.312][A
+Train step of epoch 0:  58%|█████▊    | 3702/6434 [8:40:38<6:38:32,  8.75s/it, gpt_loss=0.363, loss_mean=0.312][A
+Train step of epoch 0:  58%|█████▊    | 3702/6434 [8:40:47<6:38:32,  8.75s/it, gpt_loss=0.256, loss_mean=0.306][A
+Train step of epoch 0:  58%|█████▊    | 3703/6434 [8:40:47<6:39:08,  8.77s/it, gpt_loss=0.256, loss_mean=0.306][A
+Train step of epoch 0:  58%|█████▊    | 3703/6434 [8:40:56<6:39:08,  8.77s/it, gpt_loss=0.293, loss_mean=0.305][A
+Train step of epoch 0:  58%|█████▊    | 3704/6434 [8:40:56<6:38:54,  8.77s/it, gpt_loss=0.293, loss_mean=0.305][A
+Train step of epoch 0:  58%|█████▊    | 3704/6434 [8:41:04<6:38:54,  8.77s/it, gpt_loss=0.258, loss_mean=0.3]  [A
+Train step of epoch 0:  58%|█████▊    | 3705/6434 [8:41:04<6:34:31,  8.67s/it, gpt_loss=0.258, loss_mean=0.3][A
+Train step of epoch 0:  58%|█████▊    | 3705/6434 [8:41:14<6:34:31,  8.67s/it, gpt_loss=0.301, loss_mean=0.3][A
+Train step of epoch 0:  58%|█████▊    | 3706/6434 [8:41:14<6:52:27,  9.07s/it, gpt_loss=0.301, loss_mean=0.3][A
+Train step of epoch 0:  58%|█████▊    | 3706/6434 [8:41:23<6:52:27,  9.07s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  58%|█████▊    | 3707/6434 [8:41:23<6:40:12,  8.81s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  58%|█████▊    | 3707/6434 [8:41:31<6:40:12,  8.81s/it, gpt_loss=0.342, loss_mean=0.303][A
+Train step of epoch 0:  58%|█████▊    | 3708/6434 [8:41:31<6:37:27,  8.75s/it, gpt_loss=0.342, loss_mean=0.303][A
+Train step of epoch 0:  58%|█████▊    | 3708/6434 [8:41:40<6:37:27,  8.75s/it, gpt_loss=0.259, loss_mean=0.299][A
+Train step of epoch 0:  58%|█████▊    | 3709/6434 [8:41:40<6:32:22,  8.64s/it, gpt_loss=0.259, loss_mean=0.299][A
+[LID Router Debug] Step: 3710
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [3, 3, 0, 9, 2, 0, 0, 9, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3709/6434 [8:41:48<6:32:22,  8.64s/it, gpt_loss=0.228, loss_mean=0.292][A
+Train step of epoch 0:  58%|█████▊    | 3710/6434 [8:41:48<6:32:03,  8.64s/it, gpt_loss=0.228, loss_mean=0.292][A
+Train step of epoch 0:  58%|█████▊    | 3710/6434 [8:41:56<6:32:03,  8.64s/it, gpt_loss=0.343, loss_mean=0.297][A
+Train step of epoch 0:  58%|█████▊    | 3711/6434 [8:41:56<6:25:35,  8.50s/it, gpt_loss=0.343, loss_mean=0.297][A
+Train step of epoch 0:  58%|█████▊    | 3711/6434 [8:42:05<6:25:35,  8.50s/it, gpt_loss=0.332, loss_mean=0.3]  [A
+Train step of epoch 0:  58%|█████▊    | 3712/6434 [8:42:05<6:28:22,  8.56s/it, gpt_loss=0.332, loss_mean=0.3][A
+Train step of epoch 0:  58%|█████▊    | 3712/6434 [8:42:14<6:28:22,  8.56s/it, gpt_loss=0.21, loss_mean=0.291][A
+Train step of epoch 0:  58%|█████▊    | 3713/6434 [8:42:14<6:29:18,  8.58s/it, gpt_loss=0.21, loss_mean=0.291][A
+Train step of epoch 0:  58%|█████▊    | 3713/6434 [8:42:23<6:29:18,  8.58s/it, gpt_loss=0.302, loss_mean=0.292][A
+Train step of epoch 0:  58%|█████▊    | 3714/6434 [8:42:23<6:39:14,  8.81s/it, gpt_loss=0.302, loss_mean=0.292][A
+Train step of epoch 0:  58%|█████▊    | 3714/6434 [8:42:32<6:39:14,  8.81s/it, gpt_loss=0.24, loss_mean=0.287] [A
+Train step of epoch 0:  58%|█████▊    | 3715/6434 [8:42:32<6:36:15,  8.74s/it, gpt_loss=0.24, loss_mean=0.287][A
+Train step of epoch 0:  58%|█████▊    | 3715/6434 [8:42:40<6:36:15,  8.74s/it, gpt_loss=0.327, loss_mean=0.291][A
+Train step of epoch 0:  58%|█████▊    | 3716/6434 [8:42:40<6:29:46,  8.60s/it, gpt_loss=0.327, loss_mean=0.291][A
+Train step of epoch 0:  58%|█████▊    | 3716/6434 [8:42:48<6:29:46,  8.60s/it, gpt_loss=0.279, loss_mean=0.29] [A
+Train step of epoch 0:  58%|█████▊    | 3717/6434 [8:42:48<6:24:43,  8.50s/it, gpt_loss=0.279, loss_mean=0.29][A
+Train step of epoch 0:  58%|█████▊    | 3717/6434 [8:42:57<6:24:43,  8.50s/it, gpt_loss=0.388, loss_mean=0.3] [A
+Train step of epoch 0:  58%|█████▊    | 3718/6434 [8:42:57<6:28:56,  8.59s/it, gpt_loss=0.388, loss_mean=0.3][A
+Train step of epoch 0:  58%|█████▊    | 3718/6434 [8:43:05<6:28:56,  8.59s/it, gpt_loss=0.343, loss_mean=0.304][A
+Train step of epoch 0:  58%|█████▊    | 3719/6434 [8:43:05<6:16:06,  8.31s/it, gpt_loss=0.343, loss_mean=0.304][A
+[LID Router Debug] Step: 3720
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [5, 4, 5, 5, 5, 0, 1, 9, 5, 6]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3719/6434 [8:43:12<6:16:06,  8.31s/it, gpt_loss=0.481, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3720/6434 [8:43:12<6:06:05,  8.09s/it, gpt_loss=0.481, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3720/6434 [8:43:22<6:06:05,  8.09s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  58%|█████▊    | 3721/6434 [8:43:22<6:22:53,  8.47s/it, gpt_loss=0.292, loss_mean=0.319][A
+Train step of epoch 0:  58%|█████▊    | 3721/6434 [8:43:30<6:22:53,  8.47s/it, gpt_loss=0.299, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3722/6434 [8:43:30<6:16:25,  8.33s/it, gpt_loss=0.299, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3722/6434 [8:43:39<6:16:25,  8.33s/it, gpt_loss=0.225, loss_mean=0.308][A
+Train step of epoch 0:  58%|█████▊    | 3723/6434 [8:43:39<6:28:33,  8.60s/it, gpt_loss=0.225, loss_mean=0.308][A
+Train step of epoch 0:  58%|█████▊    | 3723/6434 [8:43:46<6:28:33,  8.60s/it, gpt_loss=0.364, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3724/6434 [8:43:46<6:12:41,  8.25s/it, gpt_loss=0.364, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3724/6434 [8:43:55<6:12:41,  8.25s/it, gpt_loss=0.292, loss_mean=0.311][A
+Train step of epoch 0:  58%|█████▊    | 3725/6434 [8:43:55<6:17:40,  8.36s/it, gpt_loss=0.292, loss_mean=0.311][A
+Train step of epoch 0:  58%|█████▊    | 3725/6434 [8:44:03<6:17:40,  8.36s/it, gpt_loss=0.385, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3726/6434 [8:44:03<6:20:52,  8.44s/it, gpt_loss=0.385, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3726/6434 [8:44:11<6:20:52,  8.44s/it, gpt_loss=0.307, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3727/6434 [8:44:11<6:08:36,  8.17s/it, gpt_loss=0.307, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3727/6434 [8:44:19<6:08:36,  8.17s/it, gpt_loss=0.433, loss_mean=0.329][A
+Train step of epoch 0:  58%|█████▊    | 3728/6434 [8:44:19<6:00:49,  8.00s/it, gpt_loss=0.433, loss_mean=0.329][A
+Train step of epoch 0:  58%|█████▊    | 3728/6434 [8:44:28<6:00:49,  8.00s/it, gpt_loss=0.34, loss_mean=0.33]  [A
+Train step of epoch 0:  58%|█████▊    | 3729/6434 [8:44:28<6:16:17,  8.35s/it, gpt_loss=0.34, loss_mean=0.33][A
+[LID Router Debug] Step: 3730
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [9, 6, 4, 4, 9, 3, 5, 2, 4, 5]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3729/6434 [8:44:36<6:16:17,  8.35s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  58%|█████▊    | 3730/6434 [8:44:36<6:16:59,  8.37s/it, gpt_loss=0.303, loss_mean=0.327][A
+Train step of epoch 0:  58%|█████▊    | 3730/6434 [8:44:44<6:16:59,  8.37s/it, gpt_loss=0.327, loss_mean=0.327][A
+Train step of epoch 0:  58%|█████▊    | 3731/6434 [8:44:44<6:09:42,  8.21s/it, gpt_loss=0.327, loss_mean=0.327][A
+Train step of epoch 0:  58%|█████▊    | 3731/6434 [8:44:51<6:09:42,  8.21s/it, gpt_loss=0.439, loss_mean=0.338][A
+Train step of epoch 0:  58%|█████▊    | 3732/6434 [8:44:51<5:59:41,  7.99s/it, gpt_loss=0.439, loss_mean=0.338][A
+Train step of epoch 0:  58%|█████▊    | 3732/6434 [8:44:59<5:59:41,  7.99s/it, gpt_loss=0.298, loss_mean=0.334][A
+Train step of epoch 0:  58%|█████▊    | 3733/6434 [8:44:59<5:53:34,  7.85s/it, gpt_loss=0.298, loss_mean=0.334][A
+Train step of epoch 0:  58%|█████▊    | 3733/6434 [8:45:07<5:53:34,  7.85s/it, gpt_loss=0.347, loss_mean=0.336][A
+Train step of epoch 0:  58%|█████▊    | 3734/6434 [8:45:07<5:53:59,  7.87s/it, gpt_loss=0.347, loss_mean=0.336][A
+Train step of epoch 0:  58%|█████▊    | 3734/6434 [8:45:15<5:53:59,  7.87s/it, gpt_loss=0.369, loss_mean=0.339][A
+Train step of epoch 0:  58%|█████▊    | 3735/6434 [8:45:15<5:55:01,  7.89s/it, gpt_loss=0.369, loss_mean=0.339][A
+Train step of epoch 0:  58%|█████▊    | 3735/6434 [8:45:22<5:55:01,  7.89s/it, gpt_loss=0.369, loss_mean=0.342][A
+Train step of epoch 0:  58%|█████▊    | 3736/6434 [8:45:22<5:50:00,  7.78s/it, gpt_loss=0.369, loss_mean=0.342][A
+Train step of epoch 0:  58%|█████▊    | 3736/6434 [8:45:30<5:50:00,  7.78s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  58%|█████▊    | 3737/6434 [8:45:30<5:52:30,  7.84s/it, gpt_loss=0.312, loss_mean=0.339][A
+Train step of epoch 0:  58%|█████▊    | 3737/6434 [8:45:38<5:52:30,  7.84s/it, gpt_loss=0.312, loss_mean=0.336][A
+Train step of epoch 0:  58%|█████▊    | 3738/6434 [8:45:38<5:49:11,  7.77s/it, gpt_loss=0.312, loss_mean=0.336][A
+Train step of epoch 0:  58%|█████▊    | 3738/6434 [8:45:46<5:49:11,  7.77s/it, gpt_loss=0.361, loss_mean=0.339][A
+Train step of epoch 0:  58%|█████▊    | 3739/6434 [8:45:46<5:56:12,  7.93s/it, gpt_loss=0.361, loss_mean=0.339][A
+[LID Router Debug] Step: 3740
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [1, 2, 1, 2, 0, 2, 1, 9, 3, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3739/6434 [8:45:54<5:56:12,  7.93s/it, gpt_loss=0.288, loss_mean=0.334][A
+Train step of epoch 0:  58%|█████▊    | 3740/6434 [8:45:54<5:50:56,  7.82s/it, gpt_loss=0.288, loss_mean=0.334][A
+Train step of epoch 0:  58%|█████▊    | 3740/6434 [8:46:02<5:50:56,  7.82s/it, gpt_loss=0.222, loss_mean=0.323][A
+Train step of epoch 0:  58%|█████▊    | 3741/6434 [8:46:02<5:59:14,  8.00s/it, gpt_loss=0.222, loss_mean=0.323][A
+Train step of epoch 0:  58%|█████▊    | 3741/6434 [8:46:11<5:59:14,  8.00s/it, gpt_loss=0.235, loss_mean=0.314][A
+Train step of epoch 0:  58%|█████▊    | 3742/6434 [8:46:11<6:05:11,  8.14s/it, gpt_loss=0.235, loss_mean=0.314][A
+Train step of epoch 0:  58%|█████▊    | 3742/6434 [8:46:19<6:05:11,  8.14s/it, gpt_loss=0.375, loss_mean=0.32] [A
+Train step of epoch 0:  58%|█████▊    | 3743/6434 [8:46:19<6:12:09,  8.30s/it, gpt_loss=0.375, loss_mean=0.32][A
+Train step of epoch 0:  58%|█████▊    | 3743/6434 [8:46:29<6:12:09,  8.30s/it, gpt_loss=0.342, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3744/6434 [8:46:29<6:28:20,  8.66s/it, gpt_loss=0.342, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3744/6434 [8:46:37<6:28:20,  8.66s/it, gpt_loss=0.354, loss_mean=0.325][A
+Train step of epoch 0:  58%|█████▊    | 3745/6434 [8:46:37<6:20:12,  8.48s/it, gpt_loss=0.354, loss_mean=0.325][A
+Train step of epoch 0:  58%|█████▊    | 3745/6434 [8:46:46<6:20:12,  8.48s/it, gpt_loss=0.403, loss_mean=0.333][A
+Train step of epoch 0:  58%|█████▊    | 3746/6434 [8:46:46<6:21:22,  8.51s/it, gpt_loss=0.403, loss_mean=0.333][A
+Train step of epoch 0:  58%|█████▊    | 3746/6434 [8:46:55<6:21:22,  8.51s/it, gpt_loss=0.293, loss_mean=0.329][A
+Train step of epoch 0:  58%|█████▊    | 3747/6434 [8:46:55<6:27:54,  8.66s/it, gpt_loss=0.293, loss_mean=0.329][A
+Train step of epoch 0:  58%|█████▊    | 3747/6434 [8:47:05<6:27:54,  8.66s/it, gpt_loss=0.287, loss_mean=0.325][A
+Train step of epoch 0:  58%|█████▊    | 3748/6434 [8:47:05<6:47:32,  9.10s/it, gpt_loss=0.287, loss_mean=0.325][A
+Train step of epoch 0:  58%|█████▊    | 3748/6434 [8:47:13<6:47:32,  9.10s/it, gpt_loss=0.22, loss_mean=0.314] [A
+Train step of epoch 0:  58%|█████▊    | 3749/6434 [8:47:13<6:36:58,  8.87s/it, gpt_loss=0.22, loss_mean=0.314][A
+[LID Router Debug] Step: 3750
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [2, 9, 3, 4, 4, 1, 4, 3, 5, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3749/6434 [8:47:21<6:36:58,  8.87s/it, gpt_loss=0.323, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3750/6434 [8:47:21<6:25:20,  8.61s/it, gpt_loss=0.323, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3750/6434 [8:47:29<6:25:20,  8.61s/it, gpt_loss=0.341, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3751/6434 [8:47:29<6:16:51,  8.43s/it, gpt_loss=0.341, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3751/6434 [8:47:37<6:16:51,  8.43s/it, gpt_loss=0.309, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3752/6434 [8:47:37<6:16:35,  8.42s/it, gpt_loss=0.309, loss_mean=0.317][A
+Train step of epoch 0:  58%|█████▊    | 3752/6434 [8:47:46<6:16:35,  8.42s/it, gpt_loss=0.277, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3753/6434 [8:47:46<6:16:21,  8.42s/it, gpt_loss=0.277, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3753/6434 [8:47:54<6:16:21,  8.42s/it, gpt_loss=0.315, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3754/6434 [8:47:54<6:10:23,  8.29s/it, gpt_loss=0.315, loss_mean=0.313][A
+Train step of epoch 0:  58%|█████▊    | 3754/6434 [8:48:02<6:10:23,  8.29s/it, gpt_loss=0.397, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3755/6434 [8:48:02<6:13:54,  8.37s/it, gpt_loss=0.397, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3755/6434 [8:48:11<6:13:54,  8.37s/it, gpt_loss=0.255, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3756/6434 [8:48:11<6:21:23,  8.55s/it, gpt_loss=0.255, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3756/6434 [8:48:19<6:21:23,  8.55s/it, gpt_loss=0.32, loss_mean=0.315] [A
+Train step of epoch 0:  58%|█████▊    | 3757/6434 [8:48:19<6:06:58,  8.23s/it, gpt_loss=0.32, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3757/6434 [8:48:27<6:06:58,  8.23s/it, gpt_loss=0.261, loss_mean=0.31][A
+Train step of epoch 0:  58%|█████▊    | 3758/6434 [8:48:27<6:05:23,  8.19s/it, gpt_loss=0.261, loss_mean=0.31][A
+Train step of epoch 0:  58%|█████▊    | 3758/6434 [8:48:36<6:05:23,  8.19s/it, gpt_loss=0.362, loss_mean=0.315][A
+Train step of epoch 0:  58%|█████▊    | 3759/6434 [8:48:36<6:12:49,  8.36s/it, gpt_loss=0.362, loss_mean=0.315][A
+[LID Router Debug] Step: 3760
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [5, 0, 5, 4, 2, 5, 2, 9, 9, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  58%|█████▊    | 3759/6434 [8:48:44<6:12:49,  8.36s/it, gpt_loss=0.343, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3760/6434 [8:48:44<6:12:44,  8.36s/it, gpt_loss=0.343, loss_mean=0.318][A
+Train step of epoch 0:  58%|█████▊    | 3760/6434 [8:48:53<6:12:44,  8.36s/it, gpt_loss=0.365, loss_mean=0.323][A
+Train step of epoch 0:  58%|█████▊    | 3761/6434 [8:48:53<6:15:15,  8.42s/it, gpt_loss=0.365, loss_mean=0.323][A
+Train step of epoch 0:  58%|█████▊    | 3761/6434 [8:49:01<6:15:15,  8.42s/it, gpt_loss=0.292, loss_mean=0.32] [A
+Train step of epoch 0:  58%|█████▊    | 3762/6434 [8:49:01<6:19:23,  8.52s/it, gpt_loss=0.292, loss_mean=0.32][A
+Train step of epoch 0:  58%|█████▊    | 3762/6434 [8:49:10<6:19:23,  8.52s/it, gpt_loss=0.344, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3763/6434 [8:49:10<6:23:23,  8.61s/it, gpt_loss=0.344, loss_mean=0.322][A
+Train step of epoch 0:  58%|█████▊    | 3763/6434 [8:49:19<6:23:23,  8.61s/it, gpt_loss=0.276, loss_mean=0.317][A
+Train step of epoch 0:  59%|█████▊    | 3764/6434 [8:49:19<6:26:49,  8.69s/it, gpt_loss=0.276, loss_mean=0.317][A
+Train step of epoch 0:  59%|█████▊    | 3764/6434 [8:49:28<6:26:49,  8.69s/it, gpt_loss=0.257, loss_mean=0.311][A
+Train step of epoch 0:  59%|█████▊    | 3765/6434 [8:49:28<6:35:51,  8.90s/it, gpt_loss=0.257, loss_mean=0.311][A
+Train step of epoch 0:  59%|█████▊    | 3765/6434 [8:49:38<6:35:51,  8.90s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  59%|█████▊    | 3766/6434 [8:49:38<6:45:08,  9.11s/it, gpt_loss=0.339, loss_mean=0.314][A
+Train step of epoch 0:  59%|█████▊    | 3766/6434 [8:49:47<6:45:08,  9.11s/it, gpt_loss=0.254, loss_mean=0.308][A
+Train step of epoch 0:  59%|█████▊    | 3767/6434 [8:49:47<6:38:34,  8.97s/it, gpt_loss=0.254, loss_mean=0.308][A
+Train step of epoch 0:  59%|█████▊    | 3767/6434 [8:49:55<6:38:34,  8.97s/it, gpt_loss=0.316, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▊    | 3768/6434 [8:49:55<6:32:34,  8.84s/it, gpt_loss=0.316, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▊    | 3768/6434 [8:50:04<6:32:34,  8.84s/it, gpt_loss=0.305, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▊    | 3769/6434 [8:50:04<6:37:56,  8.96s/it, gpt_loss=0.305, loss_mean=0.309][A
+[LID Router Debug] Step: 3770
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 6, 9, 3, 5, 6, 8, 6, 1, 4]
+Active Experts in Batch: {1, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  59%|█████▊    | 3769/6434 [8:50:13<6:37:56,  8.96s/it, gpt_loss=0.222, loss_mean=0.3]  [A
+Train step of epoch 0:  59%|█████▊    | 3770/6434 [8:50:13<6:38:30,  8.98s/it, gpt_loss=0.222, loss_mean=0.3][A
+Train step of epoch 0:  59%|█████▊    | 3770/6434 [8:50:22<6:38:30,  8.98s/it, gpt_loss=0.296, loss_mean=0.3][A
+Train step of epoch 0:  59%|█████▊    | 3771/6434 [8:50:22<6:33:28,  8.87s/it, gpt_loss=0.296, loss_mean=0.3][A
+Train step of epoch 0:  59%|█████▊    | 3771/6434 [8:50:31<6:33:28,  8.87s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▊    | 3772/6434 [8:50:31<6:39:25,  9.00s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▊    | 3772/6434 [8:50:39<6:39:25,  9.00s/it, gpt_loss=0.28, loss_mean=0.304] [A
+Train step of epoch 0:  59%|█████▊    | 3773/6434 [8:50:39<6:26:16,  8.71s/it, gpt_loss=0.28, loss_mean=0.304][A
+Train step of epoch 0:  59%|█████▊    | 3773/6434 [8:50:48<6:26:16,  8.71s/it, gpt_loss=0.271, loss_mean=0.301][A
+Train step of epoch 0:  59%|█████▊    | 3774/6434 [8:50:48<6:20:31,  8.58s/it, gpt_loss=0.271, loss_mean=0.301][A
+Train step of epoch 0:  59%|█████▊    | 3774/6434 [8:50:56<6:20:31,  8.58s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▊    | 3775/6434 [8:50:56<6:17:05,  8.51s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▊    | 3775/6434 [8:51:04<6:17:05,  8.51s/it, gpt_loss=0.398, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▊    | 3776/6434 [8:51:04<6:14:37,  8.46s/it, gpt_loss=0.398, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▊    | 3776/6434 [8:51:13<6:14:37,  8.46s/it, gpt_loss=0.286, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▊    | 3777/6434 [8:51:13<6:16:06,  8.49s/it, gpt_loss=0.286, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▊    | 3777/6434 [8:51:21<6:16:06,  8.49s/it, gpt_loss=0.312, loss_mean=0.307][A
+Train step of epoch 0:  59%|█████▊    | 3778/6434 [8:51:21<6:15:32,  8.48s/it, gpt_loss=0.312, loss_mean=0.307][A
+Train step of epoch 0:  59%|█████▊    | 3778/6434 [8:51:30<6:15:32,  8.48s/it, gpt_loss=0.246, loss_mean=0.301][A
+Train step of epoch 0:  59%|█████▊    | 3779/6434 [8:51:30<6:17:31,  8.53s/it, gpt_loss=0.246, loss_mean=0.301][A
+[LID Router Debug] Step: 3780
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [5, 0, 4, 2, 0, 2, 0, 0, 1, 6]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  59%|█████▊    | 3779/6434 [8:51:39<6:17:31,  8.53s/it, gpt_loss=0.243, loss_mean=0.295][A
+Train step of epoch 0:  59%|█████▉    | 3780/6434 [8:51:39<6:16:45,  8.52s/it, gpt_loss=0.243, loss_mean=0.295][A
+Train step of epoch 0:  59%|█████▉    | 3780/6434 [8:51:48<6:16:45,  8.52s/it, gpt_loss=0.312, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3781/6434 [8:51:48<6:27:46,  8.77s/it, gpt_loss=0.312, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3781/6434 [8:51:57<6:27:46,  8.77s/it, gpt_loss=0.306, loss_mean=0.298][A
+Train step of epoch 0:  59%|█████▉    | 3782/6434 [8:51:57<6:36:40,  8.97s/it, gpt_loss=0.306, loss_mean=0.298][A
+Train step of epoch 0:  59%|█████▉    | 3782/6434 [8:52:06<6:36:40,  8.97s/it, gpt_loss=0.332, loss_mean=0.301][A
+Train step of epoch 0:  59%|█████▉    | 3783/6434 [8:52:06<6:29:43,  8.82s/it, gpt_loss=0.332, loss_mean=0.301][A
+Train step of epoch 0:  59%|█████▉    | 3783/6434 [8:52:14<6:29:43,  8.82s/it, gpt_loss=0.418, loss_mean=0.313][A
+Train step of epoch 0:  59%|█████▉    | 3784/6434 [8:52:14<6:15:54,  8.51s/it, gpt_loss=0.418, loss_mean=0.313][A
+Train step of epoch 0:  59%|█████▉    | 3784/6434 [8:52:22<6:15:54,  8.51s/it, gpt_loss=0.275, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▉    | 3785/6434 [8:52:22<6:14:32,  8.48s/it, gpt_loss=0.275, loss_mean=0.309][A
+Train step of epoch 0:  59%|█████▉    | 3785/6434 [8:52:30<6:14:32,  8.48s/it, gpt_loss=0.382, loss_mean=0.316][A
+Train step of epoch 0:  59%|█████▉    | 3786/6434 [8:52:30<6:06:58,  8.32s/it, gpt_loss=0.382, loss_mean=0.316][A
+Train step of epoch 0:  59%|█████▉    | 3786/6434 [8:52:38<6:06:58,  8.32s/it, gpt_loss=0.348, loss_mean=0.319][A
+Train step of epoch 0:  59%|█████▉    | 3787/6434 [8:52:38<6:03:32,  8.24s/it, gpt_loss=0.348, loss_mean=0.319][A
+Train step of epoch 0:  59%|█████▉    | 3787/6434 [8:52:46<6:03:32,  8.24s/it, gpt_loss=0.332, loss_mean=0.321][A
+Train step of epoch 0:  59%|█████▉    | 3788/6434 [8:52:46<5:54:42,  8.04s/it, gpt_loss=0.332, loss_mean=0.321][A
+Train step of epoch 0:  59%|█████▉    | 3788/6434 [8:52:54<5:54:42,  8.04s/it, gpt_loss=0.298, loss_mean=0.318][A
+Train step of epoch 0:  59%|█████▉    | 3789/6434 [8:52:54<5:59:20,  8.15s/it, gpt_loss=0.298, loss_mean=0.318][A
+[LID Router Debug] Step: 3790
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [9, 3, 5, 9, 2, 1, 4, 2, 3, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  59%|█████▉    | 3789/6434 [8:53:03<5:59:20,  8.15s/it, gpt_loss=0.282, loss_mean=0.315][A
+Train step of epoch 0:  59%|█████▉    | 3790/6434 [8:53:03<6:04:37,  8.27s/it, gpt_loss=0.282, loss_mean=0.315][A
+Train step of epoch 0:  59%|█████▉    | 3790/6434 [8:53:11<6:04:37,  8.27s/it, gpt_loss=0.375, loss_mean=0.321][A
+Train step of epoch 0:  59%|█████▉    | 3791/6434 [8:53:11<6:05:25,  8.30s/it, gpt_loss=0.375, loss_mean=0.321][A
+Train step of epoch 0:  59%|█████▉    | 3791/6434 [8:53:20<6:05:25,  8.30s/it, gpt_loss=0.24, loss_mean=0.313] [A
+Train step of epoch 0:  59%|█████▉    | 3792/6434 [8:53:20<6:16:58,  8.56s/it, gpt_loss=0.24, loss_mean=0.313][A
+Train step of epoch 0:  59%|█████▉    | 3792/6434 [8:53:29<6:16:58,  8.56s/it, gpt_loss=0.412, loss_mean=0.323][A
+Train step of epoch 0:  59%|█████▉    | 3793/6434 [8:53:29<6:25:26,  8.76s/it, gpt_loss=0.412, loss_mean=0.323][A
+Train step of epoch 0:  59%|█████▉    | 3793/6434 [8:53:39<6:25:26,  8.76s/it, gpt_loss=0.382, loss_mean=0.329][A
+Train step of epoch 0:  59%|█████▉    | 3794/6434 [8:53:39<6:38:42,  9.06s/it, gpt_loss=0.382, loss_mean=0.329][A
+Train step of epoch 0:  59%|█████▉    | 3794/6434 [8:53:47<6:38:42,  9.06s/it, gpt_loss=0.421, loss_mean=0.338][A
+Train step of epoch 0:  59%|█████▉    | 3795/6434 [8:53:47<6:20:03,  8.64s/it, gpt_loss=0.421, loss_mean=0.338][A
+Train step of epoch 0:  59%|█████▉    | 3795/6434 [8:53:55<6:20:03,  8.64s/it, gpt_loss=0.227, loss_mean=0.327][A
+Train step of epoch 0:  59%|█████▉    | 3796/6434 [8:53:55<6:17:28,  8.59s/it, gpt_loss=0.227, loss_mean=0.327][A
+Train step of epoch 0:  59%|█████▉    | 3796/6434 [8:54:04<6:17:28,  8.59s/it, gpt_loss=0.346, loss_mean=0.329][A
+Train step of epoch 0:  59%|█████▉    | 3797/6434 [8:54:04<6:15:22,  8.54s/it, gpt_loss=0.346, loss_mean=0.329][A
+Train step of epoch 0:  59%|█████▉    | 3797/6434 [8:54:12<6:15:22,  8.54s/it, gpt_loss=0.33, loss_mean=0.329] [A
+Train step of epoch 0:  59%|█████▉    | 3798/6434 [8:54:12<6:07:44,  8.37s/it, gpt_loss=0.33, loss_mean=0.329][A
+Train step of epoch 0:  59%|█████▉    | 3798/6434 [8:54:19<6:07:44,  8.37s/it, gpt_loss=0.27, loss_mean=0.323][A
+Train step of epoch 0:  59%|█████▉    | 3799/6434 [8:54:19<5:56:23,  8.12s/it, gpt_loss=0.27, loss_mean=0.323][A
+[LID Router Debug] Step: 3800
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [6, 2, 3, 5, 9, 0, 0, 3, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+[2026-02-07 00:50:32,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=0, lr=[1.8293398260673117e-05, 1.8293398260673117e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 00:50:32,208] [INFO] [timer.py:260:stop] epoch=0/micro_step=3800/global_step=1900, RunningAvgSamplesPerSec=4.75044812075071, CurrSamplesPerSec=4.976133360436702, MemAllocated=12.84GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  59%|█████▉    | 3799/6434 [8:54:28<5:56:23,  8.12s/it, gpt_loss=0.313, loss_mean=0.322][A
+Train step of epoch 0:  59%|█████▉    | 3800/6434 [8:54:28<6:02:27,  8.26s/it, gpt_loss=0.313, loss_mean=0.322][A
+Train step of epoch 0:  59%|█████▉    | 3800/6434 [8:54:37<6:02:27,  8.26s/it, gpt_loss=0.223, loss_mean=0.312][A
+Train step of epoch 0:  59%|█████▉    | 3801/6434 [8:54:37<6:10:06,  8.43s/it, gpt_loss=0.223, loss_mean=0.312][A
+Train step of epoch 0:  59%|█████▉    | 3801/6434 [8:54:45<6:10:06,  8.43s/it, gpt_loss=0.253, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3802/6434 [8:54:45<6:10:27,  8.44s/it, gpt_loss=0.253, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3802/6434 [8:54:54<6:10:27,  8.44s/it, gpt_loss=0.307, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3803/6434 [8:54:54<6:19:42,  8.66s/it, gpt_loss=0.307, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3803/6434 [8:55:02<6:19:42,  8.66s/it, gpt_loss=0.27, loss_mean=0.303] [A
+Train step of epoch 0:  59%|█████▉    | 3804/6434 [8:55:02<6:11:32,  8.48s/it, gpt_loss=0.27, loss_mean=0.303][A
+Train step of epoch 0:  59%|█████▉    | 3804/6434 [8:55:11<6:11:32,  8.48s/it, gpt_loss=0.243, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3805/6434 [8:55:11<6:10:26,  8.45s/it, gpt_loss=0.243, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3805/6434 [8:55:20<6:10:26,  8.45s/it, gpt_loss=0.321, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▉    | 3806/6434 [8:55:20<6:20:24,  8.69s/it, gpt_loss=0.321, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▉    | 3806/6434 [8:55:29<6:20:24,  8.69s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3807/6434 [8:55:29<6:29:46,  8.90s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3807/6434 [8:55:38<6:29:46,  8.90s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▉    | 3808/6434 [8:55:38<6:30:00,  8.91s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  59%|█████▉    | 3808/6434 [8:55:47<6:30:00,  8.91s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  59%|█████▉    | 3809/6434 [8:55:47<6:23:56,  8.78s/it, gpt_loss=0.271, loss_mean=0.296][A
+[LID Router Debug] Step: 3810
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 2, 1, 3, 0, 7, 0, 5, 3, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 7, 9}
+
+Train step of epoch 0:  59%|█████▉    | 3809/6434 [8:55:55<6:23:56,  8.78s/it, gpt_loss=0.308, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3810/6434 [8:55:55<6:17:09,  8.62s/it, gpt_loss=0.308, loss_mean=0.297][A
+Train step of epoch 0:  59%|█████▉    | 3810/6434 [8:56:03<6:17:09,  8.62s/it, gpt_loss=0.389, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3811/6434 [8:56:03<6:04:58,  8.35s/it, gpt_loss=0.389, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3811/6434 [8:56:11<6:04:58,  8.35s/it, gpt_loss=0.363, loss_mean=0.312][A
+Train step of epoch 0:  59%|█████▉    | 3812/6434 [8:56:11<5:59:07,  8.22s/it, gpt_loss=0.363, loss_mean=0.312][A
+Train step of epoch 0:  59%|█████▉    | 3812/6434 [8:56:20<5:59:07,  8.22s/it, gpt_loss=0.243, loss_mean=0.305][A
+Train step of epoch 0:  59%|█████▉    | 3813/6434 [8:56:20<6:15:22,  8.59s/it, gpt_loss=0.243, loss_mean=0.305][A
+Train step of epoch 0:  59%|█████▉    | 3813/6434 [8:56:28<6:15:22,  8.59s/it, gpt_loss=0.295, loss_mean=0.304][A
+Train step of epoch 0:  59%|█████▉    | 3814/6434 [8:56:28<6:09:10,  8.45s/it, gpt_loss=0.295, loss_mean=0.304][A
+Train step of epoch 0:  59%|█████▉    | 3814/6434 [8:56:36<6:09:10,  8.45s/it, gpt_loss=0.29, loss_mean=0.303] [A
+Train step of epoch 0:  59%|█████▉    | 3815/6434 [8:56:36<6:03:13,  8.32s/it, gpt_loss=0.29, loss_mean=0.303][A
+Train step of epoch 0:  59%|█████▉    | 3815/6434 [8:56:46<6:03:13,  8.32s/it, gpt_loss=0.337, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3816/6434 [8:56:46<6:17:39,  8.66s/it, gpt_loss=0.337, loss_mean=0.306][A
+Train step of epoch 0:  59%|█████▉    | 3816/6434 [8:56:54<6:17:39,  8.66s/it, gpt_loss=0.294, loss_mean=0.305][A
+Train step of epoch 0:  59%|█████▉    | 3817/6434 [8:56:54<6:08:58,  8.46s/it, gpt_loss=0.294, loss_mean=0.305][A
+Train step of epoch 0:  59%|█████▉    | 3817/6434 [8:57:03<6:08:58,  8.46s/it, gpt_loss=0.36, loss_mean=0.31]  [A
+Train step of epoch 0:  59%|█████▉    | 3818/6434 [8:57:03<6:16:19,  8.63s/it, gpt_loss=0.36, loss_mean=0.31][A
+Train step of epoch 0:  59%|█████▉    | 3818/6434 [8:57:12<6:16:19,  8.63s/it, gpt_loss=0.28, loss_mean=0.307][A
+Train step of epoch 0:  59%|█████▉    | 3819/6434 [8:57:12<6:22:46,  8.78s/it, gpt_loss=0.28, loss_mean=0.307][A
+[LID Router Debug] Step: 3820
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [0, 2, 0, 3, 4, 5, 0, 6, 2, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  59%|█████▉    | 3819/6434 [8:57:20<6:22:46,  8.78s/it, gpt_loss=0.273, loss_mean=0.304][A
+Train step of epoch 0:  59%|█████▉    | 3820/6434 [8:57:20<6:11:40,  8.53s/it, gpt_loss=0.273, loss_mean=0.304][A
+Train step of epoch 0:  59%|█████▉    | 3820/6434 [8:57:28<6:11:40,  8.53s/it, gpt_loss=0.267, loss_mean=0.3]  [A
+Train step of epoch 0:  59%|█████▉    | 3821/6434 [8:57:28<6:06:37,  8.42s/it, gpt_loss=0.267, loss_mean=0.3][A
+Train step of epoch 0:  59%|█████▉    | 3821/6434 [8:57:37<6:06:37,  8.42s/it, gpt_loss=0.256, loss_mean=0.296][A
+Train step of epoch 0:  59%|█████▉    | 3822/6434 [8:57:37<6:09:20,  8.48s/it, gpt_loss=0.256, loss_mean=0.296][A
+Train step of epoch 0:  59%|█████▉    | 3822/6434 [8:57:45<6:09:20,  8.48s/it, gpt_loss=0.296, loss_mean=0.296][A
+Train step of epoch 0:  59%|█████▉    | 3823/6434 [8:57:45<6:04:23,  8.37s/it, gpt_loss=0.296, loss_mean=0.296][A
+Train step of epoch 0:  59%|█████▉    | 3823/6434 [8:57:53<6:04:23,  8.37s/it, gpt_loss=0.23, loss_mean=0.289] [A
+Train step of epoch 0:  59%|█████▉    | 3824/6434 [8:57:53<6:10:12,  8.51s/it, gpt_loss=0.23, loss_mean=0.289][A
+Train step of epoch 0:  59%|█████▉    | 3824/6434 [8:58:02<6:10:12,  8.51s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  59%|█████▉    | 3825/6434 [8:58:02<6:13:20,  8.59s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  59%|█████▉    | 3825/6434 [8:58:11<6:13:20,  8.59s/it, gpt_loss=0.296, loss_mean=0.288][A
+Train step of epoch 0:  59%|█████▉    | 3826/6434 [8:58:11<6:14:29,  8.62s/it, gpt_loss=0.296, loss_mean=0.288][A
+Train step of epoch 0:  59%|█████▉    | 3826/6434 [8:58:20<6:14:29,  8.62s/it, gpt_loss=0.337, loss_mean=0.293][A
+Train step of epoch 0:  59%|█████▉    | 3827/6434 [8:58:20<6:17:05,  8.68s/it, gpt_loss=0.337, loss_mean=0.293][A
+Train step of epoch 0:  59%|█████▉    | 3827/6434 [8:58:28<6:17:05,  8.68s/it, gpt_loss=0.22, loss_mean=0.285] [A
+Train step of epoch 0:  59%|█████▉    | 3828/6434 [8:58:28<6:14:07,  8.61s/it, gpt_loss=0.22, loss_mean=0.285][A
+Train step of epoch 0:  59%|█████▉    | 3828/6434 [8:58:37<6:14:07,  8.61s/it, gpt_loss=0.329, loss_mean=0.29][A
+Train step of epoch 0:  60%|█████▉    | 3829/6434 [8:58:37<6:21:41,  8.79s/it, gpt_loss=0.329, loss_mean=0.29][A
+[LID Router Debug] Step: 3830
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [6, 2, 2, 3, 3, 1, 2, 4, 3, 6]
+Active Experts in Batch: {1, 2, 3, 4, 6}
+
+Train step of epoch 0:  60%|█████▉    | 3829/6434 [8:58:46<6:21:41,  8.79s/it, gpt_loss=0.295, loss_mean=0.29][A
+Train step of epoch 0:  60%|█████▉    | 3830/6434 [8:58:46<6:12:18,  8.58s/it, gpt_loss=0.295, loss_mean=0.29][A
+Train step of epoch 0:  60%|█████▉    | 3830/6434 [8:58:54<6:12:18,  8.58s/it, gpt_loss=0.278, loss_mean=0.289][A
+Train step of epoch 0:  60%|█████▉    | 3831/6434 [8:58:54<6:14:46,  8.64s/it, gpt_loss=0.278, loss_mean=0.289][A
+Train step of epoch 0:  60%|█████▉    | 3831/6434 [8:59:03<6:14:46,  8.64s/it, gpt_loss=0.246, loss_mean=0.285][A
+Train step of epoch 0:  60%|█████▉    | 3832/6434 [8:59:03<6:18:35,  8.73s/it, gpt_loss=0.246, loss_mean=0.285][A
+Train step of epoch 0:  60%|█████▉    | 3832/6434 [8:59:11<6:18:35,  8.73s/it, gpt_loss=0.311, loss_mean=0.287][A
+Train step of epoch 0:  60%|█████▉    | 3833/6434 [8:59:11<6:10:41,  8.55s/it, gpt_loss=0.311, loss_mean=0.287][A
+Train step of epoch 0:  60%|█████▉    | 3833/6434 [8:59:20<6:10:41,  8.55s/it, gpt_loss=0.259, loss_mean=0.285][A
+Train step of epoch 0:  60%|█████▉    | 3834/6434 [8:59:20<6:06:34,  8.46s/it, gpt_loss=0.259, loss_mean=0.285][A
+Train step of epoch 0:  60%|█████▉    | 3834/6434 [8:59:29<6:06:34,  8.46s/it, gpt_loss=0.373, loss_mean=0.293][A
+Train step of epoch 0:  60%|█████▉    | 3835/6434 [8:59:29<6:13:10,  8.62s/it, gpt_loss=0.373, loss_mean=0.293][A
+Train step of epoch 0:  60%|█████▉    | 3835/6434 [8:59:38<6:13:10,  8.62s/it, gpt_loss=0.336, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3836/6434 [8:59:38<6:23:14,  8.85s/it, gpt_loss=0.336, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3836/6434 [8:59:47<6:23:14,  8.85s/it, gpt_loss=0.256, loss_mean=0.293][A
+Train step of epoch 0:  60%|█████▉    | 3837/6434 [8:59:47<6:30:52,  9.03s/it, gpt_loss=0.256, loss_mean=0.293][A
+Train step of epoch 0:  60%|█████▉    | 3837/6434 [8:59:55<6:30:52,  9.03s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  60%|█████▉    | 3838/6434 [8:59:55<6:14:11,  8.65s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  60%|█████▉    | 3838/6434 [9:00:04<6:14:11,  8.65s/it, gpt_loss=0.277, loss_mean=0.288][A
+Train step of epoch 0:  60%|█████▉    | 3839/6434 [9:00:04<6:17:54,  8.74s/it, gpt_loss=0.277, loss_mean=0.288][A
+[LID Router Debug] Step: 3840
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [4, 11, 5, 4, 2, 9, 0, 0, 4, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9, 11}
+
+Train step of epoch 0:  60%|█████▉    | 3839/6434 [9:00:12<6:17:54,  8.74s/it, gpt_loss=0.319, loss_mean=0.291][A
+Train step of epoch 0:  60%|█████▉    | 3840/6434 [9:00:12<6:07:05,  8.49s/it, gpt_loss=0.319, loss_mean=0.291][A
+Train step of epoch 0:  60%|█████▉    | 3840/6434 [9:00:20<6:07:05,  8.49s/it, gpt_loss=0.359, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3841/6434 [9:00:20<6:00:30,  8.34s/it, gpt_loss=0.359, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3841/6434 [9:00:28<6:00:30,  8.34s/it, gpt_loss=0.296, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3842/6434 [9:00:28<6:00:11,  8.34s/it, gpt_loss=0.296, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3842/6434 [9:00:38<6:00:11,  8.34s/it, gpt_loss=0.373, loss_mean=0.305][A
+Train step of epoch 0:  60%|█████▉    | 3843/6434 [9:00:38<6:17:11,  8.73s/it, gpt_loss=0.373, loss_mean=0.305][A
+Train step of epoch 0:  60%|█████▉    | 3843/6434 [9:00:47<6:17:11,  8.73s/it, gpt_loss=0.229, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3844/6434 [9:00:47<6:18:53,  8.78s/it, gpt_loss=0.229, loss_mean=0.298][A
+Train step of epoch 0:  60%|█████▉    | 3844/6434 [9:00:55<6:18:53,  8.78s/it, gpt_loss=0.311, loss_mean=0.299][A
+Train step of epoch 0:  60%|█████▉    | 3845/6434 [9:00:55<6:08:28,  8.54s/it, gpt_loss=0.311, loss_mean=0.299][A
+Train step of epoch 0:  60%|█████▉    | 3845/6434 [9:01:03<6:08:28,  8.54s/it, gpt_loss=0.25, loss_mean=0.294] [A
+Train step of epoch 0:  60%|█████▉    | 3846/6434 [9:01:03<6:01:43,  8.39s/it, gpt_loss=0.25, loss_mean=0.294][A
+Train step of epoch 0:  60%|█████▉    | 3846/6434 [9:01:12<6:01:43,  8.39s/it, gpt_loss=0.38, loss_mean=0.303][A
+Train step of epoch 0:  60%|█████▉    | 3847/6434 [9:01:12<6:06:29,  8.50s/it, gpt_loss=0.38, loss_mean=0.303][A
+Train step of epoch 0:  60%|█████▉    | 3847/6434 [9:01:20<6:06:29,  8.50s/it, gpt_loss=0.297, loss_mean=0.302][A
+Train step of epoch 0:  60%|█████▉    | 3848/6434 [9:01:20<6:03:07,  8.43s/it, gpt_loss=0.297, loss_mean=0.302][A
+Train step of epoch 0:  60%|█████▉    | 3848/6434 [9:01:28<6:03:07,  8.43s/it, gpt_loss=0.281, loss_mean=0.3]  [A
+Train step of epoch 0:  60%|█████▉    | 3849/6434 [9:01:28<6:02:38,  8.42s/it, gpt_loss=0.281, loss_mean=0.3][A
+[LID Router Debug] Step: 3850
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [5, 2, 3, 9, 4, 6, 4, 9, 10, 4]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9, 10}
+
+Train step of epoch 0:  60%|█████▉    | 3849/6434 [9:01:37<6:02:38,  8.42s/it, gpt_loss=0.366, loss_mean=0.307][A
+Train step of epoch 0:  60%|█████▉    | 3850/6434 [9:01:37<6:08:38,  8.56s/it, gpt_loss=0.366, loss_mean=0.307][A
+Train step of epoch 0:  60%|█████▉    | 3850/6434 [9:01:46<6:08:38,  8.56s/it, gpt_loss=0.298, loss_mean=0.306][A
+Train step of epoch 0:  60%|█████▉    | 3851/6434 [9:01:46<6:05:12,  8.48s/it, gpt_loss=0.298, loss_mean=0.306][A
+Train step of epoch 0:  60%|█████▉    | 3851/6434 [9:01:53<6:05:12,  8.48s/it, gpt_loss=0.277, loss_mean=0.303][A
+Train step of epoch 0:  60%|█████▉    | 3852/6434 [9:01:53<5:54:52,  8.25s/it, gpt_loss=0.277, loss_mean=0.303][A
+Train step of epoch 0:  60%|█████▉    | 3852/6434 [9:02:02<5:54:52,  8.25s/it, gpt_loss=0.356, loss_mean=0.308][A
+Train step of epoch 0:  60%|█████▉    | 3853/6434 [9:02:02<6:04:51,  8.48s/it, gpt_loss=0.356, loss_mean=0.308][A
+Train step of epoch 0:  60%|█████▉    | 3853/6434 [9:02:11<6:04:51,  8.48s/it, gpt_loss=0.324, loss_mean=0.31] [A
+Train step of epoch 0:  60%|█████▉    | 3854/6434 [9:02:11<6:11:14,  8.63s/it, gpt_loss=0.324, loss_mean=0.31][A
+Train step of epoch 0:  60%|█████▉    | 3854/6434 [9:02:19<6:11:14,  8.63s/it, gpt_loss=0.312, loss_mean=0.31][A
+Train step of epoch 0:  60%|█████▉    | 3855/6434 [9:02:19<5:57:28,  8.32s/it, gpt_loss=0.312, loss_mean=0.31][A
+Train step of epoch 0:  60%|█████▉    | 3855/6434 [9:02:27<5:57:28,  8.32s/it, gpt_loss=0.32, loss_mean=0.311][A
+Train step of epoch 0:  60%|█████▉    | 3856/6434 [9:02:27<5:50:09,  8.15s/it, gpt_loss=0.32, loss_mean=0.311][A
+Train step of epoch 0:  60%|█████▉    | 3856/6434 [9:02:35<5:50:09,  8.15s/it, gpt_loss=0.336, loss_mean=0.314][A
+Train step of epoch 0:  60%|█████▉    | 3857/6434 [9:02:35<5:49:38,  8.14s/it, gpt_loss=0.336, loss_mean=0.314][A
+Train step of epoch 0:  60%|█████▉    | 3857/6434 [9:02:44<5:49:38,  8.14s/it, gpt_loss=0.249, loss_mean=0.307][A
+Train step of epoch 0:  60%|█████▉    | 3858/6434 [9:02:44<6:08:36,  8.59s/it, gpt_loss=0.249, loss_mean=0.307][A
+Train step of epoch 0:  60%|█████▉    | 3858/6434 [9:02:52<6:08:36,  8.59s/it, gpt_loss=0.222, loss_mean=0.299][A
+Train step of epoch 0:  60%|█████▉    | 3859/6434 [9:02:52<5:56:22,  8.30s/it, gpt_loss=0.222, loss_mean=0.299][A
+[LID Router Debug] Step: 3860
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [4, 0, 9, 1, 3, 2, 2, 5, 4, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  60%|█████▉    | 3859/6434 [9:03:01<5:56:22,  8.30s/it, gpt_loss=0.308, loss_mean=0.3]  [A
+Train step of epoch 0:  60%|█████▉    | 3860/6434 [9:03:01<6:04:41,  8.50s/it, gpt_loss=0.308, loss_mean=0.3][A
+Train step of epoch 0:  60%|█████▉    | 3860/6434 [9:03:09<6:04:41,  8.50s/it, gpt_loss=0.243, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3861/6434 [9:03:09<6:02:05,  8.44s/it, gpt_loss=0.243, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3861/6434 [9:03:18<6:02:05,  8.44s/it, gpt_loss=0.297, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3862/6434 [9:03:18<6:04:57,  8.51s/it, gpt_loss=0.297, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3862/6434 [9:03:27<6:04:57,  8.51s/it, gpt_loss=0.294, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3863/6434 [9:03:27<6:11:48,  8.68s/it, gpt_loss=0.294, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3863/6434 [9:03:37<6:11:48,  8.68s/it, gpt_loss=0.272, loss_mean=0.292][A
+Train step of epoch 0:  60%|██████    | 3864/6434 [9:03:37<6:26:42,  9.03s/it, gpt_loss=0.272, loss_mean=0.292][A
+Train step of epoch 0:  60%|██████    | 3864/6434 [9:03:45<6:26:42,  9.03s/it, gpt_loss=0.316, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3865/6434 [9:03:45<6:21:20,  8.91s/it, gpt_loss=0.316, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3865/6434 [9:03:54<6:21:20,  8.91s/it, gpt_loss=0.25, loss_mean=0.29]  [A
+Train step of epoch 0:  60%|██████    | 3866/6434 [9:03:54<6:10:37,  8.66s/it, gpt_loss=0.25, loss_mean=0.29][A
+Train step of epoch 0:  60%|██████    | 3866/6434 [9:04:01<6:10:37,  8.66s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3867/6434 [9:04:01<5:50:52,  8.20s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3867/6434 [9:04:09<5:50:52,  8.20s/it, gpt_loss=0.245, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3868/6434 [9:04:09<5:49:49,  8.18s/it, gpt_loss=0.245, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3868/6434 [9:04:18<5:49:49,  8.18s/it, gpt_loss=0.299, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3869/6434 [9:04:18<6:04:38,  8.53s/it, gpt_loss=0.299, loss_mean=0.286][A
+[LID Router Debug] Step: 3870
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [5, 9, 9, 9, 9, 0, 1, 9, 0, 6]
+Active Experts in Batch: {0, 1, 5, 6, 9}
+
+Train step of epoch 0:  60%|██████    | 3869/6434 [9:04:26<6:04:38,  8.53s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3870/6434 [9:04:26<5:59:42,  8.42s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3870/6434 [9:04:35<5:59:42,  8.42s/it, gpt_loss=0.343, loss_mean=0.291][A
+Train step of epoch 0:  60%|██████    | 3871/6434 [9:04:35<6:09:18,  8.65s/it, gpt_loss=0.343, loss_mean=0.291][A
+Train step of epoch 0:  60%|██████    | 3871/6434 [9:04:43<6:09:18,  8.65s/it, gpt_loss=0.233, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3872/6434 [9:04:43<5:55:31,  8.33s/it, gpt_loss=0.233, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3872/6434 [9:04:53<5:55:31,  8.33s/it, gpt_loss=0.237, loss_mean=0.281][A
+Train step of epoch 0:  60%|██████    | 3873/6434 [9:04:53<6:13:13,  8.74s/it, gpt_loss=0.237, loss_mean=0.281][A
+Train step of epoch 0:  60%|██████    | 3873/6434 [9:05:02<6:13:13,  8.74s/it, gpt_loss=0.29, loss_mean=0.282] [A
+Train step of epoch 0:  60%|██████    | 3874/6434 [9:05:02<6:13:50,  8.76s/it, gpt_loss=0.29, loss_mean=0.282][A
+Train step of epoch 0:  60%|██████    | 3874/6434 [9:05:10<6:13:50,  8.76s/it, gpt_loss=0.246, loss_mean=0.278][A
+Train step of epoch 0:  60%|██████    | 3875/6434 [9:05:10<6:13:21,  8.75s/it, gpt_loss=0.246, loss_mean=0.278][A
+Train step of epoch 0:  60%|██████    | 3875/6434 [9:05:19<6:13:21,  8.75s/it, gpt_loss=0.333, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3876/6434 [9:05:19<6:17:47,  8.86s/it, gpt_loss=0.333, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3876/6434 [9:05:27<6:17:47,  8.86s/it, gpt_loss=0.291, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3877/6434 [9:05:27<6:06:17,  8.60s/it, gpt_loss=0.291, loss_mean=0.284][A
+Train step of epoch 0:  60%|██████    | 3877/6434 [9:05:37<6:06:17,  8.60s/it, gpt_loss=0.351, loss_mean=0.291][A
+Train step of epoch 0:  60%|██████    | 3878/6434 [9:05:37<6:13:24,  8.77s/it, gpt_loss=0.351, loss_mean=0.291][A
+Train step of epoch 0:  60%|██████    | 3878/6434 [9:05:46<6:13:24,  8.77s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3879/6434 [9:05:46<6:16:54,  8.85s/it, gpt_loss=0.272, loss_mean=0.289][A
+[LID Router Debug] Step: 3880
+Batch Size: 10
+Audio Batch Size: 143
+LID Assignments: [4, 1, 3, 4, 3, 3, 1, 4, 2, 3]
+Active Experts in Batch: {1, 2, 3, 4}
+
+Train step of epoch 0:  60%|██████    | 3879/6434 [9:05:55<6:16:54,  8.85s/it, gpt_loss=0.336, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3880/6434 [9:05:55<6:23:05,  9.00s/it, gpt_loss=0.336, loss_mean=0.294][A
+Train step of epoch 0:  60%|██████    | 3880/6434 [9:06:03<6:23:05,  9.00s/it, gpt_loss=0.331, loss_mean=0.297][A
+Train step of epoch 0:  60%|██████    | 3881/6434 [9:06:03<6:15:21,  8.82s/it, gpt_loss=0.331, loss_mean=0.297][A
+Train step of epoch 0:  60%|██████    | 3881/6434 [9:06:12<6:15:21,  8.82s/it, gpt_loss=0.191, loss_mean=0.287][A
+Train step of epoch 0:  60%|██████    | 3882/6434 [9:06:12<6:11:07,  8.73s/it, gpt_loss=0.191, loss_mean=0.287][A
+Train step of epoch 0:  60%|██████    | 3882/6434 [9:06:20<6:11:07,  8.73s/it, gpt_loss=0.341, loss_mean=0.292][A
+Train step of epoch 0:  60%|██████    | 3883/6434 [9:06:20<6:02:11,  8.52s/it, gpt_loss=0.341, loss_mean=0.292][A
+Train step of epoch 0:  60%|██████    | 3883/6434 [9:06:29<6:02:11,  8.52s/it, gpt_loss=0.261, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3884/6434 [9:06:29<6:10:35,  8.72s/it, gpt_loss=0.261, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3884/6434 [9:06:38<6:10:35,  8.72s/it, gpt_loss=0.292, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3885/6434 [9:06:38<6:18:48,  8.92s/it, gpt_loss=0.292, loss_mean=0.289][A
+Train step of epoch 0:  60%|██████    | 3885/6434 [9:06:47<6:18:48,  8.92s/it, gpt_loss=0.261, loss_mean=0.287][A
+Train step of epoch 0:  60%|██████    | 3886/6434 [9:06:47<6:15:58,  8.85s/it, gpt_loss=0.261, loss_mean=0.287][A
+Train step of epoch 0:  60%|██████    | 3886/6434 [9:06:55<6:15:58,  8.85s/it, gpt_loss=0.277, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3887/6434 [9:06:55<5:58:33,  8.45s/it, gpt_loss=0.277, loss_mean=0.286][A
+Train step of epoch 0:  60%|██████    | 3887/6434 [9:07:03<5:58:33,  8.45s/it, gpt_loss=0.305, loss_mean=0.288][A
+Train step of epoch 0:  60%|██████    | 3888/6434 [9:07:03<6:02:46,  8.55s/it, gpt_loss=0.305, loss_mean=0.288][A
+Train step of epoch 0:  60%|██████    | 3888/6434 [9:07:11<6:02:46,  8.55s/it, gpt_loss=0.204, loss_mean=0.279][A
+Train step of epoch 0:  60%|██████    | 3889/6434 [9:07:11<5:50:06,  8.25s/it, gpt_loss=0.204, loss_mean=0.279][A
+[LID Router Debug] Step: 3890
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [3, 9, 9, 1, 4, 9, 5, 0, 7, 9]
+Active Experts in Batch: {0, 1, 3, 4, 5, 7, 9}
+
+Train step of epoch 0:  60%|██████    | 3889/6434 [9:07:21<5:50:06,  8.25s/it, gpt_loss=0.463, loss_mean=0.298][A
+Train step of epoch 0:  60%|██████    | 3890/6434 [9:07:21<6:07:51,  8.68s/it, gpt_loss=0.463, loss_mean=0.298][A
+Train step of epoch 0:  60%|██████    | 3890/6434 [9:07:32<6:07:51,  8.68s/it, gpt_loss=0.31, loss_mean=0.299] [A
+Train step of epoch 0:  60%|██████    | 3891/6434 [9:07:32<6:43:08,  9.51s/it, gpt_loss=0.31, loss_mean=0.299][A
+Train step of epoch 0:  60%|██████    | 3891/6434 [9:07:41<6:43:08,  9.51s/it, gpt_loss=0.393, loss_mean=0.308][A
+Train step of epoch 0:  60%|██████    | 3892/6434 [9:07:41<6:32:16,  9.26s/it, gpt_loss=0.393, loss_mean=0.308][A
+Train step of epoch 0:  60%|██████    | 3892/6434 [9:07:50<6:32:16,  9.26s/it, gpt_loss=0.246, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3893/6434 [9:07:50<6:32:48,  9.28s/it, gpt_loss=0.246, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3893/6434 [9:07:58<6:32:48,  9.28s/it, gpt_loss=0.272, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3894/6434 [9:07:58<6:21:03,  9.00s/it, gpt_loss=0.272, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3894/6434 [9:08:07<6:21:03,  9.00s/it, gpt_loss=0.299, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3895/6434 [9:08:07<6:14:18,  8.85s/it, gpt_loss=0.299, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3895/6434 [9:08:15<6:14:18,  8.85s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3896/6434 [9:08:15<6:04:56,  8.63s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3896/6434 [9:08:22<6:04:56,  8.63s/it, gpt_loss=0.301, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3897/6434 [9:08:22<5:47:48,  8.23s/it, gpt_loss=0.301, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3897/6434 [9:08:32<5:47:48,  8.23s/it, gpt_loss=0.324, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3898/6434 [9:08:32<5:59:51,  8.51s/it, gpt_loss=0.324, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3898/6434 [9:08:39<5:59:51,  8.51s/it, gpt_loss=0.311, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3899/6434 [9:08:39<5:51:09,  8.31s/it, gpt_loss=0.311, loss_mean=0.302][A
+[LID Router Debug] Step: 3900
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [2, 9, 0, 0, 2, 5, 3, 1, 0, 6]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  61%|██████    | 3899/6434 [9:08:48<5:51:09,  8.31s/it, gpt_loss=0.347, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████    | 3900/6434 [9:08:48<5:49:25,  8.27s/it, gpt_loss=0.347, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████    | 3900/6434 [9:08:56<5:49:25,  8.27s/it, gpt_loss=0.342, loss_mean=0.31] [A
+Train step of epoch 0:  61%|██████    | 3901/6434 [9:08:56<5:46:14,  8.20s/it, gpt_loss=0.342, loss_mean=0.31][A
+Train step of epoch 0:  61%|██████    | 3901/6434 [9:09:03<5:46:14,  8.20s/it, gpt_loss=0.306, loss_mean=0.309][A
+Train step of epoch 0:  61%|██████    | 3902/6434 [9:09:03<5:40:25,  8.07s/it, gpt_loss=0.306, loss_mean=0.309][A
+Train step of epoch 0:  61%|██████    | 3902/6434 [9:09:13<5:40:25,  8.07s/it, gpt_loss=0.26, loss_mean=0.304] [A
+Train step of epoch 0:  61%|██████    | 3903/6434 [9:09:13<5:54:49,  8.41s/it, gpt_loss=0.26, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████    | 3903/6434 [9:09:21<5:54:49,  8.41s/it, gpt_loss=0.288, loss_mean=0.303][A
+Train step of epoch 0:  61%|██████    | 3904/6434 [9:09:21<5:48:58,  8.28s/it, gpt_loss=0.288, loss_mean=0.303][A
+Train step of epoch 0:  61%|██████    | 3904/6434 [9:09:28<5:48:58,  8.28s/it, gpt_loss=0.275, loss_mean=0.3]  [A
+Train step of epoch 0:  61%|██████    | 3905/6434 [9:09:28<5:42:51,  8.13s/it, gpt_loss=0.275, loss_mean=0.3][A
+Train step of epoch 0:  61%|██████    | 3905/6434 [9:09:36<5:42:51,  8.13s/it, gpt_loss=0.279, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3906/6434 [9:09:36<5:39:00,  8.05s/it, gpt_loss=0.279, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3906/6434 [9:09:45<5:39:00,  8.05s/it, gpt_loss=0.219, loss_mean=0.29] [A
+Train step of epoch 0:  61%|██████    | 3907/6434 [9:09:45<5:45:15,  8.20s/it, gpt_loss=0.219, loss_mean=0.29][A
+Train step of epoch 0:  61%|██████    | 3907/6434 [9:09:53<5:45:15,  8.20s/it, gpt_loss=0.29, loss_mean=0.29] [A
+Train step of epoch 0:  61%|██████    | 3908/6434 [9:09:53<5:48:11,  8.27s/it, gpt_loss=0.29, loss_mean=0.29][A
+Train step of epoch 0:  61%|██████    | 3908/6434 [9:10:02<5:48:11,  8.27s/it, gpt_loss=0.315, loss_mean=0.292][A
+Train step of epoch 0:  61%|██████    | 3909/6434 [9:10:02<5:51:15,  8.35s/it, gpt_loss=0.315, loss_mean=0.292][A
+[LID Router Debug] Step: 3910
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [4, 1, 3, 2, 2, 4, 1, 0, 1, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  61%|██████    | 3909/6434 [9:10:09<5:51:15,  8.35s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  61%|██████    | 3910/6434 [9:10:09<5:32:05,  7.89s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  61%|██████    | 3910/6434 [9:10:16<5:32:05,  7.89s/it, gpt_loss=0.268, loss_mean=0.287][A
+Train step of epoch 0:  61%|██████    | 3911/6434 [9:10:16<5:27:54,  7.80s/it, gpt_loss=0.268, loss_mean=0.287][A
+Train step of epoch 0:  61%|██████    | 3911/6434 [9:10:25<5:27:54,  7.80s/it, gpt_loss=0.349, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████    | 3912/6434 [9:10:25<5:45:13,  8.21s/it, gpt_loss=0.349, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████    | 3912/6434 [9:10:36<5:45:13,  8.21s/it, gpt_loss=0.288, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████    | 3913/6434 [9:10:36<6:14:21,  8.91s/it, gpt_loss=0.288, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████    | 3913/6434 [9:10:44<6:14:21,  8.91s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  61%|██████    | 3914/6434 [9:10:44<6:08:03,  8.76s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  61%|██████    | 3914/6434 [9:10:53<6:08:03,  8.76s/it, gpt_loss=0.331, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3915/6434 [9:10:53<6:02:52,  8.64s/it, gpt_loss=0.331, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████    | 3915/6434 [9:11:02<6:02:52,  8.64s/it, gpt_loss=0.258, loss_mean=0.294][A
+Train step of epoch 0:  61%|██████    | 3916/6434 [9:11:02<6:14:26,  8.92s/it, gpt_loss=0.258, loss_mean=0.294][A
+Train step of epoch 0:  61%|██████    | 3916/6434 [9:11:11<6:14:26,  8.92s/it, gpt_loss=0.361, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3917/6434 [9:11:11<6:11:35,  8.86s/it, gpt_loss=0.361, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3917/6434 [9:11:20<6:11:35,  8.86s/it, gpt_loss=0.284, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3918/6434 [9:11:20<6:09:37,  8.81s/it, gpt_loss=0.284, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3918/6434 [9:11:28<6:09:37,  8.81s/it, gpt_loss=0.291, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3919/6434 [9:11:28<6:05:34,  8.72s/it, gpt_loss=0.291, loss_mean=0.299][A
+[LID Router Debug] Step: 3920
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [9, 1, 9, 2, 9, 5, 3, 0, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  61%|██████    | 3919/6434 [9:11:36<6:05:34,  8.72s/it, gpt_loss=0.332, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3920/6434 [9:11:36<5:54:42,  8.47s/it, gpt_loss=0.332, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3920/6434 [9:11:44<5:54:42,  8.47s/it, gpt_loss=0.332, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3921/6434 [9:11:44<5:50:23,  8.37s/it, gpt_loss=0.332, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3921/6434 [9:11:53<5:50:23,  8.37s/it, gpt_loss=0.312, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████    | 3922/6434 [9:11:53<5:59:21,  8.58s/it, gpt_loss=0.312, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████    | 3922/6434 [9:12:01<5:59:21,  8.58s/it, gpt_loss=0.26, loss_mean=0.301] [A
+Train step of epoch 0:  61%|██████    | 3923/6434 [9:12:01<5:49:11,  8.34s/it, gpt_loss=0.26, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3923/6434 [9:12:09<5:49:11,  8.34s/it, gpt_loss=0.335, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████    | 3924/6434 [9:12:09<5:39:20,  8.11s/it, gpt_loss=0.335, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████    | 3924/6434 [9:12:17<5:39:20,  8.11s/it, gpt_loss=0.306, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3925/6434 [9:12:17<5:47:58,  8.32s/it, gpt_loss=0.306, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3925/6434 [9:12:25<5:47:58,  8.32s/it, gpt_loss=0.281, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3926/6434 [9:12:25<5:39:39,  8.13s/it, gpt_loss=0.281, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3926/6434 [9:12:33<5:39:39,  8.13s/it, gpt_loss=0.266, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3927/6434 [9:12:33<5:41:35,  8.18s/it, gpt_loss=0.266, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3927/6434 [9:12:42<5:41:35,  8.18s/it, gpt_loss=0.28, loss_mean=0.297] [A
+Train step of epoch 0:  61%|██████    | 3928/6434 [9:12:42<5:46:41,  8.30s/it, gpt_loss=0.28, loss_mean=0.297][A
+Train step of epoch 0:  61%|██████    | 3928/6434 [9:12:51<5:46:41,  8.30s/it, gpt_loss=0.394, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████    | 3929/6434 [9:12:51<5:49:49,  8.38s/it, gpt_loss=0.394, loss_mean=0.306][A
+[LID Router Debug] Step: 3930
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [0, 2, 4, 0, 3, 0, 5, 2, 2, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5}
+
+Train step of epoch 0:  61%|██████    | 3929/6434 [9:13:00<5:49:49,  8.38s/it, gpt_loss=0.324, loss_mean=0.308][A
+Train step of epoch 0:  61%|██████    | 3930/6434 [9:13:00<6:06:03,  8.77s/it, gpt_loss=0.324, loss_mean=0.308][A
+Train step of epoch 0:  61%|██████    | 3930/6434 [9:13:09<6:06:03,  8.77s/it, gpt_loss=0.311, loss_mean=0.308][A
+Train step of epoch 0:  61%|██████    | 3931/6434 [9:13:09<6:08:37,  8.84s/it, gpt_loss=0.311, loss_mean=0.308][A
+Train step of epoch 0:  61%|██████    | 3931/6434 [9:13:19<6:08:37,  8.84s/it, gpt_loss=0.27, loss_mean=0.305] [A
+Train step of epoch 0:  61%|██████    | 3932/6434 [9:13:19<6:14:39,  8.98s/it, gpt_loss=0.27, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3932/6434 [9:13:26<6:14:39,  8.98s/it, gpt_loss=0.267, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3933/6434 [9:13:26<5:58:43,  8.61s/it, gpt_loss=0.267, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3933/6434 [9:13:35<5:58:43,  8.61s/it, gpt_loss=0.343, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3934/6434 [9:13:35<5:56:33,  8.56s/it, gpt_loss=0.343, loss_mean=0.305][A
+Train step of epoch 0:  61%|██████    | 3934/6434 [9:13:43<5:56:33,  8.56s/it, gpt_loss=0.215, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████    | 3935/6434 [9:13:43<5:56:14,  8.55s/it, gpt_loss=0.215, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████    | 3935/6434 [9:13:51<5:56:14,  8.55s/it, gpt_loss=0.324, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3936/6434 [9:13:51<5:50:52,  8.43s/it, gpt_loss=0.324, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████    | 3936/6434 [9:13:59<5:50:52,  8.43s/it, gpt_loss=0.284, loss_mean=0.297][A
+Train step of epoch 0:  61%|██████    | 3937/6434 [9:13:59<5:41:46,  8.21s/it, gpt_loss=0.284, loss_mean=0.297][A
+Train step of epoch 0:  61%|██████    | 3937/6434 [9:14:08<5:41:46,  8.21s/it, gpt_loss=0.339, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3938/6434 [9:14:08<5:47:45,  8.36s/it, gpt_loss=0.339, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████    | 3938/6434 [9:14:16<5:47:45,  8.36s/it, gpt_loss=0.296, loss_mean=0.301][A
+Train step of epoch 0:  61%|██████    | 3939/6434 [9:14:16<5:42:33,  8.24s/it, gpt_loss=0.296, loss_mean=0.301][A
+[LID Router Debug] Step: 3940
+Batch Size: 10
+Audio Batch Size: 143
+LID Assignments: [2, 4, 3, 9, 0, 1, 4, 5, 3, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  61%|██████    | 3939/6434 [9:14:25<5:42:33,  8.24s/it, gpt_loss=0.327, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████    | 3940/6434 [9:14:25<5:54:55,  8.54s/it, gpt_loss=0.327, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████    | 3940/6434 [9:14:33<5:54:55,  8.54s/it, gpt_loss=0.329, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████▏   | 3941/6434 [9:14:33<5:53:57,  8.52s/it, gpt_loss=0.329, loss_mean=0.306][A
+Train step of epoch 0:  61%|██████▏   | 3941/6434 [9:14:42<5:53:57,  8.52s/it, gpt_loss=0.222, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████▏   | 3942/6434 [9:14:42<5:54:47,  8.54s/it, gpt_loss=0.222, loss_mean=0.298][A
+Train step of epoch 0:  61%|██████▏   | 3942/6434 [9:14:51<5:54:47,  8.54s/it, gpt_loss=0.281, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████▏   | 3943/6434 [9:14:51<6:03:48,  8.76s/it, gpt_loss=0.281, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████▏   | 3943/6434 [9:14:59<6:03:48,  8.76s/it, gpt_loss=0.29, loss_mean=0.295] [A
+Train step of epoch 0:  61%|██████▏   | 3944/6434 [9:14:59<5:46:22,  8.35s/it, gpt_loss=0.29, loss_mean=0.295][A
+Train step of epoch 0:  61%|██████▏   | 3944/6434 [9:15:08<5:46:22,  8.35s/it, gpt_loss=0.272, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████▏   | 3945/6434 [9:15:08<5:54:27,  8.54s/it, gpt_loss=0.272, loss_mean=0.293][A
+Train step of epoch 0:  61%|██████▏   | 3945/6434 [9:15:15<5:54:27,  8.54s/it, gpt_loss=0.244, loss_mean=0.288][A
+Train step of epoch 0:  61%|██████▏   | 3946/6434 [9:15:15<5:41:45,  8.24s/it, gpt_loss=0.244, loss_mean=0.288][A
+Train step of epoch 0:  61%|██████▏   | 3946/6434 [9:15:24<5:41:45,  8.24s/it, gpt_loss=0.33, loss_mean=0.292] [A
+Train step of epoch 0:  61%|██████▏   | 3947/6434 [9:15:24<5:48:10,  8.40s/it, gpt_loss=0.33, loss_mean=0.292][A
+Train step of epoch 0:  61%|██████▏   | 3947/6434 [9:15:33<5:48:10,  8.40s/it, gpt_loss=0.327, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████▏   | 3948/6434 [9:15:33<5:57:10,  8.62s/it, gpt_loss=0.327, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████▏   | 3948/6434 [9:15:41<5:57:10,  8.62s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0:  61%|██████▏   | 3949/6434 [9:15:41<5:52:22,  8.51s/it, gpt_loss=0.302, loss_mean=0.296][A
+[LID Router Debug] Step: 3950
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [2, 0, 6, 3, 0, 2, 4, 1, 2, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  61%|██████▏   | 3949/6434 [9:15:49<5:52:22,  8.51s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  61%|██████▏   | 3950/6434 [9:15:49<5:39:42,  8.21s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  61%|██████▏   | 3950/6434 [9:15:57<5:39:42,  8.21s/it, gpt_loss=0.305, loss_mean=0.295][A
+Train step of epoch 0:  61%|██████▏   | 3951/6434 [9:15:57<5:43:47,  8.31s/it, gpt_loss=0.305, loss_mean=0.295][A
+Train step of epoch 0:  61%|██████▏   | 3951/6434 [9:16:06<5:43:47,  8.31s/it, gpt_loss=0.336, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████▏   | 3952/6434 [9:16:06<5:47:43,  8.41s/it, gpt_loss=0.336, loss_mean=0.299][A
+Train step of epoch 0:  61%|██████▏   | 3952/6434 [9:16:15<5:47:43,  8.41s/it, gpt_loss=0.351, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████▏   | 3953/6434 [9:16:15<5:53:15,  8.54s/it, gpt_loss=0.351, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████▏   | 3953/6434 [9:16:23<5:53:15,  8.54s/it, gpt_loss=0.34, loss_mean=0.308] [A
+Train step of epoch 0:  61%|██████▏   | 3954/6434 [9:16:23<5:50:07,  8.47s/it, gpt_loss=0.34, loss_mean=0.308][A
+Train step of epoch 0:  61%|██████▏   | 3954/6434 [9:16:31<5:50:07,  8.47s/it, gpt_loss=0.25, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████▏   | 3955/6434 [9:16:31<5:46:33,  8.39s/it, gpt_loss=0.25, loss_mean=0.302][A
+Train step of epoch 0:  61%|██████▏   | 3955/6434 [9:16:42<5:46:33,  8.39s/it, gpt_loss=0.325, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████▏   | 3956/6434 [9:16:42<6:07:33,  8.90s/it, gpt_loss=0.325, loss_mean=0.304][A
+Train step of epoch 0:  61%|██████▏   | 3956/6434 [9:16:50<6:07:33,  8.90s/it, gpt_loss=0.374, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 3957/6434 [9:16:50<5:56:27,  8.63s/it, gpt_loss=0.374, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 3957/6434 [9:16:58<5:56:27,  8.63s/it, gpt_loss=0.196, loss_mean=0.3]  [A
+Train step of epoch 0:  62%|██████▏   | 3958/6434 [9:16:58<5:53:36,  8.57s/it, gpt_loss=0.196, loss_mean=0.3][A
+Train step of epoch 0:  62%|██████▏   | 3958/6434 [9:17:05<5:53:36,  8.57s/it, gpt_loss=0.246, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3959/6434 [9:17:05<5:35:11,  8.13s/it, gpt_loss=0.246, loss_mean=0.294][A
+[LID Router Debug] Step: 3960
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [3, 2, 9, 9, 4, 2, 4, 7, 4, 0]
+Active Experts in Batch: {0, 2, 3, 4, 7, 9}
+
+Train step of epoch 0:  62%|██████▏   | 3959/6434 [9:17:14<5:35:11,  8.13s/it, gpt_loss=0.324, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3960/6434 [9:17:14<5:48:01,  8.44s/it, gpt_loss=0.324, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3960/6434 [9:17:23<5:48:01,  8.44s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  62%|██████▏   | 3961/6434 [9:17:23<5:55:27,  8.62s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  62%|██████▏   | 3961/6434 [9:17:32<5:55:27,  8.62s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3962/6434 [9:17:32<5:54:23,  8.60s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3962/6434 [9:17:41<5:54:23,  8.60s/it, gpt_loss=0.254, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3963/6434 [9:17:41<5:56:46,  8.66s/it, gpt_loss=0.254, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3963/6434 [9:17:50<5:56:46,  8.66s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3964/6434 [9:17:50<6:07:30,  8.93s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3964/6434 [9:17:59<6:07:30,  8.93s/it, gpt_loss=0.26, loss_mean=0.29]  [A
+Train step of epoch 0:  62%|██████▏   | 3965/6434 [9:17:59<6:04:02,  8.85s/it, gpt_loss=0.26, loss_mean=0.29][A
+Train step of epoch 0:  62%|██████▏   | 3965/6434 [9:18:07<6:04:02,  8.85s/it, gpt_loss=0.318, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3966/6434 [9:18:07<6:01:42,  8.79s/it, gpt_loss=0.318, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3966/6434 [9:18:15<6:01:42,  8.79s/it, gpt_loss=0.315, loss_mean=0.295][A
+Train step of epoch 0:  62%|██████▏   | 3967/6434 [9:18:15<5:43:58,  8.37s/it, gpt_loss=0.315, loss_mean=0.295][A
+Train step of epoch 0:  62%|██████▏   | 3967/6434 [9:18:23<5:43:58,  8.37s/it, gpt_loss=0.303, loss_mean=0.296][A
+Train step of epoch 0:  62%|██████▏   | 3968/6434 [9:18:23<5:46:32,  8.43s/it, gpt_loss=0.303, loss_mean=0.296][A
+Train step of epoch 0:  62%|██████▏   | 3968/6434 [9:18:32<5:46:32,  8.43s/it, gpt_loss=0.264, loss_mean=0.293][A
+Train step of epoch 0:  62%|██████▏   | 3969/6434 [9:18:32<5:51:36,  8.56s/it, gpt_loss=0.264, loss_mean=0.293][A
+[LID Router Debug] Step: 3970
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [6, 1, 4, 6, 9, 9, 3, 0, 9, 1]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+
+Train step of epoch 0:  62%|██████▏   | 3969/6434 [9:18:40<5:51:36,  8.56s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3970/6434 [9:18:40<5:40:08,  8.28s/it, gpt_loss=0.311, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3970/6434 [9:18:48<5:40:08,  8.28s/it, gpt_loss=0.317, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3971/6434 [9:18:48<5:41:44,  8.32s/it, gpt_loss=0.317, loss_mean=0.297][A
+Train step of epoch 0:  62%|██████▏   | 3971/6434 [9:18:56<5:41:44,  8.32s/it, gpt_loss=0.368, loss_mean=0.304][A
+Train step of epoch 0:  62%|██████▏   | 3972/6434 [9:18:56<5:37:12,  8.22s/it, gpt_loss=0.368, loss_mean=0.304][A
+Train step of epoch 0:  62%|██████▏   | 3972/6434 [9:19:05<5:37:12,  8.22s/it, gpt_loss=0.251, loss_mean=0.298][A
+Train step of epoch 0:  62%|██████▏   | 3973/6434 [9:19:05<5:46:11,  8.44s/it, gpt_loss=0.251, loss_mean=0.298][A
+Train step of epoch 0:  62%|██████▏   | 3973/6434 [9:19:14<5:46:11,  8.44s/it, gpt_loss=0.313, loss_mean=0.3]  [A
+Train step of epoch 0:  62%|██████▏   | 3974/6434 [9:19:14<5:54:54,  8.66s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  62%|██████▏   | 3974/6434 [9:19:23<5:54:54,  8.66s/it, gpt_loss=0.239, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3975/6434 [9:19:23<5:56:24,  8.70s/it, gpt_loss=0.239, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3975/6434 [9:19:31<5:56:24,  8.70s/it, gpt_loss=0.259, loss_mean=0.29] [A
+Train step of epoch 0:  62%|██████▏   | 3976/6434 [9:19:31<5:50:25,  8.55s/it, gpt_loss=0.259, loss_mean=0.29][A
+Train step of epoch 0:  62%|██████▏   | 3976/6434 [9:19:40<5:50:25,  8.55s/it, gpt_loss=0.224, loss_mean=0.284][A
+Train step of epoch 0:  62%|██████▏   | 3977/6434 [9:19:40<5:44:30,  8.41s/it, gpt_loss=0.224, loss_mean=0.284][A
+Train step of epoch 0:  62%|██████▏   | 3977/6434 [9:19:47<5:44:30,  8.41s/it, gpt_loss=0.348, loss_mean=0.29] [A
+Train step of epoch 0:  62%|██████▏   | 3978/6434 [9:19:47<5:36:12,  8.21s/it, gpt_loss=0.348, loss_mean=0.29][A
+Train step of epoch 0:  62%|██████▏   | 3978/6434 [9:19:55<5:36:12,  8.21s/it, gpt_loss=0.252, loss_mean=0.286][A
+Train step of epoch 0:  62%|██████▏   | 3979/6434 [9:19:55<5:31:14,  8.10s/it, gpt_loss=0.252, loss_mean=0.286][A
+[LID Router Debug] Step: 3980
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [0, 9, 9, 7, 4, 6, 1, 1, 0, 5]
+Active Experts in Batch: {0, 1, 4, 5, 6, 7, 9}
+
+Train step of epoch 0:  62%|██████▏   | 3979/6434 [9:20:04<5:31:14,  8.10s/it, gpt_loss=0.326, loss_mean=0.29] [A
+Train step of epoch 0:  62%|██████▏   | 3980/6434 [9:20:04<5:39:15,  8.29s/it, gpt_loss=0.326, loss_mean=0.29][A
+Train step of epoch 0:  62%|██████▏   | 3980/6434 [9:20:11<5:39:15,  8.29s/it, gpt_loss=0.392, loss_mean=0.3] [A
+Train step of epoch 0:  62%|██████▏   | 3981/6434 [9:20:11<5:26:42,  7.99s/it, gpt_loss=0.392, loss_mean=0.3][A
+Train step of epoch 0:  62%|██████▏   | 3981/6434 [9:20:21<5:26:42,  7.99s/it, gpt_loss=0.302, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 3982/6434 [9:20:21<5:44:11,  8.42s/it, gpt_loss=0.302, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 3982/6434 [9:20:29<5:44:11,  8.42s/it, gpt_loss=0.239, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3983/6434 [9:20:29<5:42:03,  8.37s/it, gpt_loss=0.239, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3983/6434 [9:20:37<5:42:03,  8.37s/it, gpt_loss=0.294, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3984/6434 [9:20:37<5:44:38,  8.44s/it, gpt_loss=0.294, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 3984/6434 [9:20:45<5:44:38,  8.44s/it, gpt_loss=0.364, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 3985/6434 [9:20:45<5:36:11,  8.24s/it, gpt_loss=0.364, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 3985/6434 [9:20:54<5:36:11,  8.24s/it, gpt_loss=0.334, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 3986/6434 [9:20:54<5:37:58,  8.28s/it, gpt_loss=0.334, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 3986/6434 [9:21:02<5:37:58,  8.28s/it, gpt_loss=0.27, loss_mean=0.301] [A
+Train step of epoch 0:  62%|██████▏   | 3987/6434 [9:21:02<5:34:34,  8.20s/it, gpt_loss=0.27, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 3987/6434 [9:21:10<5:34:34,  8.20s/it, gpt_loss=0.373, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 3988/6434 [9:21:10<5:34:10,  8.20s/it, gpt_loss=0.373, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 3988/6434 [9:21:18<5:34:10,  8.20s/it, gpt_loss=0.302, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 3989/6434 [9:21:18<5:37:29,  8.28s/it, gpt_loss=0.302, loss_mean=0.308][A
+[LID Router Debug] Step: 3990
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [2, 9, 9, 2, 1, 9, 3, 5, 1, 3]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  62%|██████▏   | 3989/6434 [9:21:27<5:37:29,  8.28s/it, gpt_loss=0.264, loss_mean=0.303][A
+Train step of epoch 0:  62%|██████▏   | 3990/6434 [9:21:27<5:39:27,  8.33s/it, gpt_loss=0.264, loss_mean=0.303][A
+Train step of epoch 0:  62%|██████▏   | 3990/6434 [9:21:36<5:39:27,  8.33s/it, gpt_loss=0.26, loss_mean=0.299] [A
+Train step of epoch 0:  62%|██████▏   | 3991/6434 [9:21:36<5:55:08,  8.72s/it, gpt_loss=0.26, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 3991/6434 [9:21:45<5:55:08,  8.72s/it, gpt_loss=0.31, loss_mean=0.3]  [A
+Train step of epoch 0:  62%|██████▏   | 3992/6434 [9:21:45<5:53:33,  8.69s/it, gpt_loss=0.31, loss_mean=0.3][A
+Train step of epoch 0:  62%|██████▏   | 3992/6434 [9:21:52<5:53:33,  8.69s/it, gpt_loss=0.366, loss_mean=0.307][A
+Train step of epoch 0:  62%|██████▏   | 3993/6434 [9:21:52<5:39:19,  8.34s/it, gpt_loss=0.366, loss_mean=0.307][A
+Train step of epoch 0:  62%|██████▏   | 3993/6434 [9:22:00<5:39:19,  8.34s/it, gpt_loss=0.23, loss_mean=0.299] [A
+Train step of epoch 0:  62%|██████▏   | 3994/6434 [9:22:00<5:32:22,  8.17s/it, gpt_loss=0.23, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 3994/6434 [9:22:08<5:32:22,  8.17s/it, gpt_loss=0.353, loss_mean=0.304][A
+Train step of epoch 0:  62%|██████▏   | 3995/6434 [9:22:08<5:29:50,  8.11s/it, gpt_loss=0.353, loss_mean=0.304][A
+Train step of epoch 0:  62%|██████▏   | 3995/6434 [9:22:16<5:29:50,  8.11s/it, gpt_loss=0.258, loss_mean=0.3]  [A
+Train step of epoch 0:  62%|██████▏   | 3996/6434 [9:22:16<5:30:58,  8.15s/it, gpt_loss=0.258, loss_mean=0.3][A
+Train step of epoch 0:  62%|██████▏   | 3996/6434 [9:22:26<5:30:58,  8.15s/it, gpt_loss=0.355, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 3997/6434 [9:22:26<5:49:53,  8.61s/it, gpt_loss=0.355, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 3997/6434 [9:22:36<5:49:53,  8.61s/it, gpt_loss=0.246, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 3998/6434 [9:22:36<6:01:34,  8.91s/it, gpt_loss=0.246, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 3998/6434 [9:22:44<6:01:34,  8.91s/it, gpt_loss=0.348, loss_mean=0.304][A
+Train step of epoch 0:  62%|██████▏   | 3999/6434 [9:22:44<5:49:40,  8.62s/it, gpt_loss=0.348, loss_mean=0.304][A
+[LID Router Debug] Step: 4000
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 9, 4, 2, 3, 9, 7, 6, 9, 4]
+Active Experts in Batch: {2, 3, 4, 6, 7, 9}
+[2026-02-07 01:18:56,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=0, lr=[1.810521903233307e-05, 1.810521903233307e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 01:18:56,560] [INFO] [timer.py:260:stop] epoch=0/micro_step=4000/global_step=2000, RunningAvgSamplesPerSec=4.7480129866802026, CurrSamplesPerSec=4.908622158692772, MemAllocated=12.86GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  62%|██████▏   | 3999/6434 [9:22:52<5:49:40,  8.62s/it, gpt_loss=0.28, loss_mean=0.302] [A
+Train step of epoch 0:  62%|██████▏   | 4000/6434 [9:22:52<5:46:45,  8.55s/it, gpt_loss=0.28, loss_mean=0.302][A
+Train step of epoch 0:  62%|██████▏   | 4000/6434 [9:23:02<5:46:45,  8.55s/it, gpt_loss=0.33, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 4001/6434 [9:23:02<6:04:04,  8.98s/it, gpt_loss=0.33, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 4001/6434 [9:23:10<6:04:04,  8.98s/it, gpt_loss=0.341, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4002/6434 [9:23:10<5:49:11,  8.61s/it, gpt_loss=0.341, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4002/6434 [9:23:17<5:49:11,  8.61s/it, gpt_loss=0.326, loss_mean=0.31] [A
+Train step of epoch 0:  62%|██████▏   | 4003/6434 [9:23:17<5:35:58,  8.29s/it, gpt_loss=0.326, loss_mean=0.31][A
+Train step of epoch 0:  62%|██████▏   | 4003/6434 [9:23:26<5:35:58,  8.29s/it, gpt_loss=0.315, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 4004/6434 [9:23:26<5:38:48,  8.37s/it, gpt_loss=0.315, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 4004/6434 [9:23:34<5:38:48,  8.37s/it, gpt_loss=0.258, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 4005/6434 [9:23:34<5:36:58,  8.32s/it, gpt_loss=0.258, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 4005/6434 [9:23:43<5:36:58,  8.32s/it, gpt_loss=0.35, loss_mean=0.31]  [A
+Train step of epoch 0:  62%|██████▏   | 4006/6434 [9:23:43<5:41:21,  8.44s/it, gpt_loss=0.35, loss_mean=0.31][A
+Train step of epoch 0:  62%|██████▏   | 4006/6434 [9:23:51<5:41:21,  8.44s/it, gpt_loss=0.335, loss_mean=0.312][A
+Train step of epoch 0:  62%|██████▏   | 4007/6434 [9:23:51<5:35:47,  8.30s/it, gpt_loss=0.335, loss_mean=0.312][A
+Train step of epoch 0:  62%|██████▏   | 4007/6434 [9:23:59<5:35:47,  8.30s/it, gpt_loss=0.264, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4008/6434 [9:23:59<5:30:04,  8.16s/it, gpt_loss=0.264, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4008/6434 [9:24:07<5:30:04,  8.16s/it, gpt_loss=0.298, loss_mean=0.307][A
+Train step of epoch 0:  62%|██████▏   | 4009/6434 [9:24:07<5:29:55,  8.16s/it, gpt_loss=0.298, loss_mean=0.307][A
+[LID Router Debug] Step: 4010
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [9, 1, 3, 9, 2, 0, 3, 3, 1, 1]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  62%|██████▏   | 4009/6434 [9:24:15<5:29:55,  8.16s/it, gpt_loss=0.236, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 4010/6434 [9:24:15<5:30:23,  8.18s/it, gpt_loss=0.236, loss_mean=0.299][A
+Train step of epoch 0:  62%|██████▏   | 4010/6434 [9:24:23<5:30:23,  8.18s/it, gpt_loss=0.373, loss_mean=0.307][A
+Train step of epoch 0:  62%|██████▏   | 4011/6434 [9:24:23<5:29:27,  8.16s/it, gpt_loss=0.373, loss_mean=0.307][A
+Train step of epoch 0:  62%|██████▏   | 4011/6434 [9:24:31<5:29:27,  8.16s/it, gpt_loss=0.334, loss_mean=0.31] [A
+Train step of epoch 0:  62%|██████▏   | 4012/6434 [9:24:31<5:29:22,  8.16s/it, gpt_loss=0.334, loss_mean=0.31][A
+Train step of epoch 0:  62%|██████▏   | 4012/6434 [9:24:40<5:29:22,  8.16s/it, gpt_loss=0.325, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 4013/6434 [9:24:40<5:37:24,  8.36s/it, gpt_loss=0.325, loss_mean=0.311][A
+Train step of epoch 0:  62%|██████▏   | 4013/6434 [9:24:48<5:37:24,  8.36s/it, gpt_loss=0.279, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4014/6434 [9:24:48<5:29:34,  8.17s/it, gpt_loss=0.279, loss_mean=0.308][A
+Train step of epoch 0:  62%|██████▏   | 4014/6434 [9:24:57<5:29:34,  8.17s/it, gpt_loss=0.28, loss_mean=0.305] [A
+Train step of epoch 0:  62%|██████▏   | 4015/6434 [9:24:57<5:45:36,  8.57s/it, gpt_loss=0.28, loss_mean=0.305][A
+Train step of epoch 0:  62%|██████▏   | 4015/6434 [9:25:06<5:45:36,  8.57s/it, gpt_loss=0.346, loss_mean=0.309][A
+Train step of epoch 0:  62%|██████▏   | 4016/6434 [9:25:06<5:41:21,  8.47s/it, gpt_loss=0.346, loss_mean=0.309][A
+Train step of epoch 0:  62%|██████▏   | 4016/6434 [9:25:13<5:41:21,  8.47s/it, gpt_loss=0.251, loss_mean=0.303][A
+Train step of epoch 0:  62%|██████▏   | 4017/6434 [9:25:13<5:29:06,  8.17s/it, gpt_loss=0.251, loss_mean=0.303][A
+Train step of epoch 0:  62%|██████▏   | 4017/6434 [9:25:20<5:29:06,  8.17s/it, gpt_loss=0.211, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 4018/6434 [9:25:20<5:14:41,  7.82s/it, gpt_loss=0.211, loss_mean=0.294][A
+Train step of epoch 0:  62%|██████▏   | 4018/6434 [9:25:28<5:14:41,  7.82s/it, gpt_loss=0.257, loss_mean=0.29] [A
+Train step of epoch 0:  62%|██████▏   | 4019/6434 [9:25:28<5:19:45,  7.94s/it, gpt_loss=0.257, loss_mean=0.29][A
+[LID Router Debug] Step: 4020
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [0, 4, 4, 3, 1, 0, 4, 1, 1, 3]
+Active Experts in Batch: {0, 1, 3, 4}
+
+Train step of epoch 0:  62%|██████▏   | 4019/6434 [9:25:37<5:19:45,  7.94s/it, gpt_loss=0.262, loss_mean=0.288][A
+Train step of epoch 0:  62%|██████▏   | 4020/6434 [9:25:37<5:32:05,  8.25s/it, gpt_loss=0.262, loss_mean=0.288][A
+Train step of epoch 0:  62%|██████▏   | 4020/6434 [9:25:45<5:32:05,  8.25s/it, gpt_loss=0.418, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 4021/6434 [9:25:45<5:27:35,  8.15s/it, gpt_loss=0.418, loss_mean=0.301][A
+Train step of epoch 0:  62%|██████▏   | 4021/6434 [9:25:53<5:27:35,  8.15s/it, gpt_loss=0.414, loss_mean=0.312][A
+Train step of epoch 0:  63%|██████▎   | 4022/6434 [9:25:53<5:21:37,  8.00s/it, gpt_loss=0.414, loss_mean=0.312][A
+Train step of epoch 0:  63%|██████▎   | 4022/6434 [9:26:00<5:21:37,  8.00s/it, gpt_loss=0.349, loss_mean=0.316][A
+Train step of epoch 0:  63%|██████▎   | 4023/6434 [9:26:00<5:14:08,  7.82s/it, gpt_loss=0.349, loss_mean=0.316][A
+Train step of epoch 0:  63%|██████▎   | 4023/6434 [9:26:09<5:14:08,  7.82s/it, gpt_loss=0.271, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4024/6434 [9:26:09<5:21:43,  8.01s/it, gpt_loss=0.271, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4024/6434 [9:26:18<5:21:43,  8.01s/it, gpt_loss=0.287, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4025/6434 [9:26:18<5:36:08,  8.37s/it, gpt_loss=0.287, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4025/6434 [9:26:27<5:36:08,  8.37s/it, gpt_loss=0.242, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4026/6434 [9:26:27<5:49:33,  8.71s/it, gpt_loss=0.242, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4026/6434 [9:26:36<5:49:33,  8.71s/it, gpt_loss=0.29, loss_mean=0.301] [A
+Train step of epoch 0:  63%|██████▎   | 4027/6434 [9:26:36<5:49:05,  8.70s/it, gpt_loss=0.29, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4027/6434 [9:26:44<5:49:05,  8.70s/it, gpt_loss=0.314, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4028/6434 [9:26:44<5:43:33,  8.57s/it, gpt_loss=0.314, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4028/6434 [9:26:52<5:43:33,  8.57s/it, gpt_loss=0.353, loss_mean=0.307][A
+Train step of epoch 0:  63%|██████▎   | 4029/6434 [9:26:52<5:35:31,  8.37s/it, gpt_loss=0.353, loss_mean=0.307][A
+[LID Router Debug] Step: 4030
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [4, 6, 4, 4, 1, 1, 0, 2, 3, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  63%|██████▎   | 4029/6434 [9:27:01<5:35:31,  8.37s/it, gpt_loss=0.338, loss_mean=0.31] [A
+Train step of epoch 0:  63%|██████▎   | 4030/6434 [9:27:01<5:42:30,  8.55s/it, gpt_loss=0.338, loss_mean=0.31][A
+Train step of epoch 0:  63%|██████▎   | 4030/6434 [9:27:09<5:42:30,  8.55s/it, gpt_loss=0.352, loss_mean=0.314][A
+Train step of epoch 0:  63%|██████▎   | 4031/6434 [9:27:09<5:29:11,  8.22s/it, gpt_loss=0.352, loss_mean=0.314][A
+Train step of epoch 0:  63%|██████▎   | 4031/6434 [9:27:17<5:29:11,  8.22s/it, gpt_loss=0.324, loss_mean=0.315][A
+Train step of epoch 0:  63%|██████▎   | 4032/6434 [9:27:17<5:34:02,  8.34s/it, gpt_loss=0.324, loss_mean=0.315][A
+Train step of epoch 0:  63%|██████▎   | 4032/6434 [9:27:28<5:34:02,  8.34s/it, gpt_loss=0.273, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4033/6434 [9:27:28<5:56:07,  8.90s/it, gpt_loss=0.273, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4033/6434 [9:27:36<5:56:07,  8.90s/it, gpt_loss=0.387, loss_mean=0.319][A
+Train step of epoch 0:  63%|██████▎   | 4034/6434 [9:27:36<5:46:20,  8.66s/it, gpt_loss=0.387, loss_mean=0.319][A
+Train step of epoch 0:  63%|██████▎   | 4034/6434 [9:27:44<5:46:20,  8.66s/it, gpt_loss=0.3, loss_mean=0.317]  [A
+Train step of epoch 0:  63%|██████▎   | 4035/6434 [9:27:44<5:40:33,  8.52s/it, gpt_loss=0.3, loss_mean=0.317][A
+Train step of epoch 0:  63%|██████▎   | 4035/6434 [9:27:53<5:40:33,  8.52s/it, gpt_loss=0.262, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4036/6434 [9:27:53<5:50:08,  8.76s/it, gpt_loss=0.262, loss_mean=0.311][A
+Train step of epoch 0:  63%|██████▎   | 4036/6434 [9:28:01<5:50:08,  8.76s/it, gpt_loss=0.257, loss_mean=0.306][A
+Train step of epoch 0:  63%|██████▎   | 4037/6434 [9:28:01<5:41:17,  8.54s/it, gpt_loss=0.257, loss_mean=0.306][A
+Train step of epoch 0:  63%|██████▎   | 4037/6434 [9:28:10<5:41:17,  8.54s/it, gpt_loss=0.247, loss_mean=0.3]  [A
+Train step of epoch 0:  63%|██████▎   | 4038/6434 [9:28:10<5:45:00,  8.64s/it, gpt_loss=0.247, loss_mean=0.3][A
+Train step of epoch 0:  63%|██████▎   | 4038/6434 [9:28:19<5:45:00,  8.64s/it, gpt_loss=0.379, loss_mean=0.308][A
+Train step of epoch 0:  63%|██████▎   | 4039/6434 [9:28:19<5:48:36,  8.73s/it, gpt_loss=0.379, loss_mean=0.308][A
+[LID Router Debug] Step: 4040
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [9, 0, 1, 4, 2, 5, 3, 6, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  63%|██████▎   | 4039/6434 [9:28:26<5:48:36,  8.73s/it, gpt_loss=0.237, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4040/6434 [9:28:26<5:31:37,  8.31s/it, gpt_loss=0.237, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4040/6434 [9:28:35<5:31:37,  8.31s/it, gpt_loss=0.229, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4041/6434 [9:28:35<5:34:45,  8.39s/it, gpt_loss=0.229, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4041/6434 [9:28:44<5:34:45,  8.39s/it, gpt_loss=0.26, loss_mean=0.29]  [A
+Train step of epoch 0:  63%|██████▎   | 4042/6434 [9:28:44<5:39:55,  8.53s/it, gpt_loss=0.26, loss_mean=0.29][A
+Train step of epoch 0:  63%|██████▎   | 4042/6434 [9:28:53<5:39:55,  8.53s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  63%|██████▎   | 4043/6434 [9:28:53<5:43:15,  8.61s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  63%|██████▎   | 4043/6434 [9:29:00<5:43:15,  8.61s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  63%|██████▎   | 4044/6434 [9:29:00<5:27:00,  8.21s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  63%|██████▎   | 4044/6434 [9:29:08<5:27:00,  8.21s/it, gpt_loss=0.372, loss_mean=0.295][A
+Train step of epoch 0:  63%|██████▎   | 4045/6434 [9:29:08<5:29:27,  8.27s/it, gpt_loss=0.372, loss_mean=0.295][A
+Train step of epoch 0:  63%|██████▎   | 4045/6434 [9:29:17<5:29:27,  8.27s/it, gpt_loss=0.331, loss_mean=0.299][A
+Train step of epoch 0:  63%|██████▎   | 4046/6434 [9:29:17<5:29:37,  8.28s/it, gpt_loss=0.331, loss_mean=0.299][A
+Train step of epoch 0:  63%|██████▎   | 4046/6434 [9:29:25<5:29:37,  8.28s/it, gpt_loss=0.275, loss_mean=0.296][A
+Train step of epoch 0:  63%|██████▎   | 4047/6434 [9:29:25<5:33:01,  8.37s/it, gpt_loss=0.275, loss_mean=0.296][A
+Train step of epoch 0:  63%|██████▎   | 4047/6434 [9:29:35<5:33:01,  8.37s/it, gpt_loss=0.264, loss_mean=0.293][A
+Train step of epoch 0:  63%|██████▎   | 4048/6434 [9:29:35<5:49:05,  8.78s/it, gpt_loss=0.264, loss_mean=0.293][A
+Train step of epoch 0:  63%|██████▎   | 4048/6434 [9:29:44<5:49:05,  8.78s/it, gpt_loss=0.32, loss_mean=0.296] [A
+Train step of epoch 0:  63%|██████▎   | 4049/6434 [9:29:44<5:49:16,  8.79s/it, gpt_loss=0.32, loss_mean=0.296][A
+[LID Router Debug] Step: 4050
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [3, 1, 4, 3, 0, 2, 4, 4, 9, 10]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9, 10}
+
+Train step of epoch 0:  63%|██████▎   | 4049/6434 [9:29:52<5:49:16,  8.79s/it, gpt_loss=0.309, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4050/6434 [9:29:52<5:40:35,  8.57s/it, gpt_loss=0.309, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4050/6434 [9:29:59<5:40:35,  8.57s/it, gpt_loss=0.302, loss_mean=0.298][A
+Train step of epoch 0:  63%|██████▎   | 4051/6434 [9:29:59<5:26:49,  8.23s/it, gpt_loss=0.302, loss_mean=0.298][A
+Train step of epoch 0:  63%|██████▎   | 4051/6434 [9:30:08<5:26:49,  8.23s/it, gpt_loss=0.279, loss_mean=0.296][A
+Train step of epoch 0:  63%|██████▎   | 4052/6434 [9:30:08<5:39:58,  8.56s/it, gpt_loss=0.279, loss_mean=0.296][A
+Train step of epoch 0:  63%|██████▎   | 4052/6434 [9:30:17<5:39:58,  8.56s/it, gpt_loss=0.355, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4053/6434 [9:30:17<5:34:03,  8.42s/it, gpt_loss=0.355, loss_mean=0.302][A
+Train step of epoch 0:  63%|██████▎   | 4053/6434 [9:30:25<5:34:03,  8.42s/it, gpt_loss=0.266, loss_mean=0.298][A
+Train step of epoch 0:  63%|██████▎   | 4054/6434 [9:30:25<5:38:29,  8.53s/it, gpt_loss=0.266, loss_mean=0.298][A
+Train step of epoch 0:  63%|██████▎   | 4054/6434 [9:30:34<5:38:29,  8.53s/it, gpt_loss=0.283, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4055/6434 [9:30:34<5:44:46,  8.70s/it, gpt_loss=0.283, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4055/6434 [9:30:44<5:44:46,  8.70s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4056/6434 [9:30:44<5:49:27,  8.82s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4056/6434 [9:30:53<5:49:27,  8.82s/it, gpt_loss=0.245, loss_mean=0.292][A
+Train step of epoch 0:  63%|██████▎   | 4057/6434 [9:30:53<5:51:42,  8.88s/it, gpt_loss=0.245, loss_mean=0.292][A
+Train step of epoch 0:  63%|██████▎   | 4057/6434 [9:31:00<5:51:42,  8.88s/it, gpt_loss=0.285, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4058/6434 [9:31:00<5:39:54,  8.58s/it, gpt_loss=0.285, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4058/6434 [9:31:09<5:39:54,  8.58s/it, gpt_loss=0.236, loss_mean=0.285][A
+Train step of epoch 0:  63%|██████▎   | 4059/6434 [9:31:09<5:37:00,  8.51s/it, gpt_loss=0.236, loss_mean=0.285][A
+[LID Router Debug] Step: 4060
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [2, 1, 9, 9, 4, 0, 5, 4, 2, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  63%|██████▎   | 4059/6434 [9:31:16<5:37:00,  8.51s/it, gpt_loss=0.361, loss_mean=0.293][A
+Train step of epoch 0:  63%|██████▎   | 4060/6434 [9:31:16<5:25:10,  8.22s/it, gpt_loss=0.361, loss_mean=0.293][A
+Train step of epoch 0:  63%|██████▎   | 4060/6434 [9:31:24<5:25:10,  8.22s/it, gpt_loss=0.263, loss_mean=0.29] [A
+Train step of epoch 0:  63%|██████▎   | 4061/6434 [9:31:24<5:15:24,  7.98s/it, gpt_loss=0.263, loss_mean=0.29][A
+Train step of epoch 0:  63%|██████▎   | 4061/6434 [9:31:33<5:15:24,  7.98s/it, gpt_loss=0.247, loss_mean=0.286][A
+Train step of epoch 0:  63%|██████▎   | 4062/6434 [9:31:33<5:30:39,  8.36s/it, gpt_loss=0.247, loss_mean=0.286][A
+Train step of epoch 0:  63%|██████▎   | 4062/6434 [9:31:42<5:30:39,  8.36s/it, gpt_loss=0.468, loss_mean=0.304][A
+Train step of epoch 0:  63%|██████▎   | 4063/6434 [9:31:42<5:36:29,  8.52s/it, gpt_loss=0.468, loss_mean=0.304][A
+Train step of epoch 0:  63%|██████▎   | 4063/6434 [9:31:51<5:36:29,  8.52s/it, gpt_loss=0.358, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4064/6434 [9:31:51<5:39:19,  8.59s/it, gpt_loss=0.358, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4064/6434 [9:31:58<5:39:19,  8.59s/it, gpt_loss=0.264, loss_mean=0.305][A
+Train step of epoch 0:  63%|██████▎   | 4065/6434 [9:31:58<5:28:27,  8.32s/it, gpt_loss=0.264, loss_mean=0.305][A
+Train step of epoch 0:  63%|██████▎   | 4065/6434 [9:32:06<5:28:27,  8.32s/it, gpt_loss=0.347, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4066/6434 [9:32:06<5:22:04,  8.16s/it, gpt_loss=0.347, loss_mean=0.309][A
+Train step of epoch 0:  63%|██████▎   | 4066/6434 [9:32:15<5:22:04,  8.16s/it, gpt_loss=0.271, loss_mean=0.305][A
+Train step of epoch 0:  63%|██████▎   | 4067/6434 [9:32:15<5:24:54,  8.24s/it, gpt_loss=0.271, loss_mean=0.305][A
+Train step of epoch 0:  63%|██████▎   | 4067/6434 [9:32:23<5:24:54,  8.24s/it, gpt_loss=0.262, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4068/6434 [9:32:23<5:25:46,  8.26s/it, gpt_loss=0.262, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4068/6434 [9:32:33<5:25:46,  8.26s/it, gpt_loss=0.317, loss_mean=0.303][A
+Train step of epoch 0:  63%|██████▎   | 4069/6434 [9:32:33<5:46:16,  8.78s/it, gpt_loss=0.317, loss_mean=0.303][A
+[LID Router Debug] Step: 4070
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [1, 5, 2, 2, 4, 6, 4, 4, 9, 2]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  63%|██████▎   | 4069/6434 [9:32:40<5:46:16,  8.78s/it, gpt_loss=0.358, loss_mean=0.308][A
+Train step of epoch 0:  63%|██████▎   | 4070/6434 [9:32:40<5:28:39,  8.34s/it, gpt_loss=0.358, loss_mean=0.308][A
+Train step of epoch 0:  63%|██████▎   | 4070/6434 [9:32:48<5:28:39,  8.34s/it, gpt_loss=0.301, loss_mean=0.307][A
+Train step of epoch 0:  63%|██████▎   | 4071/6434 [9:32:48<5:22:45,  8.20s/it, gpt_loss=0.301, loss_mean=0.307][A
+Train step of epoch 0:  63%|██████▎   | 4071/6434 [9:32:57<5:22:45,  8.20s/it, gpt_loss=0.242, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4072/6434 [9:32:57<5:29:05,  8.36s/it, gpt_loss=0.242, loss_mean=0.301][A
+Train step of epoch 0:  63%|██████▎   | 4072/6434 [9:33:05<5:29:05,  8.36s/it, gpt_loss=0.266, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4073/6434 [9:33:05<5:27:39,  8.33s/it, gpt_loss=0.266, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4073/6434 [9:33:13<5:27:39,  8.33s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4074/6434 [9:33:13<5:26:51,  8.31s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4074/6434 [9:33:22<5:26:51,  8.31s/it, gpt_loss=0.3, loss_mean=0.297]  [A
+Train step of epoch 0:  63%|██████▎   | 4075/6434 [9:33:22<5:35:35,  8.54s/it, gpt_loss=0.3, loss_mean=0.297][A
+Train step of epoch 0:  63%|██████▎   | 4075/6434 [9:33:31<5:35:35,  8.54s/it, gpt_loss=0.244, loss_mean=0.292][A
+Train step of epoch 0:  63%|██████▎   | 4076/6434 [9:33:31<5:32:40,  8.47s/it, gpt_loss=0.244, loss_mean=0.292][A
+Train step of epoch 0:  63%|██████▎   | 4076/6434 [9:33:39<5:32:40,  8.47s/it, gpt_loss=0.313, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4077/6434 [9:33:39<5:34:25,  8.51s/it, gpt_loss=0.313, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4077/6434 [9:33:47<5:34:25,  8.51s/it, gpt_loss=0.357, loss_mean=0.3]  [A
+Train step of epoch 0:  63%|██████▎   | 4078/6434 [9:33:47<5:24:00,  8.25s/it, gpt_loss=0.357, loss_mean=0.3][A
+Train step of epoch 0:  63%|██████▎   | 4078/6434 [9:33:56<5:24:00,  8.25s/it, gpt_loss=0.249, loss_mean=0.295][A
+Train step of epoch 0:  63%|██████▎   | 4079/6434 [9:33:56<5:30:19,  8.42s/it, gpt_loss=0.249, loss_mean=0.295][A
+[LID Router Debug] Step: 4080
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [4, 9, 3, 2, 1, 1, 9, 5, 4, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  63%|██████▎   | 4079/6434 [9:34:04<5:30:19,  8.42s/it, gpt_loss=0.255, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4080/6434 [9:34:04<5:23:15,  8.24s/it, gpt_loss=0.255, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4080/6434 [9:34:13<5:23:15,  8.24s/it, gpt_loss=0.323, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4081/6434 [9:34:13<5:33:25,  8.50s/it, gpt_loss=0.323, loss_mean=0.294][A
+Train step of epoch 0:  63%|██████▎   | 4081/6434 [9:34:21<5:33:25,  8.50s/it, gpt_loss=0.262, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4082/6434 [9:34:21<5:31:08,  8.45s/it, gpt_loss=0.262, loss_mean=0.291][A
+Train step of epoch 0:  63%|██████▎   | 4082/6434 [9:34:29<5:31:08,  8.45s/it, gpt_loss=0.279, loss_mean=0.29] [A
+Train step of epoch 0:  63%|██████▎   | 4083/6434 [9:34:29<5:26:34,  8.33s/it, gpt_loss=0.279, loss_mean=0.29][A
+Train step of epoch 0:  63%|██████▎   | 4083/6434 [9:34:36<5:26:34,  8.33s/it, gpt_loss=0.339, loss_mean=0.295][A
+Train step of epoch 0:  63%|██████▎   | 4084/6434 [9:34:36<5:15:15,  8.05s/it, gpt_loss=0.339, loss_mean=0.295][A
+Train step of epoch 0:  63%|██████▎   | 4084/6434 [9:34:45<5:15:15,  8.05s/it, gpt_loss=0.235, loss_mean=0.289][A
+Train step of epoch 0:  63%|██████▎   | 4085/6434 [9:34:45<5:18:25,  8.13s/it, gpt_loss=0.235, loss_mean=0.289][A
+Train step of epoch 0:  63%|██████▎   | 4085/6434 [9:34:53<5:18:25,  8.13s/it, gpt_loss=0.355, loss_mean=0.295][A
+Train step of epoch 0:  64%|██████▎   | 4086/6434 [9:34:53<5:21:05,  8.20s/it, gpt_loss=0.355, loss_mean=0.295][A
+Train step of epoch 0:  64%|██████▎   | 4086/6434 [9:35:02<5:21:05,  8.20s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  64%|██████▎   | 4087/6434 [9:35:02<5:26:42,  8.35s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  64%|██████▎   | 4087/6434 [9:35:10<5:26:42,  8.35s/it, gpt_loss=0.246, loss_mean=0.289][A
+Train step of epoch 0:  64%|██████▎   | 4088/6434 [9:35:10<5:22:17,  8.24s/it, gpt_loss=0.246, loss_mean=0.289][A
+Train step of epoch 0:  64%|██████▎   | 4088/6434 [9:35:18<5:22:17,  8.24s/it, gpt_loss=0.241, loss_mean=0.284][A
+Train step of epoch 0:  64%|██████▎   | 4089/6434 [9:35:18<5:20:47,  8.21s/it, gpt_loss=0.241, loss_mean=0.284][A
+[LID Router Debug] Step: 4090
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [1, 5, 2, 5, 2, 5, 3, 9, 0, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  64%|██████▎   | 4089/6434 [9:35:26<5:20:47,  8.21s/it, gpt_loss=0.33, loss_mean=0.288] [A
+Train step of epoch 0:  64%|██████▎   | 4090/6434 [9:35:26<5:22:34,  8.26s/it, gpt_loss=0.33, loss_mean=0.288][A
+Train step of epoch 0:  64%|██████▎   | 4090/6434 [9:35:35<5:22:34,  8.26s/it, gpt_loss=0.274, loss_mean=0.287][A
+Train step of epoch 0:  64%|██████▎   | 4091/6434 [9:35:35<5:26:58,  8.37s/it, gpt_loss=0.274, loss_mean=0.287][A
+Train step of epoch 0:  64%|██████▎   | 4091/6434 [9:35:44<5:26:58,  8.37s/it, gpt_loss=0.349, loss_mean=0.293][A
+Train step of epoch 0:  64%|██████▎   | 4092/6434 [9:35:44<5:37:03,  8.64s/it, gpt_loss=0.349, loss_mean=0.293][A
+Train step of epoch 0:  64%|██████▎   | 4092/6434 [9:35:52<5:37:03,  8.64s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  64%|██████▎   | 4093/6434 [9:35:52<5:29:19,  8.44s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  64%|██████▎   | 4093/6434 [9:36:01<5:29:19,  8.44s/it, gpt_loss=0.266, loss_mean=0.287][A
+Train step of epoch 0:  64%|██████▎   | 4094/6434 [9:36:01<5:27:38,  8.40s/it, gpt_loss=0.266, loss_mean=0.287][A
+Train step of epoch 0:  64%|██████▎   | 4094/6434 [9:36:10<5:27:38,  8.40s/it, gpt_loss=0.326, loss_mean=0.291][A
+Train step of epoch 0:  64%|██████▎   | 4095/6434 [9:36:10<5:37:12,  8.65s/it, gpt_loss=0.326, loss_mean=0.291][A
+Train step of epoch 0:  64%|██████▎   | 4095/6434 [9:36:19<5:37:12,  8.65s/it, gpt_loss=0.356, loss_mean=0.297][A
+Train step of epoch 0:  64%|██████▎   | 4096/6434 [9:36:19<5:42:36,  8.79s/it, gpt_loss=0.356, loss_mean=0.297][A
+Train step of epoch 0:  64%|██████▎   | 4096/6434 [9:36:27<5:42:36,  8.79s/it, gpt_loss=0.374, loss_mean=0.305][A
+Train step of epoch 0:  64%|██████▎   | 4097/6434 [9:36:27<5:33:26,  8.56s/it, gpt_loss=0.374, loss_mean=0.305][A
+Train step of epoch 0:  64%|██████▎   | 4097/6434 [9:36:35<5:33:26,  8.56s/it, gpt_loss=0.379, loss_mean=0.313][A
+Train step of epoch 0:  64%|██████▎   | 4098/6434 [9:36:35<5:23:50,  8.32s/it, gpt_loss=0.379, loss_mean=0.313][A
+Train step of epoch 0:  64%|██████▎   | 4098/6434 [9:36:43<5:23:50,  8.32s/it, gpt_loss=0.384, loss_mean=0.32] [A
+Train step of epoch 0:  64%|██████▎   | 4099/6434 [9:36:43<5:28:52,  8.45s/it, gpt_loss=0.384, loss_mean=0.32][A
+[LID Router Debug] Step: 4100
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [9, 2, 0, 1, 5, 1, 5, 1, 5, 0]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+Train step of epoch 0:  64%|██████▎   | 4099/6434 [9:36:52<5:28:52,  8.45s/it, gpt_loss=0.27, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▎   | 4100/6434 [9:36:52<5:25:21,  8.36s/it, gpt_loss=0.27, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▎   | 4100/6434 [9:37:00<5:25:21,  8.36s/it, gpt_loss=0.264, loss_mean=0.31][A
+Train step of epoch 0:  64%|██████▎   | 4101/6434 [9:37:00<5:31:02,  8.51s/it, gpt_loss=0.264, loss_mean=0.31][A
+Train step of epoch 0:  64%|██████▎   | 4101/6434 [9:37:10<5:31:02,  8.51s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4102/6434 [9:37:10<5:37:39,  8.69s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4102/6434 [9:37:20<5:37:39,  8.69s/it, gpt_loss=0.327, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4103/6434 [9:37:20<5:53:09,  9.09s/it, gpt_loss=0.327, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4103/6434 [9:37:26<5:53:09,  9.09s/it, gpt_loss=0.291, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4104/6434 [9:37:26<5:26:24,  8.41s/it, gpt_loss=0.291, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4104/6434 [9:37:36<5:26:24,  8.41s/it, gpt_loss=0.331, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4105/6434 [9:37:36<5:36:33,  8.67s/it, gpt_loss=0.331, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4105/6434 [9:37:45<5:36:33,  8.67s/it, gpt_loss=0.288, loss_mean=0.308][A
+Train step of epoch 0:  64%|██████▍   | 4106/6434 [9:37:45<5:41:18,  8.80s/it, gpt_loss=0.288, loss_mean=0.308][A
+Train step of epoch 0:  64%|██████▍   | 4106/6434 [9:37:53<5:41:18,  8.80s/it, gpt_loss=0.309, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4107/6434 [9:37:53<5:40:11,  8.77s/it, gpt_loss=0.309, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4107/6434 [9:38:01<5:40:11,  8.77s/it, gpt_loss=0.335, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4108/6434 [9:38:01<5:30:47,  8.53s/it, gpt_loss=0.335, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4108/6434 [9:38:10<5:30:47,  8.53s/it, gpt_loss=0.324, loss_mean=0.313][A
+Train step of epoch 0:  64%|██████▍   | 4109/6434 [9:38:10<5:36:36,  8.69s/it, gpt_loss=0.324, loss_mean=0.313][A
+[LID Router Debug] Step: 4110
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [5, 5, 2, 3, 9, 9, 9, 0, 4, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  64%|██████▍   | 4109/6434 [9:38:19<5:36:36,  8.69s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4110/6434 [9:38:19<5:39:50,  8.77s/it, gpt_loss=0.361, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4110/6434 [9:38:28<5:39:50,  8.77s/it, gpt_loss=0.298, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4111/6434 [9:38:28<5:38:23,  8.74s/it, gpt_loss=0.298, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4111/6434 [9:38:36<5:38:23,  8.74s/it, gpt_loss=0.299, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4112/6434 [9:38:36<5:27:23,  8.46s/it, gpt_loss=0.299, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4112/6434 [9:38:45<5:27:23,  8.46s/it, gpt_loss=0.283, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4113/6434 [9:38:45<5:29:13,  8.51s/it, gpt_loss=0.283, loss_mean=0.311][A
+Train step of epoch 0:  64%|██████▍   | 4113/6434 [9:38:52<5:29:13,  8.51s/it, gpt_loss=0.354, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4114/6434 [9:38:52<5:22:12,  8.33s/it, gpt_loss=0.354, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4114/6434 [9:39:00<5:22:12,  8.33s/it, gpt_loss=0.303, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4115/6434 [9:39:00<5:16:22,  8.19s/it, gpt_loss=0.303, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4115/6434 [9:39:10<5:16:22,  8.19s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  64%|██████▍   | 4116/6434 [9:39:10<5:32:39,  8.61s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  64%|██████▍   | 4116/6434 [9:39:19<5:32:39,  8.61s/it, gpt_loss=0.376, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4117/6434 [9:39:19<5:36:32,  8.72s/it, gpt_loss=0.376, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4117/6434 [9:39:27<5:36:32,  8.72s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  64%|██████▍   | 4118/6434 [9:39:27<5:27:20,  8.48s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  64%|██████▍   | 4118/6434 [9:39:36<5:27:20,  8.48s/it, gpt_loss=0.328, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4119/6434 [9:39:36<5:30:34,  8.57s/it, gpt_loss=0.328, loss_mean=0.317][A
+[LID Router Debug] Step: 4120
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [8, 9, 3, 1, 4, 5, 4, 5, 2, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  64%|██████▍   | 4119/6434 [9:39:43<5:30:34,  8.57s/it, gpt_loss=0.351, loss_mean=0.32] [A
+Train step of epoch 0:  64%|██████▍   | 4120/6434 [9:39:43<5:20:57,  8.32s/it, gpt_loss=0.351, loss_mean=0.32][A
+Train step of epoch 0:  64%|██████▍   | 4120/6434 [9:39:53<5:20:57,  8.32s/it, gpt_loss=0.257, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4121/6434 [9:39:53<5:31:38,  8.60s/it, gpt_loss=0.257, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4121/6434 [9:40:01<5:31:38,  8.60s/it, gpt_loss=0.327, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4122/6434 [9:40:01<5:28:28,  8.52s/it, gpt_loss=0.327, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4122/6434 [9:40:10<5:28:28,  8.52s/it, gpt_loss=0.307, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4123/6434 [9:40:10<5:29:18,  8.55s/it, gpt_loss=0.307, loss_mean=0.314][A
+Train step of epoch 0:  64%|██████▍   | 4123/6434 [9:40:18<5:29:18,  8.55s/it, gpt_loss=0.259, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4124/6434 [9:40:18<5:28:46,  8.54s/it, gpt_loss=0.259, loss_mean=0.309][A
+Train step of epoch 0:  64%|██████▍   | 4124/6434 [9:40:27<5:28:46,  8.54s/it, gpt_loss=0.367, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4125/6434 [9:40:27<5:28:16,  8.53s/it, gpt_loss=0.367, loss_mean=0.315][A
+Train step of epoch 0:  64%|██████▍   | 4125/6434 [9:40:35<5:28:16,  8.53s/it, gpt_loss=0.395, loss_mean=0.323][A
+Train step of epoch 0:  64%|██████▍   | 4126/6434 [9:40:35<5:32:36,  8.65s/it, gpt_loss=0.395, loss_mean=0.323][A
+Train step of epoch 0:  64%|██████▍   | 4126/6434 [9:40:44<5:32:36,  8.65s/it, gpt_loss=0.457, loss_mean=0.336][A
+Train step of epoch 0:  64%|██████▍   | 4127/6434 [9:40:44<5:28:11,  8.54s/it, gpt_loss=0.457, loss_mean=0.336][A
+Train step of epoch 0:  64%|██████▍   | 4127/6434 [9:40:52<5:28:11,  8.54s/it, gpt_loss=0.299, loss_mean=0.332][A
+Train step of epoch 0:  64%|██████▍   | 4128/6434 [9:40:52<5:24:22,  8.44s/it, gpt_loss=0.299, loss_mean=0.332][A
+Train step of epoch 0:  64%|██████▍   | 4128/6434 [9:41:01<5:24:22,  8.44s/it, gpt_loss=0.343, loss_mean=0.333][A
+Train step of epoch 0:  64%|██████▍   | 4129/6434 [9:41:01<5:29:07,  8.57s/it, gpt_loss=0.343, loss_mean=0.333][A
+[LID Router Debug] Step: 4130
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [4, 4, 1, 9, 2, 0, 5, 5, 3, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  64%|██████▍   | 4129/6434 [9:41:10<5:29:07,  8.57s/it, gpt_loss=0.36, loss_mean=0.336] [A
+Train step of epoch 0:  64%|██████▍   | 4130/6434 [9:41:10<5:32:12,  8.65s/it, gpt_loss=0.36, loss_mean=0.336][A
+Train step of epoch 0:  64%|██████▍   | 4130/6434 [9:41:20<5:32:12,  8.65s/it, gpt_loss=0.242, loss_mean=0.327][A
+Train step of epoch 0:  64%|██████▍   | 4131/6434 [9:41:20<5:50:33,  9.13s/it, gpt_loss=0.242, loss_mean=0.327][A
+Train step of epoch 0:  64%|██████▍   | 4131/6434 [9:41:28<5:50:33,  9.13s/it, gpt_loss=0.35, loss_mean=0.329] [A
+Train step of epoch 0:  64%|██████▍   | 4132/6434 [9:41:28<5:41:09,  8.89s/it, gpt_loss=0.35, loss_mean=0.329][A
+Train step of epoch 0:  64%|██████▍   | 4132/6434 [9:41:37<5:41:09,  8.89s/it, gpt_loss=0.34, loss_mean=0.33] [A
+Train step of epoch 0:  64%|██████▍   | 4133/6434 [9:41:37<5:44:35,  8.99s/it, gpt_loss=0.34, loss_mean=0.33][A
+Train step of epoch 0:  64%|██████▍   | 4133/6434 [9:41:46<5:44:35,  8.99s/it, gpt_loss=0.276, loss_mean=0.325][A
+Train step of epoch 0:  64%|██████▍   | 4134/6434 [9:41:46<5:41:06,  8.90s/it, gpt_loss=0.276, loss_mean=0.325][A
+Train step of epoch 0:  64%|██████▍   | 4134/6434 [9:41:55<5:41:06,  8.90s/it, gpt_loss=0.259, loss_mean=0.318][A
+Train step of epoch 0:  64%|██████▍   | 4135/6434 [9:41:55<5:38:06,  8.82s/it, gpt_loss=0.259, loss_mean=0.318][A
+Train step of epoch 0:  64%|██████▍   | 4135/6434 [9:42:05<5:38:06,  8.82s/it, gpt_loss=0.333, loss_mean=0.32] [A
+Train step of epoch 0:  64%|██████▍   | 4136/6434 [9:42:05<5:50:57,  9.16s/it, gpt_loss=0.333, loss_mean=0.32][A
+Train step of epoch 0:  64%|██████▍   | 4136/6434 [9:42:13<5:50:57,  9.16s/it, gpt_loss=0.285, loss_mean=0.316][A
+Train step of epoch 0:  64%|██████▍   | 4137/6434 [9:42:13<5:45:36,  9.03s/it, gpt_loss=0.285, loss_mean=0.316][A
+Train step of epoch 0:  64%|██████▍   | 4137/6434 [9:42:21<5:45:36,  9.03s/it, gpt_loss=0.324, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4138/6434 [9:42:21<5:25:15,  8.50s/it, gpt_loss=0.324, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4138/6434 [9:42:30<5:25:15,  8.50s/it, gpt_loss=0.317, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4139/6434 [9:42:30<5:30:10,  8.63s/it, gpt_loss=0.317, loss_mean=0.317][A
+[LID Router Debug] Step: 4140
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 9, 0, 1, 4, 4, 3, 9, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  64%|██████▍   | 4139/6434 [9:42:37<5:30:10,  8.63s/it, gpt_loss=0.25, loss_mean=0.31]  [A
+Train step of epoch 0:  64%|██████▍   | 4140/6434 [9:42:37<5:19:36,  8.36s/it, gpt_loss=0.25, loss_mean=0.31][A
+Train step of epoch 0:  64%|██████▍   | 4140/6434 [9:42:46<5:19:36,  8.36s/it, gpt_loss=0.244, loss_mean=0.304][A
+Train step of epoch 0:  64%|██████▍   | 4141/6434 [9:42:46<5:20:20,  8.38s/it, gpt_loss=0.244, loss_mean=0.304][A
+Train step of epoch 0:  64%|██████▍   | 4141/6434 [9:42:55<5:20:20,  8.38s/it, gpt_loss=0.369, loss_mean=0.31] [A
+Train step of epoch 0:  64%|██████▍   | 4142/6434 [9:42:55<5:26:08,  8.54s/it, gpt_loss=0.369, loss_mean=0.31][A
+Train step of epoch 0:  64%|██████▍   | 4142/6434 [9:43:03<5:26:08,  8.54s/it, gpt_loss=0.332, loss_mean=0.312][A
+Train step of epoch 0:  64%|██████▍   | 4143/6434 [9:43:03<5:17:21,  8.31s/it, gpt_loss=0.332, loss_mean=0.312][A
+Train step of epoch 0:  64%|██████▍   | 4143/6434 [9:43:11<5:17:21,  8.31s/it, gpt_loss=0.222, loss_mean=0.303][A
+Train step of epoch 0:  64%|██████▍   | 4144/6434 [9:43:11<5:16:59,  8.31s/it, gpt_loss=0.222, loss_mean=0.303][A
+Train step of epoch 0:  64%|██████▍   | 4144/6434 [9:43:19<5:16:59,  8.31s/it, gpt_loss=0.436, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4145/6434 [9:43:19<5:14:29,  8.24s/it, gpt_loss=0.436, loss_mean=0.317][A
+Train step of epoch 0:  64%|██████▍   | 4145/6434 [9:43:28<5:14:29,  8.24s/it, gpt_loss=0.392, loss_mean=0.324][A
+Train step of epoch 0:  64%|██████▍   | 4146/6434 [9:43:28<5:24:43,  8.52s/it, gpt_loss=0.392, loss_mean=0.324][A
+Train step of epoch 0:  64%|██████▍   | 4146/6434 [9:43:37<5:24:43,  8.52s/it, gpt_loss=0.323, loss_mean=0.324][A
+Train step of epoch 0:  64%|██████▍   | 4147/6434 [9:43:37<5:30:47,  8.68s/it, gpt_loss=0.323, loss_mean=0.324][A
+Train step of epoch 0:  64%|██████▍   | 4147/6434 [9:43:45<5:30:47,  8.68s/it, gpt_loss=0.318, loss_mean=0.323][A
+Train step of epoch 0:  64%|██████▍   | 4148/6434 [9:43:45<5:16:51,  8.32s/it, gpt_loss=0.318, loss_mean=0.323][A
+Train step of epoch 0:  64%|██████▍   | 4148/6434 [9:43:52<5:16:51,  8.32s/it, gpt_loss=0.353, loss_mean=0.326][A
+Train step of epoch 0:  64%|██████▍   | 4149/6434 [9:43:52<5:05:44,  8.03s/it, gpt_loss=0.353, loss_mean=0.326][A
+[LID Router Debug] Step: 4150
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [3, 1, 6, 4, 3, 0, 3, 7, 0, 3]
+Active Experts in Batch: {0, 1, 3, 4, 6, 7}
+
+Train step of epoch 0:  64%|██████▍   | 4149/6434 [9:44:00<5:05:44,  8.03s/it, gpt_loss=0.246, loss_mean=0.318][A
+Train step of epoch 0:  65%|██████▍   | 4150/6434 [9:44:00<5:10:09,  8.15s/it, gpt_loss=0.246, loss_mean=0.318][A
+Train step of epoch 0:  65%|██████▍   | 4150/6434 [9:44:09<5:10:09,  8.15s/it, gpt_loss=0.23, loss_mean=0.309] [A
+Train step of epoch 0:  65%|██████▍   | 4151/6434 [9:44:09<5:11:21,  8.18s/it, gpt_loss=0.23, loss_mean=0.309][A
+Train step of epoch 0:  65%|██████▍   | 4151/6434 [9:44:18<5:11:21,  8.18s/it, gpt_loss=0.373, loss_mean=0.316][A
+Train step of epoch 0:  65%|██████▍   | 4152/6434 [9:44:18<5:22:36,  8.48s/it, gpt_loss=0.373, loss_mean=0.316][A
+Train step of epoch 0:  65%|██████▍   | 4152/6434 [9:44:26<5:22:36,  8.48s/it, gpt_loss=0.308, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▍   | 4153/6434 [9:44:26<5:18:27,  8.38s/it, gpt_loss=0.308, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▍   | 4153/6434 [9:44:35<5:18:27,  8.38s/it, gpt_loss=0.25, loss_mean=0.308] [A
+Train step of epoch 0:  65%|██████▍   | 4154/6434 [9:44:35<5:22:58,  8.50s/it, gpt_loss=0.25, loss_mean=0.308][A
+Train step of epoch 0:  65%|██████▍   | 4154/6434 [9:44:42<5:22:58,  8.50s/it, gpt_loss=0.369, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▍   | 4155/6434 [9:44:42<5:12:38,  8.23s/it, gpt_loss=0.369, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▍   | 4155/6434 [9:44:50<5:12:38,  8.23s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  65%|██████▍   | 4156/6434 [9:44:50<5:08:01,  8.11s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  65%|██████▍   | 4156/6434 [9:44:59<5:08:01,  8.11s/it, gpt_loss=0.3, loss_mean=0.314]  [A
+Train step of epoch 0:  65%|██████▍   | 4157/6434 [9:44:59<5:10:22,  8.18s/it, gpt_loss=0.3, loss_mean=0.314][A
+Train step of epoch 0:  65%|██████▍   | 4157/6434 [9:45:06<5:10:22,  8.18s/it, gpt_loss=0.278, loss_mean=0.311][A
+Train step of epoch 0:  65%|██████▍   | 4158/6434 [9:45:06<5:05:35,  8.06s/it, gpt_loss=0.278, loss_mean=0.311][A
+Train step of epoch 0:  65%|██████▍   | 4158/6434 [9:45:15<5:05:35,  8.06s/it, gpt_loss=0.275, loss_mean=0.307][A
+Train step of epoch 0:  65%|██████▍   | 4159/6434 [9:45:15<5:12:17,  8.24s/it, gpt_loss=0.275, loss_mean=0.307][A
+[LID Router Debug] Step: 4160
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [9, 1, 2, 4, 5, 4, 2, 5, 0, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  65%|██████▍   | 4159/6434 [9:45:22<5:12:17,  8.24s/it, gpt_loss=0.281, loss_mean=0.304][A
+Train step of epoch 0:  65%|██████▍   | 4160/6434 [9:45:22<5:03:10,  8.00s/it, gpt_loss=0.281, loss_mean=0.304][A
+Train step of epoch 0:  65%|██████▍   | 4160/6434 [9:45:31<5:03:10,  8.00s/it, gpt_loss=0.314, loss_mean=0.305][A
+Train step of epoch 0:  65%|██████▍   | 4161/6434 [9:45:31<5:04:21,  8.03s/it, gpt_loss=0.314, loss_mean=0.305][A
+Train step of epoch 0:  65%|██████▍   | 4161/6434 [9:45:39<5:04:21,  8.03s/it, gpt_loss=0.233, loss_mean=0.298][A
+Train step of epoch 0:  65%|██████▍   | 4162/6434 [9:45:39<5:09:48,  8.18s/it, gpt_loss=0.233, loss_mean=0.298][A
+Train step of epoch 0:  65%|██████▍   | 4162/6434 [9:45:48<5:09:48,  8.18s/it, gpt_loss=0.26, loss_mean=0.294] [A
+Train step of epoch 0:  65%|██████▍   | 4163/6434 [9:45:48<5:17:38,  8.39s/it, gpt_loss=0.26, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4163/6434 [9:45:56<5:17:38,  8.39s/it, gpt_loss=0.34, loss_mean=0.299][A
+Train step of epoch 0:  65%|██████▍   | 4164/6434 [9:45:56<5:13:39,  8.29s/it, gpt_loss=0.34, loss_mean=0.299][A
+Train step of epoch 0:  65%|██████▍   | 4164/6434 [9:46:04<5:13:39,  8.29s/it, gpt_loss=0.247, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4165/6434 [9:46:04<5:07:01,  8.12s/it, gpt_loss=0.247, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4165/6434 [9:46:12<5:07:01,  8.12s/it, gpt_loss=0.28, loss_mean=0.292] [A
+Train step of epoch 0:  65%|██████▍   | 4166/6434 [9:46:12<5:10:17,  8.21s/it, gpt_loss=0.28, loss_mean=0.292][A
+Train step of epoch 0:  65%|██████▍   | 4166/6434 [9:46:21<5:10:17,  8.21s/it, gpt_loss=0.344, loss_mean=0.297][A
+Train step of epoch 0:  65%|██████▍   | 4167/6434 [9:46:21<5:21:00,  8.50s/it, gpt_loss=0.344, loss_mean=0.297][A
+Train step of epoch 0:  65%|██████▍   | 4167/6434 [9:46:29<5:21:00,  8.50s/it, gpt_loss=0.251, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▍   | 4168/6434 [9:46:29<5:11:07,  8.24s/it, gpt_loss=0.251, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▍   | 4168/6434 [9:46:38<5:11:07,  8.24s/it, gpt_loss=0.338, loss_mean=0.297][A
+Train step of epoch 0:  65%|██████▍   | 4169/6434 [9:46:38<5:16:00,  8.37s/it, gpt_loss=0.338, loss_mean=0.297][A
+[LID Router Debug] Step: 4170
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [1, 3, 9, 2, 1, 9, 2, 5, 1, 9]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  65%|██████▍   | 4169/6434 [9:46:45<5:16:00,  8.37s/it, gpt_loss=0.264, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4170/6434 [9:46:45<5:05:59,  8.11s/it, gpt_loss=0.264, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4170/6434 [9:46:54<5:05:59,  8.11s/it, gpt_loss=0.27, loss_mean=0.292] [A
+Train step of epoch 0:  65%|██████▍   | 4171/6434 [9:46:54<5:11:23,  8.26s/it, gpt_loss=0.27, loss_mean=0.292][A
+Train step of epoch 0:  65%|██████▍   | 4171/6434 [9:47:03<5:11:23,  8.26s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▍   | 4172/6434 [9:47:03<5:19:07,  8.46s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▍   | 4172/6434 [9:47:11<5:19:07,  8.46s/it, gpt_loss=0.267, loss_mean=0.29] [A
+Train step of epoch 0:  65%|██████▍   | 4173/6434 [9:47:11<5:20:30,  8.51s/it, gpt_loss=0.267, loss_mean=0.29][A
+Train step of epoch 0:  65%|██████▍   | 4173/6434 [9:47:19<5:20:30,  8.51s/it, gpt_loss=0.325, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4174/6434 [9:47:19<5:06:18,  8.13s/it, gpt_loss=0.325, loss_mean=0.294][A
+Train step of epoch 0:  65%|██████▍   | 4174/6434 [9:47:27<5:06:18,  8.13s/it, gpt_loss=0.363, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▍   | 4175/6434 [9:47:27<5:10:52,  8.26s/it, gpt_loss=0.363, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▍   | 4175/6434 [9:47:35<5:10:52,  8.26s/it, gpt_loss=0.251, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▍   | 4176/6434 [9:47:35<5:08:27,  8.20s/it, gpt_loss=0.251, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▍   | 4176/6434 [9:47:43<5:08:27,  8.20s/it, gpt_loss=0.353, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▍   | 4177/6434 [9:47:43<5:03:56,  8.08s/it, gpt_loss=0.353, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▍   | 4177/6434 [9:47:52<5:03:56,  8.08s/it, gpt_loss=0.277, loss_mean=0.299][A
+Train step of epoch 0:  65%|██████▍   | 4178/6434 [9:47:52<5:13:35,  8.34s/it, gpt_loss=0.277, loss_mean=0.299][A
+Train step of epoch 0:  65%|██████▍   | 4178/6434 [9:48:00<5:13:35,  8.34s/it, gpt_loss=0.267, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▍   | 4179/6434 [9:48:00<5:14:14,  8.36s/it, gpt_loss=0.267, loss_mean=0.296][A
+[LID Router Debug] Step: 4180
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [5, 4, 2, 1, 1, 2, 2, 9, 4, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  65%|██████▍   | 4179/6434 [9:48:09<5:14:14,  8.36s/it, gpt_loss=0.261, loss_mean=0.292][A
+Train step of epoch 0:  65%|██████▍   | 4180/6434 [9:48:09<5:15:22,  8.40s/it, gpt_loss=0.261, loss_mean=0.292][A
+Train step of epoch 0:  65%|██████▍   | 4180/6434 [9:48:17<5:15:22,  8.40s/it, gpt_loss=0.21, loss_mean=0.284] [A
+Train step of epoch 0:  65%|██████▍   | 4181/6434 [9:48:17<5:18:14,  8.48s/it, gpt_loss=0.21, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▍   | 4181/6434 [9:48:25<5:18:14,  8.48s/it, gpt_loss=0.287, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▍   | 4182/6434 [9:48:25<5:03:35,  8.09s/it, gpt_loss=0.287, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▍   | 4182/6434 [9:48:33<5:03:35,  8.09s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4183/6434 [9:48:33<5:09:10,  8.24s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4183/6434 [9:48:42<5:09:10,  8.24s/it, gpt_loss=0.335, loss_mean=0.288][A
+Train step of epoch 0:  65%|██████▌   | 4184/6434 [9:48:42<5:12:03,  8.32s/it, gpt_loss=0.335, loss_mean=0.288][A
+Train step of epoch 0:  65%|██████▌   | 4184/6434 [9:48:50<5:12:03,  8.32s/it, gpt_loss=0.318, loss_mean=0.291][A
+Train step of epoch 0:  65%|██████▌   | 4185/6434 [9:48:50<5:10:41,  8.29s/it, gpt_loss=0.318, loss_mean=0.291][A
+Train step of epoch 0:  65%|██████▌   | 4185/6434 [9:48:58<5:10:41,  8.29s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▌   | 4186/6434 [9:48:58<5:04:25,  8.13s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  65%|██████▌   | 4186/6434 [9:49:07<5:04:25,  8.13s/it, gpt_loss=0.24, loss_mean=0.287] [A
+Train step of epoch 0:  65%|██████▌   | 4187/6434 [9:49:07<5:14:55,  8.41s/it, gpt_loss=0.24, loss_mean=0.287][A
+Train step of epoch 0:  65%|██████▌   | 4187/6434 [9:49:16<5:14:55,  8.41s/it, gpt_loss=0.257, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▌   | 4188/6434 [9:49:16<5:26:42,  8.73s/it, gpt_loss=0.257, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▌   | 4188/6434 [9:49:25<5:26:42,  8.73s/it, gpt_loss=0.281, loss_mean=0.284][A
+Train step of epoch 0:  65%|██████▌   | 4189/6434 [9:49:25<5:32:01,  8.87s/it, gpt_loss=0.281, loss_mean=0.284][A
+[LID Router Debug] Step: 4190
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [5, 9, 6, 9, 3, 4, 4, 2, 2, 3]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  65%|██████▌   | 4189/6434 [9:49:34<5:32:01,  8.87s/it, gpt_loss=0.254, loss_mean=0.281][A
+Train step of epoch 0:  65%|██████▌   | 4190/6434 [9:49:34<5:23:17,  8.64s/it, gpt_loss=0.254, loss_mean=0.281][A
+Train step of epoch 0:  65%|██████▌   | 4190/6434 [9:49:41<5:23:17,  8.64s/it, gpt_loss=0.306, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4191/6434 [9:49:41<5:10:44,  8.31s/it, gpt_loss=0.306, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4191/6434 [9:49:50<5:10:44,  8.31s/it, gpt_loss=0.273, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4192/6434 [9:49:50<5:12:07,  8.35s/it, gpt_loss=0.273, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4192/6434 [9:49:58<5:12:07,  8.35s/it, gpt_loss=0.258, loss_mean=0.28] [A
+Train step of epoch 0:  65%|██████▌   | 4193/6434 [9:49:58<5:11:31,  8.34s/it, gpt_loss=0.258, loss_mean=0.28][A
+Train step of epoch 0:  65%|██████▌   | 4193/6434 [9:50:07<5:11:31,  8.34s/it, gpt_loss=0.249, loss_mean=0.277][A
+Train step of epoch 0:  65%|██████▌   | 4194/6434 [9:50:07<5:22:14,  8.63s/it, gpt_loss=0.249, loss_mean=0.277][A
+Train step of epoch 0:  65%|██████▌   | 4194/6434 [9:50:15<5:22:14,  8.63s/it, gpt_loss=0.328, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4195/6434 [9:50:15<5:14:25,  8.43s/it, gpt_loss=0.328, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4195/6434 [9:50:24<5:14:25,  8.43s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4196/6434 [9:50:24<5:20:14,  8.59s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  65%|██████▌   | 4196/6434 [9:50:32<5:20:14,  8.59s/it, gpt_loss=0.308, loss_mean=0.286][A
+Train step of epoch 0:  65%|██████▌   | 4197/6434 [9:50:32<5:10:21,  8.32s/it, gpt_loss=0.308, loss_mean=0.286][A
+Train step of epoch 0:  65%|██████▌   | 4197/6434 [9:50:39<5:10:21,  8.32s/it, gpt_loss=0.253, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4198/6434 [9:50:39<5:02:57,  8.13s/it, gpt_loss=0.253, loss_mean=0.282][A
+Train step of epoch 0:  65%|██████▌   | 4198/6434 [9:50:47<5:02:57,  8.13s/it, gpt_loss=0.28, loss_mean=0.282] [A
+Train step of epoch 0:  65%|██████▌   | 4199/6434 [9:50:47<4:56:18,  7.95s/it, gpt_loss=0.28, loss_mean=0.282][A
+[LID Router Debug] Step: 4200
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [0, 5, 6, 2, 3, 0, 2, 1, 5, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+[2026-02-07 01:46:59,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=0, lr=[1.790827676196364e-05, 1.790827676196364e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 01:46:59,835] [INFO] [timer.py:260:stop] epoch=0/micro_step=4200/global_step=2100, RunningAvgSamplesPerSec=4.748663264799324, CurrSamplesPerSec=5.029465100324876, MemAllocated=12.54GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  65%|██████▌   | 4199/6434 [9:50:55<4:56:18,  7.95s/it, gpt_loss=0.339, loss_mean=0.288][A
+Train step of epoch 0:  65%|██████▌   | 4200/6434 [9:50:55<5:01:03,  8.09s/it, gpt_loss=0.339, loss_mean=0.288][A
+Train step of epoch 0:  65%|██████▌   | 4200/6434 [9:51:05<5:01:03,  8.09s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  65%|██████▌   | 4201/6434 [9:51:05<5:20:11,  8.60s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  65%|██████▌   | 4201/6434 [9:51:13<5:20:11,  8.60s/it, gpt_loss=0.261, loss_mean=0.285][A
+Train step of epoch 0:  65%|██████▌   | 4202/6434 [9:51:13<5:08:39,  8.30s/it, gpt_loss=0.261, loss_mean=0.285][A
+Train step of epoch 0:  65%|██████▌   | 4202/6434 [9:51:21<5:08:39,  8.30s/it, gpt_loss=0.35, loss_mean=0.291] [A
+Train step of epoch 0:  65%|██████▌   | 4203/6434 [9:51:21<5:05:41,  8.22s/it, gpt_loss=0.35, loss_mean=0.291][A
+Train step of epoch 0:  65%|██████▌   | 4203/6434 [9:51:29<5:05:41,  8.22s/it, gpt_loss=0.232, loss_mean=0.285][A
+Train step of epoch 0:  65%|██████▌   | 4204/6434 [9:51:29<5:04:59,  8.21s/it, gpt_loss=0.232, loss_mean=0.285][A
+Train step of epoch 0:  65%|██████▌   | 4204/6434 [9:51:38<5:04:59,  8.21s/it, gpt_loss=0.454, loss_mean=0.302][A
+Train step of epoch 0:  65%|██████▌   | 4205/6434 [9:51:38<5:12:13,  8.40s/it, gpt_loss=0.454, loss_mean=0.302][A
+Train step of epoch 0:  65%|██████▌   | 4205/6434 [9:51:46<5:12:13,  8.40s/it, gpt_loss=0.336, loss_mean=0.305][A
+Train step of epoch 0:  65%|██████▌   | 4206/6434 [9:51:46<5:10:42,  8.37s/it, gpt_loss=0.336, loss_mean=0.305][A
+Train step of epoch 0:  65%|██████▌   | 4206/6434 [9:51:54<5:10:42,  8.37s/it, gpt_loss=0.222, loss_mean=0.297][A
+Train step of epoch 0:  65%|██████▌   | 4207/6434 [9:51:54<5:10:35,  8.37s/it, gpt_loss=0.222, loss_mean=0.297][A
+Train step of epoch 0:  65%|██████▌   | 4207/6434 [9:52:03<5:10:35,  8.37s/it, gpt_loss=0.291, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▌   | 4208/6434 [9:52:03<5:09:52,  8.35s/it, gpt_loss=0.291, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▌   | 4208/6434 [9:52:11<5:09:52,  8.35s/it, gpt_loss=0.344, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▌   | 4209/6434 [9:52:11<5:05:06,  8.23s/it, gpt_loss=0.344, loss_mean=0.301][A
+[LID Router Debug] Step: 4210
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [6, 0, 5, 6, 2, 0, 5, 8, 2, 3]
+Active Experts in Batch: {0, 2, 3, 5, 6, 8}
+
+Train step of epoch 0:  65%|██████▌   | 4209/6434 [9:52:19<5:05:06,  8.23s/it, gpt_loss=0.301, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▌   | 4210/6434 [9:52:19<5:04:31,  8.22s/it, gpt_loss=0.301, loss_mean=0.301][A
+Train step of epoch 0:  65%|██████▌   | 4210/6434 [9:52:28<5:04:31,  8.22s/it, gpt_loss=0.245, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▌   | 4211/6434 [9:52:28<5:13:57,  8.47s/it, gpt_loss=0.245, loss_mean=0.296][A
+Train step of epoch 0:  65%|██████▌   | 4211/6434 [9:52:37<5:13:57,  8.47s/it, gpt_loss=0.379, loss_mean=0.304][A
+Train step of epoch 0:  65%|██████▌   | 4212/6434 [9:52:37<5:15:54,  8.53s/it, gpt_loss=0.379, loss_mean=0.304][A
+Train step of epoch 0:  65%|██████▌   | 4212/6434 [9:52:44<5:15:54,  8.53s/it, gpt_loss=0.417, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▌   | 4213/6434 [9:52:44<5:07:03,  8.30s/it, gpt_loss=0.417, loss_mean=0.315][A
+Train step of epoch 0:  65%|██████▌   | 4213/6434 [9:52:54<5:07:03,  8.30s/it, gpt_loss=0.26, loss_mean=0.31]  [A
+Train step of epoch 0:  65%|██████▌   | 4214/6434 [9:52:54<5:22:24,  8.71s/it, gpt_loss=0.26, loss_mean=0.31][A
+Train step of epoch 0:  65%|██████▌   | 4214/6434 [9:53:03<5:22:24,  8.71s/it, gpt_loss=0.259, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▌   | 4215/6434 [9:53:03<5:18:52,  8.62s/it, gpt_loss=0.259, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▌   | 4215/6434 [9:53:11<5:18:52,  8.62s/it, gpt_loss=0.295, loss_mean=0.304][A
+Train step of epoch 0:  66%|██████▌   | 4216/6434 [9:53:11<5:17:25,  8.59s/it, gpt_loss=0.295, loss_mean=0.304][A
+Train step of epoch 0:  66%|██████▌   | 4216/6434 [9:53:20<5:17:25,  8.59s/it, gpt_loss=0.27, loss_mean=0.3]   [A
+Train step of epoch 0:  66%|██████▌   | 4217/6434 [9:53:20<5:18:43,  8.63s/it, gpt_loss=0.27, loss_mean=0.3][A
+Train step of epoch 0:  66%|██████▌   | 4217/6434 [9:53:27<5:18:43,  8.63s/it, gpt_loss=0.304, loss_mean=0.301][A
+Train step of epoch 0:  66%|██████▌   | 4218/6434 [9:53:27<5:03:18,  8.21s/it, gpt_loss=0.304, loss_mean=0.301][A
+Train step of epoch 0:  66%|██████▌   | 4218/6434 [9:53:36<5:03:18,  8.21s/it, gpt_loss=0.368, loss_mean=0.307][A
+Train step of epoch 0:  66%|██████▌   | 4219/6434 [9:53:36<5:13:32,  8.49s/it, gpt_loss=0.368, loss_mean=0.307][A
+[LID Router Debug] Step: 4220
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [0, 2, 4, 4, 5, 1, 5, 3, 5, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  66%|██████▌   | 4219/6434 [9:53:44<5:13:32,  8.49s/it, gpt_loss=0.283, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▌   | 4220/6434 [9:53:44<5:08:28,  8.36s/it, gpt_loss=0.283, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▌   | 4220/6434 [9:53:53<5:08:28,  8.36s/it, gpt_loss=0.315, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▌   | 4221/6434 [9:53:53<5:17:07,  8.60s/it, gpt_loss=0.315, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▌   | 4221/6434 [9:54:02<5:17:07,  8.60s/it, gpt_loss=0.216, loss_mean=0.297][A
+Train step of epoch 0:  66%|██████▌   | 4222/6434 [9:54:02<5:18:30,  8.64s/it, gpt_loss=0.216, loss_mean=0.297][A
+Train step of epoch 0:  66%|██████▌   | 4222/6434 [9:54:10<5:18:30,  8.64s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4223/6434 [9:54:10<5:09:09,  8.39s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4223/6434 [9:54:18<5:09:09,  8.39s/it, gpt_loss=0.317, loss_mean=0.297][A
+Train step of epoch 0:  66%|██████▌   | 4224/6434 [9:54:18<5:03:04,  8.23s/it, gpt_loss=0.317, loss_mean=0.297][A
+Train step of epoch 0:  66%|██████▌   | 4224/6434 [9:54:27<5:03:04,  8.23s/it, gpt_loss=0.273, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4225/6434 [9:54:27<5:11:25,  8.46s/it, gpt_loss=0.273, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4225/6434 [9:54:34<5:11:25,  8.46s/it, gpt_loss=0.21, loss_mean=0.286] [A
+Train step of epoch 0:  66%|██████▌   | 4226/6434 [9:54:34<4:58:07,  8.10s/it, gpt_loss=0.21, loss_mean=0.286][A
+Train step of epoch 0:  66%|██████▌   | 4226/6434 [9:54:42<4:58:07,  8.10s/it, gpt_loss=0.302, loss_mean=0.288][A
+Train step of epoch 0:  66%|██████▌   | 4227/6434 [9:54:42<4:58:04,  8.10s/it, gpt_loss=0.302, loss_mean=0.288][A
+Train step of epoch 0:  66%|██████▌   | 4227/6434 [9:54:50<4:58:04,  8.10s/it, gpt_loss=0.258, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4228/6434 [9:54:50<4:55:50,  8.05s/it, gpt_loss=0.258, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4228/6434 [9:55:00<4:55:50,  8.05s/it, gpt_loss=0.244, loss_mean=0.281][A
+Train step of epoch 0:  66%|██████▌   | 4229/6434 [9:55:00<5:11:51,  8.49s/it, gpt_loss=0.244, loss_mean=0.281][A
+[LID Router Debug] Step: 4230
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 9, 2, 1, 5, 3, 4, 5, 6, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  66%|██████▌   | 4229/6434 [9:55:10<5:11:51,  8.49s/it, gpt_loss=0.35, loss_mean=0.288] [A
+Train step of epoch 0:  66%|██████▌   | 4230/6434 [9:55:10<5:37:51,  9.20s/it, gpt_loss=0.35, loss_mean=0.288][A
+Train step of epoch 0:  66%|██████▌   | 4230/6434 [9:55:18<5:37:51,  9.20s/it, gpt_loss=0.261, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4231/6434 [9:55:18<5:25:31,  8.87s/it, gpt_loss=0.261, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4231/6434 [9:55:29<5:25:31,  8.87s/it, gpt_loss=0.328, loss_mean=0.289][A
+Train step of epoch 0:  66%|██████▌   | 4232/6434 [9:55:29<5:40:00,  9.26s/it, gpt_loss=0.328, loss_mean=0.289][A
+Train step of epoch 0:  66%|██████▌   | 4232/6434 [9:55:37<5:40:00,  9.26s/it, gpt_loss=0.354, loss_mean=0.296][A
+Train step of epoch 0:  66%|██████▌   | 4233/6434 [9:55:37<5:28:31,  8.96s/it, gpt_loss=0.354, loss_mean=0.296][A
+Train step of epoch 0:  66%|██████▌   | 4233/6434 [9:55:46<5:28:31,  8.96s/it, gpt_loss=0.222, loss_mean=0.288][A
+Train step of epoch 0:  66%|██████▌   | 4234/6434 [9:55:46<5:31:03,  9.03s/it, gpt_loss=0.222, loss_mean=0.288][A
+Train step of epoch 0:  66%|██████▌   | 4234/6434 [9:55:55<5:31:03,  9.03s/it, gpt_loss=0.252, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4235/6434 [9:55:55<5:24:15,  8.85s/it, gpt_loss=0.252, loss_mean=0.285][A
+Train step of epoch 0:  66%|██████▌   | 4235/6434 [9:56:03<5:24:15,  8.85s/it, gpt_loss=0.387, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4236/6434 [9:56:03<5:14:30,  8.59s/it, gpt_loss=0.387, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4236/6434 [9:56:11<5:14:30,  8.59s/it, gpt_loss=0.247, loss_mean=0.29] [A
+Train step of epoch 0:  66%|██████▌   | 4237/6434 [9:56:11<5:15:10,  8.61s/it, gpt_loss=0.247, loss_mean=0.29][A
+Train step of epoch 0:  66%|██████▌   | 4237/6434 [9:56:20<5:15:10,  8.61s/it, gpt_loss=0.294, loss_mean=0.291][A
+Train step of epoch 0:  66%|██████▌   | 4238/6434 [9:56:20<5:13:35,  8.57s/it, gpt_loss=0.294, loss_mean=0.291][A
+Train step of epoch 0:  66%|██████▌   | 4238/6434 [9:56:28<5:13:35,  8.57s/it, gpt_loss=0.29, loss_mean=0.291] [A
+Train step of epoch 0:  66%|██████▌   | 4239/6434 [9:56:28<5:07:09,  8.40s/it, gpt_loss=0.29, loss_mean=0.291][A
+[LID Router Debug] Step: 4240
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 0, 1, 3, 1, 9, 2, 4, 2, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  66%|██████▌   | 4239/6434 [9:56:36<5:07:09,  8.40s/it, gpt_loss=0.211, loss_mean=0.283][A
+Train step of epoch 0:  66%|██████▌   | 4240/6434 [9:56:36<5:09:43,  8.47s/it, gpt_loss=0.211, loss_mean=0.283][A
+Train step of epoch 0:  66%|██████▌   | 4240/6434 [9:56:45<5:09:43,  8.47s/it, gpt_loss=0.324, loss_mean=0.287][A
+Train step of epoch 0:  66%|██████▌   | 4241/6434 [9:56:45<5:07:18,  8.41s/it, gpt_loss=0.324, loss_mean=0.287][A
+Train step of epoch 0:  66%|██████▌   | 4241/6434 [9:56:53<5:07:18,  8.41s/it, gpt_loss=0.258, loss_mean=0.284][A
+Train step of epoch 0:  66%|██████▌   | 4242/6434 [9:56:53<5:11:46,  8.53s/it, gpt_loss=0.258, loss_mean=0.284][A
+Train step of epoch 0:  66%|██████▌   | 4242/6434 [9:57:02<5:11:46,  8.53s/it, gpt_loss=0.197, loss_mean=0.275][A
+Train step of epoch 0:  66%|██████▌   | 4243/6434 [9:57:02<5:17:55,  8.71s/it, gpt_loss=0.197, loss_mean=0.275][A
+Train step of epoch 0:  66%|██████▌   | 4243/6434 [9:57:11<5:17:55,  8.71s/it, gpt_loss=0.247, loss_mean=0.272][A
+Train step of epoch 0:  66%|██████▌   | 4244/6434 [9:57:11<5:13:29,  8.59s/it, gpt_loss=0.247, loss_mean=0.272][A
+Train step of epoch 0:  66%|██████▌   | 4244/6434 [9:57:19<5:13:29,  8.59s/it, gpt_loss=0.296, loss_mean=0.275][A
+Train step of epoch 0:  66%|██████▌   | 4245/6434 [9:57:19<5:11:31,  8.54s/it, gpt_loss=0.296, loss_mean=0.275][A
+Train step of epoch 0:  66%|██████▌   | 4245/6434 [9:57:29<5:11:31,  8.54s/it, gpt_loss=0.236, loss_mean=0.271][A
+Train step of epoch 0:  66%|██████▌   | 4246/6434 [9:57:29<5:28:49,  9.02s/it, gpt_loss=0.236, loss_mean=0.271][A
+Train step of epoch 0:  66%|██████▌   | 4246/6434 [9:57:38<5:28:49,  9.02s/it, gpt_loss=0.338, loss_mean=0.277][A
+Train step of epoch 0:  66%|██████▌   | 4247/6434 [9:57:38<5:19:31,  8.77s/it, gpt_loss=0.338, loss_mean=0.277][A
+Train step of epoch 0:  66%|██████▌   | 4247/6434 [9:57:46<5:19:31,  8.77s/it, gpt_loss=0.276, loss_mean=0.277][A
+Train step of epoch 0:  66%|██████▌   | 4248/6434 [9:57:46<5:11:49,  8.56s/it, gpt_loss=0.276, loss_mean=0.277][A
+Train step of epoch 0:  66%|██████▌   | 4248/6434 [9:57:55<5:11:49,  8.56s/it, gpt_loss=0.294, loss_mean=0.279][A
+Train step of epoch 0:  66%|██████▌   | 4249/6434 [9:57:55<5:20:01,  8.79s/it, gpt_loss=0.294, loss_mean=0.279][A
+[LID Router Debug] Step: 4250
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [1, 9, 2, 2, 10, 7, 0, 2, 4, 0]
+Active Experts in Batch: {0, 1, 2, 4, 7, 9, 10}
+
+Train step of epoch 0:  66%|██████▌   | 4249/6434 [9:58:04<5:20:01,  8.79s/it, gpt_loss=0.437, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4250/6434 [9:58:04<5:19:46,  8.78s/it, gpt_loss=0.437, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4250/6434 [9:58:11<5:19:46,  8.78s/it, gpt_loss=0.352, loss_mean=0.3]  [A
+Train step of epoch 0:  66%|██████▌   | 4251/6434 [9:58:11<5:05:40,  8.40s/it, gpt_loss=0.352, loss_mean=0.3][A
+Train step of epoch 0:  66%|██████▌   | 4251/6434 [9:58:19<5:05:40,  8.40s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4252/6434 [9:58:19<4:58:00,  8.19s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4252/6434 [9:58:27<4:58:00,  8.19s/it, gpt_loss=0.319, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▌   | 4253/6434 [9:58:27<4:55:32,  8.13s/it, gpt_loss=0.319, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▌   | 4253/6434 [9:58:35<4:55:32,  8.13s/it, gpt_loss=0.254, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4254/6434 [9:58:35<4:58:52,  8.23s/it, gpt_loss=0.254, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4254/6434 [9:58:45<4:58:52,  8.23s/it, gpt_loss=0.345, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▌   | 4255/6434 [9:58:45<5:11:53,  8.59s/it, gpt_loss=0.345, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▌   | 4255/6434 [9:58:54<5:11:53,  8.59s/it, gpt_loss=0.376, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▌   | 4256/6434 [9:58:54<5:23:01,  8.90s/it, gpt_loss=0.376, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▌   | 4256/6434 [9:59:03<5:23:01,  8.90s/it, gpt_loss=0.335, loss_mean=0.309][A
+Train step of epoch 0:  66%|██████▌   | 4257/6434 [9:59:03<5:21:46,  8.87s/it, gpt_loss=0.335, loss_mean=0.309][A
+Train step of epoch 0:  66%|██████▌   | 4257/6434 [9:59:12<5:21:46,  8.87s/it, gpt_loss=0.261, loss_mean=0.304][A
+Train step of epoch 0:  66%|██████▌   | 4258/6434 [9:59:12<5:23:40,  8.92s/it, gpt_loss=0.261, loss_mean=0.304][A
+Train step of epoch 0:  66%|██████▌   | 4258/6434 [9:59:21<5:23:40,  8.92s/it, gpt_loss=0.268, loss_mean=0.301][A
+Train step of epoch 0:  66%|██████▌   | 4259/6434 [9:59:21<5:16:06,  8.72s/it, gpt_loss=0.268, loss_mean=0.301][A
+[LID Router Debug] Step: 4260
+Batch Size: 10
+Audio Batch Size: 137
+LID Assignments: [1, 5, 8, 9, 4, 2, 3, 5, 2, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  66%|██████▌   | 4259/6434 [9:59:29<5:16:06,  8.72s/it, gpt_loss=0.245, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4260/6434 [9:59:29<5:15:36,  8.71s/it, gpt_loss=0.245, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▌   | 4260/6434 [9:59:38<5:15:36,  8.71s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4261/6434 [9:59:38<5:17:33,  8.77s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4261/6434 [9:59:48<5:17:33,  8.77s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4262/6434 [9:59:48<5:24:56,  8.98s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▌   | 4262/6434 [9:59:55<5:24:56,  8.98s/it, gpt_loss=0.284, loss_mean=0.292][A
+Train step of epoch 0:  66%|██████▋   | 4263/6434 [9:59:55<5:13:02,  8.65s/it, gpt_loss=0.284, loss_mean=0.292][A
+Train step of epoch 0:  66%|██████▋   | 4263/6434 [10:00:03<5:13:02,  8.65s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▋   | 4264/6434 [10:00:03<5:03:09,  8.38s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  66%|██████▋   | 4264/6434 [10:00:10<5:03:09,  8.38s/it, gpt_loss=0.379, loss_mean=0.302][A
+Train step of epoch 0:  66%|██████▋   | 4265/6434 [10:00:10<4:48:02,  7.97s/it, gpt_loss=0.379, loss_mean=0.302][A
+Train step of epoch 0:  66%|██████▋   | 4265/6434 [10:00:19<4:48:02,  7.97s/it, gpt_loss=0.288, loss_mean=0.3]  [A
+Train step of epoch 0:  66%|██████▋   | 4266/6434 [10:00:19<4:59:26,  8.29s/it, gpt_loss=0.288, loss_mean=0.3][A
+Train step of epoch 0:  66%|██████▋   | 4266/6434 [10:00:27<4:59:26,  8.29s/it, gpt_loss=0.348, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4267/6434 [10:00:27<4:56:32,  8.21s/it, gpt_loss=0.348, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4267/6434 [10:00:36<4:56:32,  8.21s/it, gpt_loss=0.302, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4268/6434 [10:00:36<5:03:33,  8.41s/it, gpt_loss=0.302, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4268/6434 [10:00:45<5:03:33,  8.41s/it, gpt_loss=0.307, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4269/6434 [10:00:45<5:12:49,  8.67s/it, gpt_loss=0.307, loss_mean=0.305][A
+[LID Router Debug] Step: 4270
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 3, 5, 1, 8, 2, 9, 0, 6, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 8, 9}
+
+Train step of epoch 0:  66%|██████▋   | 4269/6434 [10:00:55<5:12:49,  8.67s/it, gpt_loss=0.202, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▋   | 4270/6434 [10:00:55<5:19:32,  8.86s/it, gpt_loss=0.202, loss_mean=0.295][A
+Train step of epoch 0:  66%|██████▋   | 4270/6434 [10:01:03<5:19:32,  8.86s/it, gpt_loss=0.323, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▋   | 4271/6434 [10:01:03<5:17:53,  8.82s/it, gpt_loss=0.323, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▋   | 4271/6434 [10:01:11<5:17:53,  8.82s/it, gpt_loss=0.231, loss_mean=0.291][A
+Train step of epoch 0:  66%|██████▋   | 4272/6434 [10:01:11<5:08:29,  8.56s/it, gpt_loss=0.231, loss_mean=0.291][A
+Train step of epoch 0:  66%|██████▋   | 4272/6434 [10:01:21<5:08:29,  8.56s/it, gpt_loss=0.359, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▋   | 4273/6434 [10:01:21<5:18:59,  8.86s/it, gpt_loss=0.359, loss_mean=0.298][A
+Train step of epoch 0:  66%|██████▋   | 4273/6434 [10:01:29<5:18:59,  8.86s/it, gpt_loss=0.314, loss_mean=0.299][A
+Train step of epoch 0:  66%|██████▋   | 4274/6434 [10:01:29<5:11:04,  8.64s/it, gpt_loss=0.314, loss_mean=0.299][A
+Train step of epoch 0:  66%|██████▋   | 4274/6434 [10:01:37<5:11:04,  8.64s/it, gpt_loss=0.384, loss_mean=0.308][A
+Train step of epoch 0:  66%|██████▋   | 4275/6434 [10:01:37<5:03:13,  8.43s/it, gpt_loss=0.384, loss_mean=0.308][A
+Train step of epoch 0:  66%|██████▋   | 4275/6434 [10:01:47<5:03:13,  8.43s/it, gpt_loss=0.288, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▋   | 4276/6434 [10:01:47<5:15:08,  8.76s/it, gpt_loss=0.288, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▋   | 4276/6434 [10:01:55<5:15:08,  8.76s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▋   | 4277/6434 [10:01:55<5:11:54,  8.68s/it, gpt_loss=0.303, loss_mean=0.306][A
+Train step of epoch 0:  66%|██████▋   | 4277/6434 [10:02:03<5:11:54,  8.68s/it, gpt_loss=0.297, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4278/6434 [10:02:03<5:02:07,  8.41s/it, gpt_loss=0.297, loss_mean=0.305][A
+Train step of epoch 0:  66%|██████▋   | 4278/6434 [10:02:11<5:02:07,  8.41s/it, gpt_loss=0.277, loss_mean=0.302][A
+Train step of epoch 0:  67%|██████▋   | 4279/6434 [10:02:11<5:01:59,  8.41s/it, gpt_loss=0.277, loss_mean=0.302][A
+[LID Router Debug] Step: 4280
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [5, 9, 6, 3, 4, 5, 9, 9, 5, 3]
+Active Experts in Batch: {3, 4, 5, 6, 9}
+
+Train step of epoch 0:  67%|██████▋   | 4279/6434 [10:02:20<5:01:59,  8.41s/it, gpt_loss=0.339, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4280/6434 [10:02:20<5:10:07,  8.64s/it, gpt_loss=0.339, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4280/6434 [10:02:30<5:10:07,  8.64s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4281/6434 [10:02:30<5:17:38,  8.85s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4281/6434 [10:02:39<5:17:38,  8.85s/it, gpt_loss=0.307, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4282/6434 [10:02:39<5:26:49,  9.11s/it, gpt_loss=0.307, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4282/6434 [10:02:47<5:26:49,  9.11s/it, gpt_loss=0.398, loss_mean=0.317][A
+Train step of epoch 0:  67%|██████▋   | 4283/6434 [10:02:47<5:07:23,  8.57s/it, gpt_loss=0.398, loss_mean=0.317][A
+Train step of epoch 0:  67%|██████▋   | 4283/6434 [10:02:55<5:07:23,  8.57s/it, gpt_loss=0.292, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4284/6434 [10:02:55<5:03:37,  8.47s/it, gpt_loss=0.292, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4284/6434 [10:03:04<5:03:37,  8.47s/it, gpt_loss=0.291, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4285/6434 [10:03:04<5:03:51,  8.48s/it, gpt_loss=0.291, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4285/6434 [10:03:12<5:03:51,  8.48s/it, gpt_loss=0.278, loss_mean=0.308][A
+Train step of epoch 0:  67%|██████▋   | 4286/6434 [10:03:12<5:04:00,  8.49s/it, gpt_loss=0.278, loss_mean=0.308][A
+Train step of epoch 0:  67%|██████▋   | 4286/6434 [10:03:21<5:04:00,  8.49s/it, gpt_loss=0.285, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4287/6434 [10:03:21<5:06:37,  8.57s/it, gpt_loss=0.285, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4287/6434 [10:03:30<5:06:37,  8.57s/it, gpt_loss=0.293, loss_mean=0.305][A
+Train step of epoch 0:  67%|██████▋   | 4288/6434 [10:03:30<5:13:33,  8.77s/it, gpt_loss=0.293, loss_mean=0.305][A
+Train step of epoch 0:  67%|██████▋   | 4288/6434 [10:03:38<5:13:33,  8.77s/it, gpt_loss=0.316, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4289/6434 [10:03:38<5:03:23,  8.49s/it, gpt_loss=0.316, loss_mean=0.306][A
+[LID Router Debug] Step: 4290
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [9, 2, 0, 2, 9, 3, 2, 0, 0, 3]
+Active Experts in Batch: {0, 9, 2, 3}
+
+Train step of epoch 0:  67%|██████▋   | 4289/6434 [10:03:46<5:03:23,  8.49s/it, gpt_loss=0.267, loss_mean=0.302][A
+Train step of epoch 0:  67%|██████▋   | 4290/6434 [10:03:46<5:04:50,  8.53s/it, gpt_loss=0.267, loss_mean=0.302][A
+Train step of epoch 0:  67%|██████▋   | 4290/6434 [10:03:56<5:04:50,  8.53s/it, gpt_loss=0.276, loss_mean=0.299][A
+Train step of epoch 0:  67%|██████▋   | 4291/6434 [10:03:56<5:11:01,  8.71s/it, gpt_loss=0.276, loss_mean=0.299][A
+Train step of epoch 0:  67%|██████▋   | 4291/6434 [10:04:05<5:11:01,  8.71s/it, gpt_loss=0.332, loss_mean=0.303][A
+Train step of epoch 0:  67%|██████▋   | 4292/6434 [10:04:05<5:17:37,  8.90s/it, gpt_loss=0.332, loss_mean=0.303][A
+Train step of epoch 0:  67%|██████▋   | 4292/6434 [10:04:14<5:17:37,  8.90s/it, gpt_loss=0.234, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4293/6434 [10:04:14<5:15:56,  8.85s/it, gpt_loss=0.234, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4293/6434 [10:04:24<5:15:56,  8.85s/it, gpt_loss=0.278, loss_mean=0.294][A
+Train step of epoch 0:  67%|██████▋   | 4294/6434 [10:04:24<5:29:46,  9.25s/it, gpt_loss=0.278, loss_mean=0.294][A
+Train step of epoch 0:  67%|██████▋   | 4294/6434 [10:04:31<5:29:46,  9.25s/it, gpt_loss=0.342, loss_mean=0.299][A
+Train step of epoch 0:  67%|██████▋   | 4295/6434 [10:04:31<5:11:23,  8.73s/it, gpt_loss=0.342, loss_mean=0.299][A
+Train step of epoch 0:  67%|██████▋   | 4295/6434 [10:04:40<5:11:23,  8.73s/it, gpt_loss=0.313, loss_mean=0.3]  [A
+Train step of epoch 0:  67%|██████▋   | 4296/6434 [10:04:40<5:12:22,  8.77s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  67%|██████▋   | 4296/6434 [10:04:48<5:12:22,  8.77s/it, gpt_loss=0.25, loss_mean=0.295][A
+Train step of epoch 0:  67%|██████▋   | 4297/6434 [10:04:48<5:05:58,  8.59s/it, gpt_loss=0.25, loss_mean=0.295][A
+Train step of epoch 0:  67%|██████▋   | 4297/6434 [10:04:58<5:05:58,  8.59s/it, gpt_loss=0.257, loss_mean=0.291][A
+Train step of epoch 0:  67%|██████▋   | 4298/6434 [10:04:58<5:19:42,  8.98s/it, gpt_loss=0.257, loss_mean=0.291][A
+Train step of epoch 0:  67%|██████▋   | 4298/6434 [10:05:07<5:19:42,  8.98s/it, gpt_loss=0.271, loss_mean=0.289][A
+Train step of epoch 0:  67%|██████▋   | 4299/6434 [10:05:07<5:16:18,  8.89s/it, gpt_loss=0.271, loss_mean=0.289][A
+[LID Router Debug] Step: 4300
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [6, 0, 2, 4, 1, 1, 5, 2, 2, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  67%|██████▋   | 4299/6434 [10:05:16<5:16:18,  8.89s/it, gpt_loss=0.289, loss_mean=0.289][A
+Train step of epoch 0:  67%|██████▋   | 4300/6434 [10:05:16<5:18:05,  8.94s/it, gpt_loss=0.289, loss_mean=0.289][A
+Train step of epoch 0:  67%|██████▋   | 4300/6434 [10:05:25<5:18:05,  8.94s/it, gpt_loss=0.323, loss_mean=0.293][A
+Train step of epoch 0:  67%|██████▋   | 4301/6434 [10:05:25<5:13:58,  8.83s/it, gpt_loss=0.323, loss_mean=0.293][A
+Train step of epoch 0:  67%|██████▋   | 4301/6434 [10:05:33<5:13:58,  8.83s/it, gpt_loss=0.27, loss_mean=0.29]  [A
+Train step of epoch 0:  67%|██████▋   | 4302/6434 [10:05:33<5:07:25,  8.65s/it, gpt_loss=0.27, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4302/6434 [10:05:41<5:07:25,  8.65s/it, gpt_loss=0.346, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4303/6434 [10:05:41<5:06:45,  8.64s/it, gpt_loss=0.346, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4303/6434 [10:05:50<5:06:45,  8.64s/it, gpt_loss=0.239, loss_mean=0.29] [A
+Train step of epoch 0:  67%|██████▋   | 4304/6434 [10:05:50<5:06:20,  8.63s/it, gpt_loss=0.239, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4304/6434 [10:05:58<5:06:20,  8.63s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  67%|██████▋   | 4305/6434 [10:05:58<4:54:54,  8.31s/it, gpt_loss=0.276, loss_mean=0.289][A
+Train step of epoch 0:  67%|██████▋   | 4305/6434 [10:06:06<4:54:54,  8.31s/it, gpt_loss=0.397, loss_mean=0.3]  [A
+Train step of epoch 0:  67%|██████▋   | 4306/6434 [10:06:06<4:57:25,  8.39s/it, gpt_loss=0.397, loss_mean=0.3][A
+Train step of epoch 0:  67%|██████▋   | 4306/6434 [10:06:15<4:57:25,  8.39s/it, gpt_loss=0.347, loss_mean=0.304][A
+Train step of epoch 0:  67%|██████▋   | 4307/6434 [10:06:15<5:06:16,  8.64s/it, gpt_loss=0.347, loss_mean=0.304][A
+Train step of epoch 0:  67%|██████▋   | 4307/6434 [10:06:23<5:06:16,  8.64s/it, gpt_loss=0.3, loss_mean=0.304]  [A
+Train step of epoch 0:  67%|██████▋   | 4308/6434 [10:06:23<4:51:31,  8.23s/it, gpt_loss=0.3, loss_mean=0.304][A
+Train step of epoch 0:  67%|██████▋   | 4308/6434 [10:06:32<4:51:31,  8.23s/it, gpt_loss=0.257, loss_mean=0.299][A
+Train step of epoch 0:  67%|██████▋   | 4309/6434 [10:06:32<5:04:24,  8.59s/it, gpt_loss=0.257, loss_mean=0.299][A
+[LID Router Debug] Step: 4310
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [3, 1, 4, 0, 1, 0, 1, 1, 9, 0]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:  67%|██████▋   | 4309/6434 [10:06:41<5:04:24,  8.59s/it, gpt_loss=0.272, loss_mean=0.297][A
+Train step of epoch 0:  67%|██████▋   | 4310/6434 [10:06:41<5:03:28,  8.57s/it, gpt_loss=0.272, loss_mean=0.297][A
+Train step of epoch 0:  67%|██████▋   | 4310/6434 [10:06:49<5:03:28,  8.57s/it, gpt_loss=0.306, loss_mean=0.297][A
+Train step of epoch 0:  67%|██████▋   | 4311/6434 [10:06:49<4:58:30,  8.44s/it, gpt_loss=0.306, loss_mean=0.297][A
+Train step of epoch 0:  67%|██████▋   | 4311/6434 [10:06:57<4:58:30,  8.44s/it, gpt_loss=0.304, loss_mean=0.298][A
+Train step of epoch 0:  67%|██████▋   | 4312/6434 [10:06:57<4:58:49,  8.45s/it, gpt_loss=0.304, loss_mean=0.298][A
+Train step of epoch 0:  67%|██████▋   | 4312/6434 [10:07:06<4:58:49,  8.45s/it, gpt_loss=0.26, loss_mean=0.294] [A
+Train step of epoch 0:  67%|██████▋   | 4313/6434 [10:07:06<5:01:11,  8.52s/it, gpt_loss=0.26, loss_mean=0.294][A
+Train step of epoch 0:  67%|██████▋   | 4313/6434 [10:07:14<5:01:11,  8.52s/it, gpt_loss=0.274, loss_mean=0.292][A
+Train step of epoch 0:  67%|██████▋   | 4314/6434 [10:07:14<4:59:29,  8.48s/it, gpt_loss=0.274, loss_mean=0.292][A
+Train step of epoch 0:  67%|██████▋   | 4314/6434 [10:07:23<4:59:29,  8.48s/it, gpt_loss=0.274, loss_mean=0.29] [A
+Train step of epoch 0:  67%|██████▋   | 4315/6434 [10:07:23<4:58:56,  8.46s/it, gpt_loss=0.274, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4315/6434 [10:07:31<4:58:56,  8.46s/it, gpt_loss=0.288, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4316/6434 [10:07:31<4:53:27,  8.31s/it, gpt_loss=0.288, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4316/6434 [10:07:38<4:53:27,  8.31s/it, gpt_loss=0.287, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4317/6434 [10:07:38<4:46:46,  8.13s/it, gpt_loss=0.287, loss_mean=0.29][A
+Train step of epoch 0:  67%|██████▋   | 4317/6434 [10:07:46<4:46:46,  8.13s/it, gpt_loss=0.356, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4318/6434 [10:07:46<4:43:47,  8.05s/it, gpt_loss=0.356, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4318/6434 [10:07:55<4:43:47,  8.05s/it, gpt_loss=0.341, loss_mean=0.301][A
+Train step of epoch 0:  67%|██████▋   | 4319/6434 [10:07:55<4:52:33,  8.30s/it, gpt_loss=0.341, loss_mean=0.301][A
+[LID Router Debug] Step: 4320
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [0, 5, 5, 4, 5, 3, 10, 5, 2, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9, 10}
+
+Train step of epoch 0:  67%|██████▋   | 4319/6434 [10:08:05<4:52:33,  8.30s/it, gpt_loss=0.377, loss_mean=0.309][A
+Train step of epoch 0:  67%|██████▋   | 4320/6434 [10:08:05<5:03:22,  8.61s/it, gpt_loss=0.377, loss_mean=0.309][A
+Train step of epoch 0:  67%|██████▋   | 4320/6434 [10:08:14<5:03:22,  8.61s/it, gpt_loss=0.371, loss_mean=0.315][A
+Train step of epoch 0:  67%|██████▋   | 4321/6434 [10:08:14<5:07:38,  8.74s/it, gpt_loss=0.371, loss_mean=0.315][A
+Train step of epoch 0:  67%|██████▋   | 4321/6434 [10:08:23<5:07:38,  8.74s/it, gpt_loss=0.309, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4322/6434 [10:08:23<5:10:08,  8.81s/it, gpt_loss=0.309, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4322/6434 [10:08:31<5:10:08,  8.81s/it, gpt_loss=0.246, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4323/6434 [10:08:31<5:11:20,  8.85s/it, gpt_loss=0.246, loss_mean=0.307][A
+Train step of epoch 0:  67%|██████▋   | 4323/6434 [10:08:40<5:11:20,  8.85s/it, gpt_loss=0.291, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4324/6434 [10:08:40<5:05:25,  8.69s/it, gpt_loss=0.291, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4324/6434 [10:08:47<5:05:25,  8.69s/it, gpt_loss=0.297, loss_mean=0.305][A
+Train step of epoch 0:  67%|██████▋   | 4325/6434 [10:08:47<4:51:17,  8.29s/it, gpt_loss=0.297, loss_mean=0.305][A
+Train step of epoch 0:  67%|██████▋   | 4325/6434 [10:08:56<4:51:17,  8.29s/it, gpt_loss=0.214, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4326/6434 [10:08:56<4:56:44,  8.45s/it, gpt_loss=0.214, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4326/6434 [10:09:04<4:56:44,  8.45s/it, gpt_loss=0.293, loss_mean=0.295][A
+Train step of epoch 0:  67%|██████▋   | 4327/6434 [10:09:04<4:52:10,  8.32s/it, gpt_loss=0.293, loss_mean=0.295][A
+Train step of epoch 0:  67%|██████▋   | 4327/6434 [10:09:12<4:52:10,  8.32s/it, gpt_loss=0.297, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4328/6434 [10:09:12<4:51:40,  8.31s/it, gpt_loss=0.297, loss_mean=0.296][A
+Train step of epoch 0:  67%|██████▋   | 4328/6434 [10:09:21<4:51:40,  8.31s/it, gpt_loss=0.267, loss_mean=0.293][A
+Train step of epoch 0:  67%|██████▋   | 4329/6434 [10:09:21<4:52:33,  8.34s/it, gpt_loss=0.267, loss_mean=0.293][A
+[LID Router Debug] Step: 4330
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [0, 4, 9, 0, 0, 0, 0, 5, 6, 2]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  67%|██████▋   | 4329/6434 [10:09:28<4:52:33,  8.34s/it, gpt_loss=0.383, loss_mean=0.302][A
+Train step of epoch 0:  67%|██████▋   | 4330/6434 [10:09:28<4:45:48,  8.15s/it, gpt_loss=0.383, loss_mean=0.302][A
+Train step of epoch 0:  67%|██████▋   | 4330/6434 [10:09:36<4:45:48,  8.15s/it, gpt_loss=0.347, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4331/6434 [10:09:36<4:42:06,  8.05s/it, gpt_loss=0.347, loss_mean=0.306][A
+Train step of epoch 0:  67%|██████▋   | 4331/6434 [10:09:44<4:42:06,  8.05s/it, gpt_loss=0.345, loss_mean=0.31] [A
+Train step of epoch 0:  67%|██████▋   | 4332/6434 [10:09:44<4:41:22,  8.03s/it, gpt_loss=0.345, loss_mean=0.31][A
+Train step of epoch 0:  67%|██████▋   | 4332/6434 [10:09:52<4:41:22,  8.03s/it, gpt_loss=0.35, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4333/6434 [10:09:52<4:38:03,  7.94s/it, gpt_loss=0.35, loss_mean=0.314][A
+Train step of epoch 0:  67%|██████▋   | 4333/6434 [10:10:00<4:38:03,  7.94s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  67%|██████▋   | 4334/6434 [10:10:00<4:38:31,  7.96s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  67%|██████▋   | 4334/6434 [10:10:09<4:38:31,  7.96s/it, gpt_loss=0.322, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4335/6434 [10:10:09<4:45:43,  8.17s/it, gpt_loss=0.322, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4335/6434 [10:10:18<4:45:43,  8.17s/it, gpt_loss=0.289, loss_mean=0.31] [A
+Train step of epoch 0:  67%|██████▋   | 4336/6434 [10:10:18<4:54:35,  8.42s/it, gpt_loss=0.289, loss_mean=0.31][A
+Train step of epoch 0:  67%|██████▋   | 4336/6434 [10:10:26<4:54:35,  8.42s/it, gpt_loss=0.333, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4337/6434 [10:10:26<4:58:30,  8.54s/it, gpt_loss=0.333, loss_mean=0.312][A
+Train step of epoch 0:  67%|██████▋   | 4337/6434 [10:10:35<4:58:30,  8.54s/it, gpt_loss=0.441, loss_mean=0.325][A
+Train step of epoch 0:  67%|██████▋   | 4338/6434 [10:10:35<4:54:35,  8.43s/it, gpt_loss=0.441, loss_mean=0.325][A
+Train step of epoch 0:  67%|██████▋   | 4338/6434 [10:10:43<4:54:35,  8.43s/it, gpt_loss=0.267, loss_mean=0.319][A
+Train step of epoch 0:  67%|██████▋   | 4339/6434 [10:10:43<4:59:20,  8.57s/it, gpt_loss=0.267, loss_mean=0.319][A
+[LID Router Debug] Step: 4340
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 9, 3, 5, 4, 2, 1, 3, 9, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  67%|██████▋   | 4339/6434 [10:10:53<4:59:20,  8.57s/it, gpt_loss=0.302, loss_mean=0.317][A
+Train step of epoch 0:  67%|██████▋   | 4340/6434 [10:10:53<5:06:20,  8.78s/it, gpt_loss=0.302, loss_mean=0.317][A
+Train step of epoch 0:  67%|██████▋   | 4340/6434 [10:11:02<5:06:20,  8.78s/it, gpt_loss=0.29, loss_mean=0.315] [A
+Train step of epoch 0:  67%|██████▋   | 4341/6434 [10:11:02<5:08:49,  8.85s/it, gpt_loss=0.29, loss_mean=0.315][A
+Train step of epoch 0:  67%|██████▋   | 4341/6434 [10:11:10<5:08:49,  8.85s/it, gpt_loss=0.249, loss_mean=0.308][A
+Train step of epoch 0:  67%|██████▋   | 4342/6434 [10:11:10<5:02:40,  8.68s/it, gpt_loss=0.249, loss_mean=0.308][A
+Train step of epoch 0:  67%|██████▋   | 4342/6434 [10:11:19<5:02:40,  8.68s/it, gpt_loss=0.341, loss_mean=0.311][A
+Train step of epoch 0:  68%|██████▊   | 4343/6434 [10:11:19<5:08:10,  8.84s/it, gpt_loss=0.341, loss_mean=0.311][A
+Train step of epoch 0:  68%|██████▊   | 4343/6434 [10:11:29<5:08:10,  8.84s/it, gpt_loss=0.35, loss_mean=0.315] [A
+Train step of epoch 0:  68%|██████▊   | 4344/6434 [10:11:29<5:14:15,  9.02s/it, gpt_loss=0.35, loss_mean=0.315][A
+Train step of epoch 0:  68%|██████▊   | 4344/6434 [10:11:36<5:14:15,  9.02s/it, gpt_loss=0.353, loss_mean=0.319][A
+Train step of epoch 0:  68%|██████▊   | 4345/6434 [10:11:36<4:58:52,  8.58s/it, gpt_loss=0.353, loss_mean=0.319][A
+Train step of epoch 0:  68%|██████▊   | 4345/6434 [10:11:43<4:58:52,  8.58s/it, gpt_loss=0.358, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4346/6434 [10:11:43<4:44:00,  8.16s/it, gpt_loss=0.358, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4346/6434 [10:11:51<4:44:00,  8.16s/it, gpt_loss=0.256, loss_mean=0.316][A
+Train step of epoch 0:  68%|██████▊   | 4347/6434 [10:11:51<4:40:20,  8.06s/it, gpt_loss=0.256, loss_mean=0.316][A
+Train step of epoch 0:  68%|██████▊   | 4347/6434 [10:11:58<4:40:20,  8.06s/it, gpt_loss=0.391, loss_mean=0.324][A
+Train step of epoch 0:  68%|██████▊   | 4348/6434 [10:11:58<4:30:56,  7.79s/it, gpt_loss=0.391, loss_mean=0.324][A
+Train step of epoch 0:  68%|██████▊   | 4348/6434 [10:12:07<4:30:56,  7.79s/it, gpt_loss=0.298, loss_mean=0.321][A
+Train step of epoch 0:  68%|██████▊   | 4349/6434 [10:12:07<4:43:19,  8.15s/it, gpt_loss=0.298, loss_mean=0.321][A
+[LID Router Debug] Step: 4350
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [1, 2, 0, 1, 1, 1, 2, 1, 5, 0]
+Active Experts in Batch: {0, 1, 2, 5}
+
+Train step of epoch 0:  68%|██████▊   | 4349/6434 [10:12:16<4:43:19,  8.15s/it, gpt_loss=0.329, loss_mean=0.322][A
+Train step of epoch 0:  68%|██████▊   | 4350/6434 [10:12:16<4:47:09,  8.27s/it, gpt_loss=0.329, loss_mean=0.322][A
+Train step of epoch 0:  68%|██████▊   | 4350/6434 [10:12:23<4:47:09,  8.27s/it, gpt_loss=0.298, loss_mean=0.319][A
+Train step of epoch 0:  68%|██████▊   | 4351/6434 [10:12:23<4:38:26,  8.02s/it, gpt_loss=0.298, loss_mean=0.319][A
+Train step of epoch 0:  68%|██████▊   | 4351/6434 [10:12:31<4:38:26,  8.02s/it, gpt_loss=0.258, loss_mean=0.313][A
+Train step of epoch 0:  68%|██████▊   | 4352/6434 [10:12:31<4:32:50,  7.86s/it, gpt_loss=0.258, loss_mean=0.313][A
+Train step of epoch 0:  68%|██████▊   | 4352/6434 [10:12:40<4:32:50,  7.86s/it, gpt_loss=0.247, loss_mean=0.307][A
+Train step of epoch 0:  68%|██████▊   | 4353/6434 [10:12:40<4:46:37,  8.26s/it, gpt_loss=0.247, loss_mean=0.307][A
+Train step of epoch 0:  68%|██████▊   | 4353/6434 [10:12:48<4:46:37,  8.26s/it, gpt_loss=0.241, loss_mean=0.3]  [A
+Train step of epoch 0:  68%|██████▊   | 4354/6434 [10:12:48<4:45:41,  8.24s/it, gpt_loss=0.241, loss_mean=0.3][A
+Train step of epoch 0:  68%|██████▊   | 4354/6434 [10:12:56<4:45:41,  8.24s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4355/6434 [10:12:56<4:44:24,  8.21s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4355/6434 [10:13:06<4:44:24,  8.21s/it, gpt_loss=0.312, loss_mean=0.3]  [A
+Train step of epoch 0:  68%|██████▊   | 4356/6434 [10:13:06<4:58:22,  8.62s/it, gpt_loss=0.312, loss_mean=0.3][A
+Train step of epoch 0:  68%|██████▊   | 4356/6434 [10:13:14<4:58:22,  8.62s/it, gpt_loss=0.291, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4357/6434 [10:13:14<4:46:55,  8.29s/it, gpt_loss=0.291, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4357/6434 [10:13:22<4:46:55,  8.29s/it, gpt_loss=0.3, loss_mean=0.299]  [A
+Train step of epoch 0:  68%|██████▊   | 4358/6434 [10:13:22<4:46:14,  8.27s/it, gpt_loss=0.3, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4358/6434 [10:13:30<4:46:14,  8.27s/it, gpt_loss=0.309, loss_mean=0.3][A
+Train step of epoch 0:  68%|██████▊   | 4359/6434 [10:13:30<4:49:40,  8.38s/it, gpt_loss=0.309, loss_mean=0.3][A
+[LID Router Debug] Step: 4360
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 9, 5, 4, 9, 5, 4, 1, 6, 4]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  68%|██████▊   | 4359/6434 [10:13:39<4:49:40,  8.38s/it, gpt_loss=0.286, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4360/6434 [10:13:39<4:51:25,  8.43s/it, gpt_loss=0.286, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4360/6434 [10:13:48<4:51:25,  8.43s/it, gpt_loss=0.253, loss_mean=0.294][A
+Train step of epoch 0:  68%|██████▊   | 4361/6434 [10:13:48<4:55:34,  8.56s/it, gpt_loss=0.253, loss_mean=0.294][A
+Train step of epoch 0:  68%|██████▊   | 4361/6434 [10:13:57<4:55:34,  8.56s/it, gpt_loss=0.236, loss_mean=0.288][A
+Train step of epoch 0:  68%|██████▊   | 4362/6434 [10:13:57<4:58:31,  8.64s/it, gpt_loss=0.236, loss_mean=0.288][A
+Train step of epoch 0:  68%|██████▊   | 4362/6434 [10:14:05<4:58:31,  8.64s/it, gpt_loss=0.298, loss_mean=0.289][A
+Train step of epoch 0:  68%|██████▊   | 4363/6434 [10:14:05<4:52:02,  8.46s/it, gpt_loss=0.298, loss_mean=0.289][A
+Train step of epoch 0:  68%|██████▊   | 4363/6434 [10:14:14<4:52:02,  8.46s/it, gpt_loss=0.242, loss_mean=0.285][A
+Train step of epoch 0:  68%|██████▊   | 4364/6434 [10:14:14<4:58:41,  8.66s/it, gpt_loss=0.242, loss_mean=0.285][A
+Train step of epoch 0:  68%|██████▊   | 4364/6434 [10:14:21<4:58:41,  8.66s/it, gpt_loss=0.278, loss_mean=0.284][A
+Train step of epoch 0:  68%|██████▊   | 4365/6434 [10:14:21<4:45:28,  8.28s/it, gpt_loss=0.278, loss_mean=0.284][A
+Train step of epoch 0:  68%|██████▊   | 4365/6434 [10:14:29<4:45:28,  8.28s/it, gpt_loss=0.281, loss_mean=0.284][A
+Train step of epoch 0:  68%|██████▊   | 4366/6434 [10:14:29<4:42:37,  8.20s/it, gpt_loss=0.281, loss_mean=0.284][A
+Train step of epoch 0:  68%|██████▊   | 4366/6434 [10:14:38<4:42:37,  8.20s/it, gpt_loss=0.214, loss_mean=0.277][A
+Train step of epoch 0:  68%|██████▊   | 4367/6434 [10:14:38<4:48:51,  8.38s/it, gpt_loss=0.214, loss_mean=0.277][A
+Train step of epoch 0:  68%|██████▊   | 4367/6434 [10:14:47<4:48:51,  8.38s/it, gpt_loss=0.244, loss_mean=0.273][A
+Train step of epoch 0:  68%|██████▊   | 4368/6434 [10:14:47<4:52:22,  8.49s/it, gpt_loss=0.244, loss_mean=0.273][A
+Train step of epoch 0:  68%|██████▊   | 4368/6434 [10:14:56<4:52:22,  8.49s/it, gpt_loss=0.333, loss_mean=0.279][A
+Train step of epoch 0:  68%|██████▊   | 4369/6434 [10:14:56<4:55:24,  8.58s/it, gpt_loss=0.333, loss_mean=0.279][A
+[LID Router Debug] Step: 4370
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [3, 3, 4, 3, 1, 0, 4, 2, 4, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  68%|██████▊   | 4369/6434 [10:15:03<4:55:24,  8.58s/it, gpt_loss=0.222, loss_mean=0.274][A
+Train step of epoch 0:  68%|██████▊   | 4370/6434 [10:15:03<4:48:49,  8.40s/it, gpt_loss=0.222, loss_mean=0.274][A
+Train step of epoch 0:  68%|██████▊   | 4370/6434 [10:15:13<4:48:49,  8.40s/it, gpt_loss=0.312, loss_mean=0.277][A
+Train step of epoch 0:  68%|██████▊   | 4371/6434 [10:15:13<5:00:51,  8.75s/it, gpt_loss=0.312, loss_mean=0.277][A
+Train step of epoch 0:  68%|██████▊   | 4371/6434 [10:15:22<5:00:51,  8.75s/it, gpt_loss=0.288, loss_mean=0.278][A
+Train step of epoch 0:  68%|██████▊   | 4372/6434 [10:15:22<5:01:08,  8.76s/it, gpt_loss=0.288, loss_mean=0.278][A
+Train step of epoch 0:  68%|██████▊   | 4372/6434 [10:15:30<5:01:08,  8.76s/it, gpt_loss=0.354, loss_mean=0.286][A
+Train step of epoch 0:  68%|██████▊   | 4373/6434 [10:15:30<4:52:42,  8.52s/it, gpt_loss=0.354, loss_mean=0.286][A
+Train step of epoch 0:  68%|██████▊   | 4373/6434 [10:15:39<4:52:42,  8.52s/it, gpt_loss=0.292, loss_mean=0.287][A
+Train step of epoch 0:  68%|██████▊   | 4374/6434 [10:15:39<4:58:11,  8.69s/it, gpt_loss=0.292, loss_mean=0.287][A
+Train step of epoch 0:  68%|██████▊   | 4374/6434 [10:15:47<4:58:11,  8.69s/it, gpt_loss=0.243, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4375/6434 [10:15:47<4:55:13,  8.60s/it, gpt_loss=0.243, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4375/6434 [10:15:56<4:55:13,  8.60s/it, gpt_loss=0.278, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4376/6434 [10:15:56<4:53:25,  8.55s/it, gpt_loss=0.278, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4376/6434 [10:16:04<4:53:25,  8.55s/it, gpt_loss=0.289, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4377/6434 [10:16:04<4:49:08,  8.43s/it, gpt_loss=0.289, loss_mean=0.282][A
+Train step of epoch 0:  68%|██████▊   | 4377/6434 [10:16:12<4:49:08,  8.43s/it, gpt_loss=0.331, loss_mean=0.287][A
+Train step of epoch 0:  68%|██████▊   | 4378/6434 [10:16:12<4:45:04,  8.32s/it, gpt_loss=0.331, loss_mean=0.287][A
+Train step of epoch 0:  68%|██████▊   | 4378/6434 [10:16:21<4:45:04,  8.32s/it, gpt_loss=0.331, loss_mean=0.292][A
+Train step of epoch 0:  68%|██████▊   | 4379/6434 [10:16:21<4:55:52,  8.64s/it, gpt_loss=0.331, loss_mean=0.292][A
+[LID Router Debug] Step: 4380
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [4, 3, 3, 0, 3, 5, 5, 9, 9, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  68%|██████▊   | 4379/6434 [10:16:29<4:55:52,  8.64s/it, gpt_loss=0.286, loss_mean=0.291][A
+Train step of epoch 0:  68%|██████▊   | 4380/6434 [10:16:29<4:50:53,  8.50s/it, gpt_loss=0.286, loss_mean=0.291][A
+Train step of epoch 0:  68%|██████▊   | 4380/6434 [10:16:38<4:50:53,  8.50s/it, gpt_loss=0.37, loss_mean=0.299] [A
+Train step of epoch 0:  68%|██████▊   | 4381/6434 [10:16:38<4:50:07,  8.48s/it, gpt_loss=0.37, loss_mean=0.299][A
+Train step of epoch 0:  68%|██████▊   | 4381/6434 [10:16:47<4:50:07,  8.48s/it, gpt_loss=0.258, loss_mean=0.295][A
+Train step of epoch 0:  68%|██████▊   | 4382/6434 [10:16:47<4:55:54,  8.65s/it, gpt_loss=0.258, loss_mean=0.295][A
+Train step of epoch 0:  68%|██████▊   | 4382/6434 [10:16:56<4:55:54,  8.65s/it, gpt_loss=0.326, loss_mean=0.298][A
+Train step of epoch 0:  68%|██████▊   | 4383/6434 [10:16:56<4:56:37,  8.68s/it, gpt_loss=0.326, loss_mean=0.298][A
+Train step of epoch 0:  68%|██████▊   | 4383/6434 [10:17:03<4:56:37,  8.68s/it, gpt_loss=0.399, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4384/6434 [10:17:03<4:41:20,  8.23s/it, gpt_loss=0.399, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4384/6434 [10:17:12<4:41:20,  8.23s/it, gpt_loss=0.388, loss_mean=0.316][A
+Train step of epoch 0:  68%|██████▊   | 4385/6434 [10:17:12<4:48:51,  8.46s/it, gpt_loss=0.388, loss_mean=0.316][A
+Train step of epoch 0:  68%|██████▊   | 4385/6434 [10:17:20<4:48:51,  8.46s/it, gpt_loss=0.407, loss_mean=0.325][A
+Train step of epoch 0:  68%|██████▊   | 4386/6434 [10:17:20<4:48:39,  8.46s/it, gpt_loss=0.407, loss_mean=0.325][A
+Train step of epoch 0:  68%|██████▊   | 4386/6434 [10:17:28<4:48:39,  8.46s/it, gpt_loss=0.316, loss_mean=0.324][A
+Train step of epoch 0:  68%|██████▊   | 4387/6434 [10:17:28<4:44:59,  8.35s/it, gpt_loss=0.316, loss_mean=0.324][A
+Train step of epoch 0:  68%|██████▊   | 4387/6434 [10:17:36<4:44:59,  8.35s/it, gpt_loss=0.312, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4388/6434 [10:17:36<4:35:53,  8.09s/it, gpt_loss=0.312, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4388/6434 [10:17:45<4:35:53,  8.09s/it, gpt_loss=0.392, loss_mean=0.33] [A
+Train step of epoch 0:  68%|██████▊   | 4389/6434 [10:17:45<4:50:35,  8.53s/it, gpt_loss=0.392, loss_mean=0.33][A
+[LID Router Debug] Step: 4390
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [0, 5, 5, 6, 4, 2, 1, 3, 6, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  68%|██████▊   | 4389/6434 [10:17:54<4:50:35,  8.53s/it, gpt_loss=0.26, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4390/6434 [10:17:54<4:48:46,  8.48s/it, gpt_loss=0.26, loss_mean=0.323][A
+Train step of epoch 0:  68%|██████▊   | 4390/6434 [10:18:03<4:48:46,  8.48s/it, gpt_loss=0.345, loss_mean=0.325][A
+Train step of epoch 0:  68%|██████▊   | 4391/6434 [10:18:03<4:57:05,  8.73s/it, gpt_loss=0.345, loss_mean=0.325][A
+Train step of epoch 0:  68%|██████▊   | 4391/6434 [10:18:11<4:57:05,  8.73s/it, gpt_loss=0.247, loss_mean=0.317][A
+Train step of epoch 0:  68%|██████▊   | 4392/6434 [10:18:11<4:48:38,  8.48s/it, gpt_loss=0.247, loss_mean=0.317][A
+Train step of epoch 0:  68%|██████▊   | 4392/6434 [10:18:19<4:48:38,  8.48s/it, gpt_loss=0.323, loss_mean=0.318][A
+Train step of epoch 0:  68%|██████▊   | 4393/6434 [10:18:19<4:45:37,  8.40s/it, gpt_loss=0.323, loss_mean=0.318][A
+Train step of epoch 0:  68%|██████▊   | 4393/6434 [10:18:27<4:45:37,  8.40s/it, gpt_loss=0.22, loss_mean=0.308] [A
+Train step of epoch 0:  68%|██████▊   | 4394/6434 [10:18:27<4:43:27,  8.34s/it, gpt_loss=0.22, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4394/6434 [10:18:35<4:43:27,  8.34s/it, gpt_loss=0.292, loss_mean=0.306][A
+Train step of epoch 0:  68%|██████▊   | 4395/6434 [10:18:35<4:37:34,  8.17s/it, gpt_loss=0.292, loss_mean=0.306][A
+Train step of epoch 0:  68%|██████▊   | 4395/6434 [10:18:44<4:37:34,  8.17s/it, gpt_loss=0.32, loss_mean=0.308] [A
+Train step of epoch 0:  68%|██████▊   | 4396/6434 [10:18:44<4:40:59,  8.27s/it, gpt_loss=0.32, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4396/6434 [10:18:51<4:40:59,  8.27s/it, gpt_loss=0.28, loss_mean=0.305][A
+Train step of epoch 0:  68%|██████▊   | 4397/6434 [10:18:51<4:32:22,  8.02s/it, gpt_loss=0.28, loss_mean=0.305][A
+Train step of epoch 0:  68%|██████▊   | 4397/6434 [10:19:01<4:32:22,  8.02s/it, gpt_loss=0.38, loss_mean=0.312][A
+Train step of epoch 0:  68%|██████▊   | 4398/6434 [10:19:01<4:49:16,  8.53s/it, gpt_loss=0.38, loss_mean=0.312][A
+Train step of epoch 0:  68%|██████▊   | 4398/6434 [10:19:10<4:49:16,  8.53s/it, gpt_loss=0.255, loss_mean=0.307][A
+Train step of epoch 0:  68%|██████▊   | 4399/6434 [10:19:10<4:51:00,  8.58s/it, gpt_loss=0.255, loss_mean=0.307][A
+[LID Router Debug] Step: 4400
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [1, 9, 3, 3, 0, 2, 9, 4, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+[2026-02-07 02:15:22,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=0, lr=[1.7702784375752962e-05, 1.7702784375752962e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 02:15:22,544] [INFO] [timer.py:260:stop] epoch=0/micro_step=4400/global_step=2200, RunningAvgSamplesPerSec=4.7467382922776125, CurrSamplesPerSec=4.664736105621367, MemAllocated=12.95GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  68%|██████▊   | 4399/6434 [10:19:18<4:51:00,  8.58s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  68%|██████▊   | 4400/6434 [10:19:18<4:49:46,  8.55s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  68%|██████▊   | 4400/6434 [10:19:28<4:49:46,  8.55s/it, gpt_loss=0.287, loss_mean=0.305][A
+Train step of epoch 0:  68%|██████▊   | 4401/6434 [10:19:28<5:00:43,  8.88s/it, gpt_loss=0.287, loss_mean=0.305][A
+Train step of epoch 0:  68%|██████▊   | 4401/6434 [10:19:38<5:00:43,  8.88s/it, gpt_loss=0.373, loss_mean=0.312][A
+Train step of epoch 0:  68%|██████▊   | 4402/6434 [10:19:38<5:12:18,  9.22s/it, gpt_loss=0.373, loss_mean=0.312][A
+Train step of epoch 0:  68%|██████▊   | 4402/6434 [10:19:46<5:12:18,  9.22s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  68%|██████▊   | 4403/6434 [10:19:46<5:02:22,  8.93s/it, gpt_loss=0.301, loss_mean=0.311][A
+Train step of epoch 0:  68%|██████▊   | 4403/6434 [10:19:55<5:02:22,  8.93s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4404/6434 [10:19:55<5:02:17,  8.93s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  68%|██████▊   | 4404/6434 [10:20:03<5:02:17,  8.93s/it, gpt_loss=0.268, loss_mean=0.304][A
+Train step of epoch 0:  68%|██████▊   | 4405/6434 [10:20:03<4:52:27,  8.65s/it, gpt_loss=0.268, loss_mean=0.304][A
+Train step of epoch 0:  68%|██████▊   | 4405/6434 [10:20:11<4:52:27,  8.65s/it, gpt_loss=0.287, loss_mean=0.302][A
+Train step of epoch 0:  68%|██████▊   | 4406/6434 [10:20:11<4:44:20,  8.41s/it, gpt_loss=0.287, loss_mean=0.302][A
+Train step of epoch 0:  68%|██████▊   | 4406/6434 [10:20:20<4:44:20,  8.41s/it, gpt_loss=0.21, loss_mean=0.293] [A
+Train step of epoch 0:  68%|██████▊   | 4407/6434 [10:20:20<4:51:16,  8.62s/it, gpt_loss=0.21, loss_mean=0.293][A
+Train step of epoch 0:  68%|██████▊   | 4407/6434 [10:20:28<4:51:16,  8.62s/it, gpt_loss=0.266, loss_mean=0.29][A
+Train step of epoch 0:  69%|██████▊   | 4408/6434 [10:20:28<4:46:39,  8.49s/it, gpt_loss=0.266, loss_mean=0.29][A
+Train step of epoch 0:  69%|██████▊   | 4408/6434 [10:20:36<4:46:39,  8.49s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  69%|██████▊   | 4409/6434 [10:20:36<4:41:58,  8.35s/it, gpt_loss=0.272, loss_mean=0.288][A
+[LID Router Debug] Step: 4410
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 9, 2, 9, 9, 3, 0, 2, 5, 3]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  69%|██████▊   | 4409/6434 [10:20:45<4:41:58,  8.35s/it, gpt_loss=0.352, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▊   | 4410/6434 [10:20:45<4:47:47,  8.53s/it, gpt_loss=0.352, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▊   | 4410/6434 [10:20:54<4:47:47,  8.53s/it, gpt_loss=0.262, loss_mean=0.292][A
+Train step of epoch 0:  69%|██████▊   | 4411/6434 [10:20:54<4:52:02,  8.66s/it, gpt_loss=0.262, loss_mean=0.292][A
+Train step of epoch 0:  69%|██████▊   | 4411/6434 [10:21:01<4:52:02,  8.66s/it, gpt_loss=0.302, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▊   | 4412/6434 [10:21:01<4:34:24,  8.14s/it, gpt_loss=0.302, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▊   | 4412/6434 [10:21:09<4:34:24,  8.14s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▊   | 4413/6434 [10:21:09<4:28:40,  7.98s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▊   | 4413/6434 [10:21:18<4:28:40,  7.98s/it, gpt_loss=0.269, loss_mean=0.289][A
+Train step of epoch 0:  69%|██████▊   | 4414/6434 [10:21:18<4:40:07,  8.32s/it, gpt_loss=0.269, loss_mean=0.289][A
+Train step of epoch 0:  69%|██████▊   | 4414/6434 [10:21:26<4:40:07,  8.32s/it, gpt_loss=0.314, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▊   | 4415/6434 [10:21:26<4:41:38,  8.37s/it, gpt_loss=0.314, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▊   | 4415/6434 [10:21:35<4:41:38,  8.37s/it, gpt_loss=0.315, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4416/6434 [10:21:35<4:50:16,  8.63s/it, gpt_loss=0.315, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4416/6434 [10:21:44<4:50:16,  8.63s/it, gpt_loss=0.296, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4417/6434 [10:21:44<4:51:51,  8.68s/it, gpt_loss=0.296, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4417/6434 [10:21:52<4:51:51,  8.68s/it, gpt_loss=0.295, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4418/6434 [10:21:52<4:47:03,  8.54s/it, gpt_loss=0.295, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▊   | 4418/6434 [10:22:01<4:47:03,  8.54s/it, gpt_loss=0.285, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▊   | 4419/6434 [10:22:01<4:43:49,  8.45s/it, gpt_loss=0.285, loss_mean=0.293][A
+[LID Router Debug] Step: 4420
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [3, 9, 6, 5, 2, 3, 0, 5, 0, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  69%|██████▊   | 4419/6434 [10:22:10<4:43:49,  8.45s/it, gpt_loss=0.311, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▊   | 4420/6434 [10:22:10<4:47:50,  8.58s/it, gpt_loss=0.311, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▊   | 4420/6434 [10:22:17<4:47:50,  8.58s/it, gpt_loss=0.324, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▊   | 4421/6434 [10:22:17<4:37:45,  8.28s/it, gpt_loss=0.324, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▊   | 4421/6434 [10:22:24<4:37:45,  8.28s/it, gpt_loss=0.252, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▊   | 4422/6434 [10:22:24<4:27:45,  7.98s/it, gpt_loss=0.252, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▊   | 4422/6434 [10:22:34<4:27:45,  7.98s/it, gpt_loss=0.252, loss_mean=0.289][A
+Train step of epoch 0:  69%|██████▊   | 4423/6434 [10:22:34<4:40:17,  8.36s/it, gpt_loss=0.252, loss_mean=0.289][A
+Train step of epoch 0:  69%|██████▊   | 4423/6434 [10:22:41<4:40:17,  8.36s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  69%|██████▉   | 4424/6434 [10:22:41<4:34:23,  8.19s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  69%|██████▉   | 4424/6434 [10:22:49<4:34:23,  8.19s/it, gpt_loss=0.292, loss_mean=0.288][A
+Train step of epoch 0:  69%|██████▉   | 4425/6434 [10:22:49<4:27:43,  8.00s/it, gpt_loss=0.292, loss_mean=0.288][A
+Train step of epoch 0:  69%|██████▉   | 4425/6434 [10:22:57<4:27:43,  8.00s/it, gpt_loss=0.253, loss_mean=0.285][A
+Train step of epoch 0:  69%|██████▉   | 4426/6434 [10:22:57<4:26:27,  7.96s/it, gpt_loss=0.253, loss_mean=0.285][A
+Train step of epoch 0:  69%|██████▉   | 4426/6434 [10:23:06<4:26:27,  7.96s/it, gpt_loss=0.346, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▉   | 4427/6434 [10:23:06<4:33:22,  8.17s/it, gpt_loss=0.346, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▉   | 4427/6434 [10:23:14<4:33:22,  8.17s/it, gpt_loss=0.327, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▉   | 4428/6434 [10:23:14<4:37:53,  8.31s/it, gpt_loss=0.327, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▉   | 4428/6434 [10:23:23<4:37:53,  8.31s/it, gpt_loss=0.339, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4429/6434 [10:23:23<4:47:29,  8.60s/it, gpt_loss=0.339, loss_mean=0.299][A
+[LID Router Debug] Step: 4430
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [0, 6, 4, 5, 9, 3, 4, 5, 3, 3]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  69%|██████▉   | 4429/6434 [10:23:32<4:47:29,  8.60s/it, gpt_loss=0.318, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4430/6434 [10:23:32<4:46:51,  8.59s/it, gpt_loss=0.318, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4430/6434 [10:23:40<4:46:51,  8.59s/it, gpt_loss=0.306, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4431/6434 [10:23:40<4:43:45,  8.50s/it, gpt_loss=0.306, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4431/6434 [10:23:48<4:43:45,  8.50s/it, gpt_loss=0.263, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4432/6434 [10:23:48<4:33:17,  8.19s/it, gpt_loss=0.263, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4432/6434 [10:23:57<4:33:17,  8.19s/it, gpt_loss=0.344, loss_mean=0.302][A
+Train step of epoch 0:  69%|██████▉   | 4433/6434 [10:23:57<4:46:21,  8.59s/it, gpt_loss=0.344, loss_mean=0.302][A
+Train step of epoch 0:  69%|██████▉   | 4433/6434 [10:24:05<4:46:21,  8.59s/it, gpt_loss=0.292, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4434/6434 [10:24:05<4:35:07,  8.25s/it, gpt_loss=0.292, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4434/6434 [10:24:13<4:35:07,  8.25s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4435/6434 [10:24:13<4:34:52,  8.25s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4435/6434 [10:24:22<4:34:52,  8.25s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4436/6434 [10:24:22<4:39:27,  8.39s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4436/6434 [10:24:31<4:39:27,  8.39s/it, gpt_loss=0.273, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▉   | 4437/6434 [10:24:31<4:52:29,  8.79s/it, gpt_loss=0.273, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▉   | 4437/6434 [10:24:40<4:52:29,  8.79s/it, gpt_loss=0.351, loss_mean=0.3]  [A
+Train step of epoch 0:  69%|██████▉   | 4438/6434 [10:24:40<4:47:02,  8.63s/it, gpt_loss=0.351, loss_mean=0.3][A
+Train step of epoch 0:  69%|██████▉   | 4438/6434 [10:24:48<4:47:02,  8.63s/it, gpt_loss=0.263, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4439/6434 [10:24:48<4:40:14,  8.43s/it, gpt_loss=0.263, loss_mean=0.296][A
+[LID Router Debug] Step: 4440
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [0, 1, 4, 5, 2, 9, 6, 3, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  69%|██████▉   | 4439/6434 [10:24:56<4:40:14,  8.43s/it, gpt_loss=0.284, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▉   | 4440/6434 [10:24:56<4:42:53,  8.51s/it, gpt_loss=0.284, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▉   | 4440/6434 [10:25:05<4:42:53,  8.51s/it, gpt_loss=0.248, loss_mean=0.29] [A
+Train step of epoch 0:  69%|██████▉   | 4441/6434 [10:25:05<4:47:07,  8.64s/it, gpt_loss=0.248, loss_mean=0.29][A
+Train step of epoch 0:  69%|██████▉   | 4441/6434 [10:25:13<4:47:07,  8.64s/it, gpt_loss=0.295, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▉   | 4442/6434 [10:25:13<4:39:04,  8.41s/it, gpt_loss=0.295, loss_mean=0.291][A
+Train step of epoch 0:  69%|██████▉   | 4442/6434 [10:25:21<4:39:04,  8.41s/it, gpt_loss=0.247, loss_mean=0.286][A
+Train step of epoch 0:  69%|██████▉   | 4443/6434 [10:25:21<4:33:46,  8.25s/it, gpt_loss=0.247, loss_mean=0.286][A
+Train step of epoch 0:  69%|██████▉   | 4443/6434 [10:25:30<4:33:46,  8.25s/it, gpt_loss=0.393, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4444/6434 [10:25:30<4:39:06,  8.42s/it, gpt_loss=0.393, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4444/6434 [10:25:38<4:39:06,  8.42s/it, gpt_loss=0.293, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4445/6434 [10:25:38<4:36:35,  8.34s/it, gpt_loss=0.293, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4445/6434 [10:25:47<4:36:35,  8.34s/it, gpt_loss=0.257, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▉   | 4446/6434 [10:25:47<4:38:19,  8.40s/it, gpt_loss=0.257, loss_mean=0.293][A
+Train step of epoch 0:  69%|██████▉   | 4446/6434 [10:25:56<4:38:19,  8.40s/it, gpt_loss=0.31, loss_mean=0.294] [A
+Train step of epoch 0:  69%|██████▉   | 4447/6434 [10:25:56<4:53:35,  8.87s/it, gpt_loss=0.31, loss_mean=0.294][A
+Train step of epoch 0:  69%|██████▉   | 4447/6434 [10:26:04<4:53:35,  8.87s/it, gpt_loss=0.315, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4448/6434 [10:26:04<4:36:18,  8.35s/it, gpt_loss=0.315, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4448/6434 [10:26:12<4:36:18,  8.35s/it, gpt_loss=0.384, loss_mean=0.305][A
+Train step of epoch 0:  69%|██████▉   | 4449/6434 [10:26:12<4:36:15,  8.35s/it, gpt_loss=0.384, loss_mean=0.305][A
+[LID Router Debug] Step: 4450
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [2, 3, 9, 5, 5, 3, 6, 3, 0, 0]
+Active Experts in Batch: {0, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  69%|██████▉   | 4449/6434 [10:26:20<4:36:15,  8.35s/it, gpt_loss=0.289, loss_mean=0.304][A
+Train step of epoch 0:  69%|██████▉   | 4450/6434 [10:26:20<4:29:28,  8.15s/it, gpt_loss=0.289, loss_mean=0.304][A
+Train step of epoch 0:  69%|██████▉   | 4450/6434 [10:26:30<4:29:28,  8.15s/it, gpt_loss=0.312, loss_mean=0.304][A
+Train step of epoch 0:  69%|██████▉   | 4451/6434 [10:26:30<4:46:08,  8.66s/it, gpt_loss=0.312, loss_mean=0.304][A
+Train step of epoch 0:  69%|██████▉   | 4451/6434 [10:26:38<4:46:08,  8.66s/it, gpt_loss=0.249, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4452/6434 [10:26:38<4:45:51,  8.65s/it, gpt_loss=0.249, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4452/6434 [10:26:47<4:45:51,  8.65s/it, gpt_loss=0.307, loss_mean=0.3]  [A
+Train step of epoch 0:  69%|██████▉   | 4453/6434 [10:26:47<4:47:30,  8.71s/it, gpt_loss=0.307, loss_mean=0.3][A
+Train step of epoch 0:  69%|██████▉   | 4453/6434 [10:26:55<4:47:30,  8.71s/it, gpt_loss=0.275, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4454/6434 [10:26:55<4:44:53,  8.63s/it, gpt_loss=0.275, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4454/6434 [10:27:05<4:44:53,  8.63s/it, gpt_loss=0.279, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▉   | 4455/6434 [10:27:05<4:58:22,  9.05s/it, gpt_loss=0.279, loss_mean=0.295][A
+Train step of epoch 0:  69%|██████▉   | 4455/6434 [10:27:16<4:58:22,  9.05s/it, gpt_loss=0.238, loss_mean=0.29] [A
+Train step of epoch 0:  69%|██████▉   | 4456/6434 [10:27:16<5:10:10,  9.41s/it, gpt_loss=0.238, loss_mean=0.29][A
+Train step of epoch 0:  69%|██████▉   | 4456/6434 [10:27:24<5:10:10,  9.41s/it, gpt_loss=0.377, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4457/6434 [10:27:24<4:59:18,  9.08s/it, gpt_loss=0.377, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4457/6434 [10:27:32<4:59:18,  9.08s/it, gpt_loss=0.319, loss_mean=0.3]  [A
+Train step of epoch 0:  69%|██████▉   | 4458/6434 [10:27:32<4:44:26,  8.64s/it, gpt_loss=0.319, loss_mean=0.3][A
+Train step of epoch 0:  69%|██████▉   | 4458/6434 [10:27:40<4:44:26,  8.64s/it, gpt_loss=0.3, loss_mean=0.3]  [A
+Train step of epoch 0:  69%|██████▉   | 4459/6434 [10:27:40<4:43:57,  8.63s/it, gpt_loss=0.3, loss_mean=0.3][A
+[LID Router Debug] Step: 4460
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [3, 2, 5, 1, 9, 1, 1, 9, 4, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  69%|██████▉   | 4459/6434 [10:27:47<4:43:57,  8.63s/it, gpt_loss=0.319, loss_mean=0.302][A
+Train step of epoch 0:  69%|██████▉   | 4460/6434 [10:27:47<4:30:17,  8.22s/it, gpt_loss=0.319, loss_mean=0.302][A
+Train step of epoch 0:  69%|██████▉   | 4460/6434 [10:27:57<4:30:17,  8.22s/it, gpt_loss=0.248, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4461/6434 [10:27:57<4:45:44,  8.69s/it, gpt_loss=0.248, loss_mean=0.297][A
+Train step of epoch 0:  69%|██████▉   | 4461/6434 [10:28:05<4:45:44,  8.69s/it, gpt_loss=0.311, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4462/6434 [10:28:05<4:40:16,  8.53s/it, gpt_loss=0.311, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4462/6434 [10:28:14<4:40:16,  8.53s/it, gpt_loss=0.322, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4463/6434 [10:28:14<4:42:01,  8.59s/it, gpt_loss=0.322, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4463/6434 [10:28:22<4:42:01,  8.59s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4464/6434 [10:28:22<4:36:54,  8.43s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  69%|██████▉   | 4464/6434 [10:28:31<4:36:54,  8.43s/it, gpt_loss=0.354, loss_mean=0.305][A
+Train step of epoch 0:  69%|██████▉   | 4465/6434 [10:28:31<4:38:01,  8.47s/it, gpt_loss=0.354, loss_mean=0.305][A
+Train step of epoch 0:  69%|██████▉   | 4465/6434 [10:28:40<4:38:01,  8.47s/it, gpt_loss=0.333, loss_mean=0.308][A
+Train step of epoch 0:  69%|██████▉   | 4466/6434 [10:28:40<4:40:18,  8.55s/it, gpt_loss=0.333, loss_mean=0.308][A
+Train step of epoch 0:  69%|██████▉   | 4466/6434 [10:28:48<4:40:18,  8.55s/it, gpt_loss=0.286, loss_mean=0.305][A
+Train step of epoch 0:  69%|██████▉   | 4467/6434 [10:28:48<4:35:07,  8.39s/it, gpt_loss=0.286, loss_mean=0.305][A
+Train step of epoch 0:  69%|██████▉   | 4467/6434 [10:28:56<4:35:07,  8.39s/it, gpt_loss=0.235, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4468/6434 [10:28:56<4:34:39,  8.38s/it, gpt_loss=0.235, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4468/6434 [10:29:04<4:34:39,  8.38s/it, gpt_loss=0.295, loss_mean=0.298][A
+Train step of epoch 0:  69%|██████▉   | 4469/6434 [10:29:04<4:32:32,  8.32s/it, gpt_loss=0.295, loss_mean=0.298][A
+[LID Router Debug] Step: 4470
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [2, 9, 3, 9, 0, 9, 3, 4, 2, 6]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  69%|██████▉   | 4469/6434 [10:29:11<4:32:32,  8.32s/it, gpt_loss=0.326, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4470/6434 [10:29:11<4:20:51,  7.97s/it, gpt_loss=0.326, loss_mean=0.301][A
+Train step of epoch 0:  69%|██████▉   | 4470/6434 [10:29:20<4:20:51,  7.97s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4471/6434 [10:29:20<4:29:05,  8.22s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  69%|██████▉   | 4471/6434 [10:29:30<4:29:05,  8.22s/it, gpt_loss=0.343, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4472/6434 [10:29:30<4:41:23,  8.61s/it, gpt_loss=0.343, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4472/6434 [10:29:38<4:41:23,  8.61s/it, gpt_loss=0.277, loss_mean=0.298][A
+Train step of epoch 0:  70%|██████▉   | 4473/6434 [10:29:38<4:44:24,  8.70s/it, gpt_loss=0.277, loss_mean=0.298][A
+Train step of epoch 0:  70%|██████▉   | 4473/6434 [10:29:47<4:44:24,  8.70s/it, gpt_loss=0.258, loss_mean=0.294][A
+Train step of epoch 0:  70%|██████▉   | 4474/6434 [10:29:47<4:38:37,  8.53s/it, gpt_loss=0.258, loss_mean=0.294][A
+Train step of epoch 0:  70%|██████▉   | 4474/6434 [10:29:55<4:38:37,  8.53s/it, gpt_loss=0.258, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4475/6434 [10:29:55<4:37:49,  8.51s/it, gpt_loss=0.258, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4475/6434 [10:30:04<4:37:49,  8.51s/it, gpt_loss=0.246, loss_mean=0.286][A
+Train step of epoch 0:  70%|██████▉   | 4476/6434 [10:30:04<4:38:23,  8.53s/it, gpt_loss=0.246, loss_mean=0.286][A
+Train step of epoch 0:  70%|██████▉   | 4476/6434 [10:30:12<4:38:23,  8.53s/it, gpt_loss=0.32, loss_mean=0.29]  [A
+Train step of epoch 0:  70%|██████▉   | 4477/6434 [10:30:12<4:35:22,  8.44s/it, gpt_loss=0.32, loss_mean=0.29][A
+Train step of epoch 0:  70%|██████▉   | 4477/6434 [10:30:20<4:35:22,  8.44s/it, gpt_loss=0.247, loss_mean=0.285][A
+Train step of epoch 0:  70%|██████▉   | 4478/6434 [10:30:20<4:32:17,  8.35s/it, gpt_loss=0.247, loss_mean=0.285][A
+Train step of epoch 0:  70%|██████▉   | 4478/6434 [10:30:28<4:32:17,  8.35s/it, gpt_loss=0.339, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4479/6434 [10:30:28<4:27:30,  8.21s/it, gpt_loss=0.339, loss_mean=0.291][A
+[LID Router Debug] Step: 4480
+Batch Size: 10
+Audio Batch Size: 135
+LID Assignments: [9, 6, 9, 5, 0, 4, 9, 3, 5, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  70%|██████▉   | 4479/6434 [10:30:37<4:27:30,  8.21s/it, gpt_loss=0.308, loss_mean=0.292][A
+Train step of epoch 0:  70%|██████▉   | 4480/6434 [10:30:37<4:31:10,  8.33s/it, gpt_loss=0.308, loss_mean=0.292][A
+Train step of epoch 0:  70%|██████▉   | 4480/6434 [10:30:45<4:31:10,  8.33s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  70%|██████▉   | 4481/6434 [10:30:45<4:30:52,  8.32s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  70%|██████▉   | 4481/6434 [10:30:53<4:30:52,  8.32s/it, gpt_loss=0.406, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4482/6434 [10:30:53<4:27:56,  8.24s/it, gpt_loss=0.406, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4482/6434 [10:31:01<4:27:56,  8.24s/it, gpt_loss=0.24, loss_mean=0.295] [A
+Train step of epoch 0:  70%|██████▉   | 4483/6434 [10:31:01<4:27:55,  8.24s/it, gpt_loss=0.24, loss_mean=0.295][A
+Train step of epoch 0:  70%|██████▉   | 4483/6434 [10:31:09<4:27:55,  8.24s/it, gpt_loss=0.233, loss_mean=0.289][A
+Train step of epoch 0:  70%|██████▉   | 4484/6434 [10:31:09<4:26:16,  8.19s/it, gpt_loss=0.233, loss_mean=0.289][A
+Train step of epoch 0:  70%|██████▉   | 4484/6434 [10:31:18<4:26:16,  8.19s/it, gpt_loss=0.28, loss_mean=0.288] [A
+Train step of epoch 0:  70%|██████▉   | 4485/6434 [10:31:18<4:33:57,  8.43s/it, gpt_loss=0.28, loss_mean=0.288][A
+Train step of epoch 0:  70%|██████▉   | 4485/6434 [10:31:28<4:33:57,  8.43s/it, gpt_loss=0.351, loss_mean=0.294][A
+Train step of epoch 0:  70%|██████▉   | 4486/6434 [10:31:28<4:44:29,  8.76s/it, gpt_loss=0.351, loss_mean=0.294][A
+Train step of epoch 0:  70%|██████▉   | 4486/6434 [10:31:36<4:44:29,  8.76s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  70%|██████▉   | 4487/6434 [10:31:36<4:39:14,  8.61s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  70%|██████▉   | 4487/6434 [10:31:45<4:39:14,  8.61s/it, gpt_loss=0.246, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4488/6434 [10:31:45<4:47:51,  8.88s/it, gpt_loss=0.246, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4488/6434 [10:31:53<4:47:51,  8.88s/it, gpt_loss=0.316, loss_mean=0.293][A
+Train step of epoch 0:  70%|██████▉   | 4489/6434 [10:31:53<4:32:20,  8.40s/it, gpt_loss=0.316, loss_mean=0.293][A
+[LID Router Debug] Step: 4490
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [1, 5, 2, 0, 2, 1, 2, 4, 9, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  70%|██████▉   | 4489/6434 [10:32:02<4:32:20,  8.40s/it, gpt_loss=0.288, loss_mean=0.293][A
+Train step of epoch 0:  70%|██████▉   | 4490/6434 [10:32:02<4:37:33,  8.57s/it, gpt_loss=0.288, loss_mean=0.293][A
+Train step of epoch 0:  70%|██████▉   | 4490/6434 [10:32:11<4:37:33,  8.57s/it, gpt_loss=0.332, loss_mean=0.297][A
+Train step of epoch 0:  70%|██████▉   | 4491/6434 [10:32:11<4:41:38,  8.70s/it, gpt_loss=0.332, loss_mean=0.297][A
+Train step of epoch 0:  70%|██████▉   | 4491/6434 [10:32:19<4:41:38,  8.70s/it, gpt_loss=0.401, loss_mean=0.307][A
+Train step of epoch 0:  70%|██████▉   | 4492/6434 [10:32:19<4:34:00,  8.47s/it, gpt_loss=0.401, loss_mean=0.307][A
+Train step of epoch 0:  70%|██████▉   | 4492/6434 [10:32:27<4:34:00,  8.47s/it, gpt_loss=0.241, loss_mean=0.3]  [A
+Train step of epoch 0:  70%|██████▉   | 4493/6434 [10:32:27<4:33:33,  8.46s/it, gpt_loss=0.241, loss_mean=0.3][A
+Train step of epoch 0:  70%|██████▉   | 4493/6434 [10:32:35<4:33:33,  8.46s/it, gpt_loss=0.317, loss_mean=0.302][A
+Train step of epoch 0:  70%|██████▉   | 4494/6434 [10:32:35<4:25:56,  8.23s/it, gpt_loss=0.317, loss_mean=0.302][A
+Train step of epoch 0:  70%|██████▉   | 4494/6434 [10:32:43<4:25:56,  8.23s/it, gpt_loss=0.31, loss_mean=0.303] [A
+Train step of epoch 0:  70%|██████▉   | 4495/6434 [10:32:43<4:28:52,  8.32s/it, gpt_loss=0.31, loss_mean=0.303][A
+Train step of epoch 0:  70%|██████▉   | 4495/6434 [10:32:52<4:28:52,  8.32s/it, gpt_loss=0.272, loss_mean=0.3] [A
+Train step of epoch 0:  70%|██████▉   | 4496/6434 [10:32:52<4:36:58,  8.57s/it, gpt_loss=0.272, loss_mean=0.3][A
+Train step of epoch 0:  70%|██████▉   | 4496/6434 [10:33:01<4:36:58,  8.57s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  70%|██████▉   | 4497/6434 [10:33:01<4:34:37,  8.51s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  70%|██████▉   | 4497/6434 [10:33:09<4:34:37,  8.51s/it, gpt_loss=0.355, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4498/6434 [10:33:09<4:32:11,  8.44s/it, gpt_loss=0.355, loss_mean=0.301][A
+Train step of epoch 0:  70%|██████▉   | 4498/6434 [10:33:18<4:32:11,  8.44s/it, gpt_loss=0.316, loss_mean=0.302][A
+Train step of epoch 0:  70%|██████▉   | 4499/6434 [10:33:18<4:32:15,  8.44s/it, gpt_loss=0.316, loss_mean=0.302][A
+[LID Router Debug] Step: 4500
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [9, 6, 6, 5, 2, 3, 1, 3, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  70%|██████▉   | 4499/6434 [10:33:26<4:32:15,  8.44s/it, gpt_loss=0.231, loss_mean=0.295][A
+Train step of epoch 0:  70%|██████▉   | 4500/6434 [10:33:26<4:36:01,  8.56s/it, gpt_loss=0.231, loss_mean=0.295][A
+Train step of epoch 0:  70%|██████▉   | 4500/6434 [10:33:34<4:36:01,  8.56s/it, gpt_loss=0.275, loss_mean=0.293][A
+Train step of epoch 0:  70%|██████▉   | 4501/6434 [10:33:34<4:25:16,  8.23s/it, gpt_loss=0.275, loss_mean=0.293][A
+Train step of epoch 0:  70%|██████▉   | 4501/6434 [10:33:42<4:25:16,  8.23s/it, gpt_loss=0.275, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4502/6434 [10:33:42<4:24:32,  8.22s/it, gpt_loss=0.275, loss_mean=0.291][A
+Train step of epoch 0:  70%|██████▉   | 4502/6434 [10:33:50<4:24:32,  8.22s/it, gpt_loss=0.282, loss_mean=0.29] [A
+Train step of epoch 0:  70%|██████▉   | 4503/6434 [10:33:50<4:24:29,  8.22s/it, gpt_loss=0.282, loss_mean=0.29][A
+Train step of epoch 0:  70%|██████▉   | 4503/6434 [10:33:59<4:24:29,  8.22s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  70%|███████   | 4504/6434 [10:33:59<4:28:40,  8.35s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  70%|███████   | 4504/6434 [10:34:07<4:28:40,  8.35s/it, gpt_loss=0.368, loss_mean=0.298][A
+Train step of epoch 0:  70%|███████   | 4505/6434 [10:34:07<4:26:18,  8.28s/it, gpt_loss=0.368, loss_mean=0.298][A
+Train step of epoch 0:  70%|███████   | 4505/6434 [10:34:15<4:26:18,  8.28s/it, gpt_loss=0.259, loss_mean=0.294][A
+Train step of epoch 0:  70%|███████   | 4506/6434 [10:34:15<4:25:54,  8.28s/it, gpt_loss=0.259, loss_mean=0.294][A
+Train step of epoch 0:  70%|███████   | 4506/6434 [10:34:25<4:25:54,  8.28s/it, gpt_loss=0.287, loss_mean=0.294][A
+Train step of epoch 0:  70%|███████   | 4507/6434 [10:34:25<4:35:52,  8.59s/it, gpt_loss=0.287, loss_mean=0.294][A
+Train step of epoch 0:  70%|███████   | 4507/6434 [10:34:32<4:35:52,  8.59s/it, gpt_loss=0.282, loss_mean=0.293][A
+Train step of epoch 0:  70%|███████   | 4508/6434 [10:34:32<4:26:12,  8.29s/it, gpt_loss=0.282, loss_mean=0.293][A
+Train step of epoch 0:  70%|███████   | 4508/6434 [10:34:41<4:26:12,  8.29s/it, gpt_loss=0.278, loss_mean=0.291][A
+Train step of epoch 0:  70%|███████   | 4509/6434 [10:34:41<4:30:26,  8.43s/it, gpt_loss=0.278, loss_mean=0.291][A
+[LID Router Debug] Step: 4510
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 6, 5, 2, 6, 2, 4, 3, 6, 6]
+Active Experts in Batch: {2, 3, 4, 5, 6}
+
+Train step of epoch 0:  70%|███████   | 4509/6434 [10:34:52<4:30:26,  8.43s/it, gpt_loss=0.256, loss_mean=0.288][A
+Train step of epoch 0:  70%|███████   | 4510/6434 [10:34:52<4:50:40,  9.06s/it, gpt_loss=0.256, loss_mean=0.288][A
+Train step of epoch 0:  70%|███████   | 4510/6434 [10:35:00<4:50:40,  9.06s/it, gpt_loss=0.252, loss_mean=0.284][A
+Train step of epoch 0:  70%|███████   | 4511/6434 [10:35:00<4:41:54,  8.80s/it, gpt_loss=0.252, loss_mean=0.284][A
+Train step of epoch 0:  70%|███████   | 4511/6434 [10:35:09<4:41:54,  8.80s/it, gpt_loss=0.349, loss_mean=0.29] [A
+Train step of epoch 0:  70%|███████   | 4512/6434 [10:35:09<4:42:22,  8.81s/it, gpt_loss=0.349, loss_mean=0.29][A
+Train step of epoch 0:  70%|███████   | 4512/6434 [10:35:17<4:42:22,  8.81s/it, gpt_loss=0.31, loss_mean=0.292][A
+Train step of epoch 0:  70%|███████   | 4513/6434 [10:35:17<4:42:52,  8.84s/it, gpt_loss=0.31, loss_mean=0.292][A
+Train step of epoch 0:  70%|███████   | 4513/6434 [10:35:26<4:42:52,  8.84s/it, gpt_loss=0.222, loss_mean=0.285][A
+Train step of epoch 0:  70%|███████   | 4514/6434 [10:35:26<4:38:44,  8.71s/it, gpt_loss=0.222, loss_mean=0.285][A
+Train step of epoch 0:  70%|███████   | 4514/6434 [10:35:34<4:38:44,  8.71s/it, gpt_loss=0.26, loss_mean=0.283] [A
+Train step of epoch 0:  70%|███████   | 4515/6434 [10:35:34<4:33:01,  8.54s/it, gpt_loss=0.26, loss_mean=0.283][A
+Train step of epoch 0:  70%|███████   | 4515/6434 [10:35:43<4:33:01,  8.54s/it, gpt_loss=0.253, loss_mean=0.28][A
+Train step of epoch 0:  70%|███████   | 4516/6434 [10:35:43<4:36:59,  8.67s/it, gpt_loss=0.253, loss_mean=0.28][A
+Train step of epoch 0:  70%|███████   | 4516/6434 [10:35:51<4:36:59,  8.67s/it, gpt_loss=0.258, loss_mean=0.278][A
+Train step of epoch 0:  70%|███████   | 4517/6434 [10:35:51<4:34:34,  8.59s/it, gpt_loss=0.258, loss_mean=0.278][A
+Train step of epoch 0:  70%|███████   | 4517/6434 [10:35:59<4:34:34,  8.59s/it, gpt_loss=0.267, loss_mean=0.277][A
+Train step of epoch 0:  70%|███████   | 4518/6434 [10:35:59<4:27:35,  8.38s/it, gpt_loss=0.267, loss_mean=0.277][A
+Train step of epoch 0:  70%|███████   | 4518/6434 [10:36:07<4:27:35,  8.38s/it, gpt_loss=0.324, loss_mean=0.281][A
+Train step of epoch 0:  70%|███████   | 4519/6434 [10:36:07<4:25:34,  8.32s/it, gpt_loss=0.324, loss_mean=0.281][A
+[LID Router Debug] Step: 4520
+Batch Size: 10
+Audio Batch Size: 151
+LID Assignments: [5, 3, 3, 4, 9, 2, 9, 2, 3, 9]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  70%|███████   | 4519/6434 [10:36:17<4:25:34,  8.32s/it, gpt_loss=0.428, loss_mean=0.296][A
+Train step of epoch 0:  70%|███████   | 4520/6434 [10:36:17<4:37:26,  8.70s/it, gpt_loss=0.428, loss_mean=0.296][A
+Train step of epoch 0:  70%|███████   | 4520/6434 [10:36:25<4:37:26,  8.70s/it, gpt_loss=0.367, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4521/6434 [10:36:25<4:30:08,  8.47s/it, gpt_loss=0.367, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4521/6434 [10:36:34<4:30:08,  8.47s/it, gpt_loss=0.291, loss_mean=0.302][A
+Train step of epoch 0:  70%|███████   | 4522/6434 [10:36:34<4:34:50,  8.62s/it, gpt_loss=0.291, loss_mean=0.302][A
+Train step of epoch 0:  70%|███████   | 4522/6434 [10:36:42<4:34:50,  8.62s/it, gpt_loss=0.358, loss_mean=0.307][A
+Train step of epoch 0:  70%|███████   | 4523/6434 [10:36:42<4:32:15,  8.55s/it, gpt_loss=0.358, loss_mean=0.307][A
+Train step of epoch 0:  70%|███████   | 4523/6434 [10:36:50<4:32:15,  8.55s/it, gpt_loss=0.262, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4524/6434 [10:36:50<4:21:30,  8.21s/it, gpt_loss=0.262, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4524/6434 [10:36:58<4:21:30,  8.21s/it, gpt_loss=0.28, loss_mean=0.301] [A
+Train step of epoch 0:  70%|███████   | 4525/6434 [10:36:58<4:24:38,  8.32s/it, gpt_loss=0.28, loss_mean=0.301][A
+Train step of epoch 0:  70%|███████   | 4525/6434 [10:37:07<4:24:38,  8.32s/it, gpt_loss=0.373, loss_mean=0.308][A
+Train step of epoch 0:  70%|███████   | 4526/6434 [10:37:07<4:27:22,  8.41s/it, gpt_loss=0.373, loss_mean=0.308][A
+Train step of epoch 0:  70%|███████   | 4526/6434 [10:37:16<4:27:22,  8.41s/it, gpt_loss=0.315, loss_mean=0.308][A
+Train step of epoch 0:  70%|███████   | 4527/6434 [10:37:16<4:29:23,  8.48s/it, gpt_loss=0.315, loss_mean=0.308][A
+Train step of epoch 0:  70%|███████   | 4527/6434 [10:37:25<4:29:23,  8.48s/it, gpt_loss=0.254, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4528/6434 [10:37:25<4:38:47,  8.78s/it, gpt_loss=0.254, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4528/6434 [10:37:33<4:38:47,  8.78s/it, gpt_loss=0.303, loss_mean=0.303][A
+Train step of epoch 0:  70%|███████   | 4529/6434 [10:37:33<4:34:39,  8.65s/it, gpt_loss=0.303, loss_mean=0.303][A
+[LID Router Debug] Step: 4530
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 9, 2, 5, 2, 4, 2, 1, 9, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  70%|███████   | 4529/6434 [10:37:41<4:34:39,  8.65s/it, gpt_loss=0.25, loss_mean=0.298] [A
+Train step of epoch 0:  70%|███████   | 4530/6434 [10:37:41<4:27:41,  8.44s/it, gpt_loss=0.25, loss_mean=0.298][A
+Train step of epoch 0:  70%|███████   | 4530/6434 [10:37:49<4:27:41,  8.44s/it, gpt_loss=0.28, loss_mean=0.296][A
+Train step of epoch 0:  70%|███████   | 4531/6434 [10:37:49<4:20:47,  8.22s/it, gpt_loss=0.28, loss_mean=0.296][A
+Train step of epoch 0:  70%|███████   | 4531/6434 [10:37:57<4:20:47,  8.22s/it, gpt_loss=0.345, loss_mean=0.301][A
+Train step of epoch 0:  70%|███████   | 4532/6434 [10:37:57<4:16:52,  8.10s/it, gpt_loss=0.345, loss_mean=0.301][A
+Train step of epoch 0:  70%|███████   | 4532/6434 [10:38:05<4:16:52,  8.10s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  70%|███████   | 4533/6434 [10:38:05<4:16:30,  8.10s/it, gpt_loss=0.285, loss_mean=0.299][A
+Train step of epoch 0:  70%|███████   | 4533/6434 [10:38:13<4:16:30,  8.10s/it, gpt_loss=0.223, loss_mean=0.292][A
+Train step of epoch 0:  70%|███████   | 4534/6434 [10:38:13<4:17:26,  8.13s/it, gpt_loss=0.223, loss_mean=0.292][A
+Train step of epoch 0:  70%|███████   | 4534/6434 [10:38:21<4:17:26,  8.13s/it, gpt_loss=0.26, loss_mean=0.288] [A
+Train step of epoch 0:  70%|███████   | 4535/6434 [10:38:21<4:13:57,  8.02s/it, gpt_loss=0.26, loss_mean=0.288][A
+Train step of epoch 0:  70%|███████   | 4535/6434 [10:38:29<4:13:57,  8.02s/it, gpt_loss=0.409, loss_mean=0.3] [A
+Train step of epoch 0:  71%|███████   | 4536/6434 [10:38:29<4:16:58,  8.12s/it, gpt_loss=0.409, loss_mean=0.3][A
+Train step of epoch 0:  71%|███████   | 4536/6434 [10:38:39<4:16:58,  8.12s/it, gpt_loss=0.256, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████   | 4537/6434 [10:38:39<4:36:09,  8.73s/it, gpt_loss=0.256, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████   | 4537/6434 [10:38:48<4:36:09,  8.73s/it, gpt_loss=0.24, loss_mean=0.29]  [A
+Train step of epoch 0:  71%|███████   | 4538/6434 [10:38:48<4:32:39,  8.63s/it, gpt_loss=0.24, loss_mean=0.29][A
+Train step of epoch 0:  71%|███████   | 4538/6434 [10:38:56<4:32:39,  8.63s/it, gpt_loss=0.31, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4539/6434 [10:38:56<4:31:27,  8.59s/it, gpt_loss=0.31, loss_mean=0.292][A
+[LID Router Debug] Step: 4540
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [5, 4, 5, 9, 1, 2, 1, 9, 0, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  71%|███████   | 4539/6434 [10:39:05<4:31:27,  8.59s/it, gpt_loss=0.344, loss_mean=0.298][A
+Train step of epoch 0:  71%|███████   | 4540/6434 [10:39:05<4:27:50,  8.49s/it, gpt_loss=0.344, loss_mean=0.298][A
+Train step of epoch 0:  71%|███████   | 4540/6434 [10:39:13<4:27:50,  8.49s/it, gpt_loss=0.228, loss_mean=0.291][A
+Train step of epoch 0:  71%|███████   | 4541/6434 [10:39:13<4:23:56,  8.37s/it, gpt_loss=0.228, loss_mean=0.291][A
+Train step of epoch 0:  71%|███████   | 4541/6434 [10:39:20<4:23:56,  8.37s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4542/6434 [10:39:20<4:16:26,  8.13s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4542/6434 [10:39:28<4:16:26,  8.13s/it, gpt_loss=0.225, loss_mean=0.282][A
+Train step of epoch 0:  71%|███████   | 4543/6434 [10:39:28<4:10:34,  7.95s/it, gpt_loss=0.225, loss_mean=0.282][A
+Train step of epoch 0:  71%|███████   | 4543/6434 [10:39:35<4:10:34,  7.95s/it, gpt_loss=0.27, loss_mean=0.281] [A
+Train step of epoch 0:  71%|███████   | 4544/6434 [10:39:35<4:06:43,  7.83s/it, gpt_loss=0.27, loss_mean=0.281][A
+Train step of epoch 0:  71%|███████   | 4544/6434 [10:39:45<4:06:43,  7.83s/it, gpt_loss=0.354, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4545/6434 [10:39:45<4:20:36,  8.28s/it, gpt_loss=0.354, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4545/6434 [10:39:53<4:20:36,  8.28s/it, gpt_loss=0.307, loss_mean=0.29] [A
+Train step of epoch 0:  71%|███████   | 4546/6434 [10:39:53<4:24:53,  8.42s/it, gpt_loss=0.307, loss_mean=0.29][A
+Train step of epoch 0:  71%|███████   | 4546/6434 [10:40:02<4:24:53,  8.42s/it, gpt_loss=0.31, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4547/6434 [10:40:02<4:26:45,  8.48s/it, gpt_loss=0.31, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4547/6434 [10:40:11<4:26:45,  8.48s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  71%|███████   | 4548/6434 [10:40:11<4:30:35,  8.61s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  71%|███████   | 4548/6434 [10:40:18<4:30:35,  8.61s/it, gpt_loss=0.329, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████   | 4549/6434 [10:40:18<4:19:00,  8.24s/it, gpt_loss=0.329, loss_mean=0.301][A
+[LID Router Debug] Step: 4550
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [1, 4, 4, 1, 3, 3, 9, 3, 9, 4]
+Active Experts in Batch: {1, 3, 4, 9}
+
+Train step of epoch 0:  71%|███████   | 4549/6434 [10:40:28<4:19:00,  8.24s/it, gpt_loss=0.334, loss_mean=0.304][A
+Train step of epoch 0:  71%|███████   | 4550/6434 [10:40:28<4:29:44,  8.59s/it, gpt_loss=0.334, loss_mean=0.304][A
+Train step of epoch 0:  71%|███████   | 4550/6434 [10:40:36<4:29:44,  8.59s/it, gpt_loss=0.266, loss_mean=0.3]  [A
+Train step of epoch 0:  71%|███████   | 4551/6434 [10:40:36<4:25:23,  8.46s/it, gpt_loss=0.266, loss_mean=0.3][A
+Train step of epoch 0:  71%|███████   | 4551/6434 [10:40:46<4:25:23,  8.46s/it, gpt_loss=0.236, loss_mean=0.294][A
+Train step of epoch 0:  71%|███████   | 4552/6434 [10:40:46<4:39:27,  8.91s/it, gpt_loss=0.236, loss_mean=0.294][A
+Train step of epoch 0:  71%|███████   | 4552/6434 [10:40:55<4:39:27,  8.91s/it, gpt_loss=0.276, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4553/6434 [10:40:55<4:44:58,  9.09s/it, gpt_loss=0.276, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4553/6434 [10:41:04<4:44:58,  9.09s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4554/6434 [10:41:04<4:43:28,  9.05s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  71%|███████   | 4554/6434 [10:41:13<4:43:28,  9.05s/it, gpt_loss=0.242, loss_mean=0.284][A
+Train step of epoch 0:  71%|███████   | 4555/6434 [10:41:13<4:43:16,  9.05s/it, gpt_loss=0.242, loss_mean=0.284][A
+Train step of epoch 0:  71%|███████   | 4555/6434 [10:41:22<4:43:16,  9.05s/it, gpt_loss=0.317, loss_mean=0.287][A
+Train step of epoch 0:  71%|███████   | 4556/6434 [10:41:22<4:38:14,  8.89s/it, gpt_loss=0.317, loss_mean=0.287][A
+Train step of epoch 0:  71%|███████   | 4556/6434 [10:41:30<4:38:14,  8.89s/it, gpt_loss=0.376, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████   | 4557/6434 [10:41:30<4:33:28,  8.74s/it, gpt_loss=0.376, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████   | 4557/6434 [10:41:38<4:33:28,  8.74s/it, gpt_loss=0.32, loss_mean=0.298] [A
+Train step of epoch 0:  71%|███████   | 4558/6434 [10:41:38<4:23:28,  8.43s/it, gpt_loss=0.32, loss_mean=0.298][A
+Train step of epoch 0:  71%|███████   | 4558/6434 [10:41:46<4:23:28,  8.43s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████   | 4559/6434 [10:41:46<4:16:35,  8.21s/it, gpt_loss=0.271, loss_mean=0.296][A
+[LID Router Debug] Step: 4560
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [1, 1, 4, 9, 0, 9, 1, 3, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  71%|███████   | 4559/6434 [10:41:53<4:16:35,  8.21s/it, gpt_loss=0.262, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4560/6434 [10:41:53<4:10:11,  8.01s/it, gpt_loss=0.262, loss_mean=0.292][A
+Train step of epoch 0:  71%|███████   | 4560/6434 [10:42:01<4:10:11,  8.01s/it, gpt_loss=0.375, loss_mean=0.3]  [A
+Train step of epoch 0:  71%|███████   | 4561/6434 [10:42:01<4:08:51,  7.97s/it, gpt_loss=0.375, loss_mean=0.3][A
+Train step of epoch 0:  71%|███████   | 4561/6434 [10:42:10<4:08:51,  7.97s/it, gpt_loss=0.359, loss_mean=0.306][A
+Train step of epoch 0:  71%|███████   | 4562/6434 [10:42:10<4:14:50,  8.17s/it, gpt_loss=0.359, loss_mean=0.306][A
+Train step of epoch 0:  71%|███████   | 4562/6434 [10:42:18<4:14:50,  8.17s/it, gpt_loss=0.296, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████   | 4563/6434 [10:42:18<4:12:25,  8.09s/it, gpt_loss=0.296, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████   | 4563/6434 [10:42:25<4:12:25,  8.09s/it, gpt_loss=0.391, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4564/6434 [10:42:25<4:07:47,  7.95s/it, gpt_loss=0.391, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4564/6434 [10:42:34<4:07:47,  7.95s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4565/6434 [10:42:34<4:11:39,  8.08s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4565/6434 [10:42:41<4:11:39,  8.08s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████   | 4566/6434 [10:42:41<4:09:02,  8.00s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████   | 4566/6434 [10:42:50<4:09:02,  8.00s/it, gpt_loss=0.352, loss_mean=0.315][A
+Train step of epoch 0:  71%|███████   | 4567/6434 [10:42:50<4:18:39,  8.31s/it, gpt_loss=0.352, loss_mean=0.315][A
+Train step of epoch 0:  71%|███████   | 4567/6434 [10:42:59<4:18:39,  8.31s/it, gpt_loss=0.342, loss_mean=0.318][A
+Train step of epoch 0:  71%|███████   | 4568/6434 [10:42:59<4:22:51,  8.45s/it, gpt_loss=0.342, loss_mean=0.318][A
+Train step of epoch 0:  71%|███████   | 4568/6434 [10:43:07<4:22:51,  8.45s/it, gpt_loss=0.236, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████   | 4569/6434 [10:43:07<4:15:33,  8.22s/it, gpt_loss=0.236, loss_mean=0.309][A
+[LID Router Debug] Step: 4570
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [2, 5, 5, 2, 0, 2, 9, 2, 5, 1]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+Train step of epoch 0:  71%|███████   | 4569/6434 [10:43:15<4:15:33,  8.22s/it, gpt_loss=0.228, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████   | 4570/6434 [10:43:15<4:13:43,  8.17s/it, gpt_loss=0.228, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████   | 4570/6434 [10:43:24<4:13:43,  8.17s/it, gpt_loss=0.38, loss_mean=0.309] [A
+Train step of epoch 0:  71%|███████   | 4571/6434 [10:43:24<4:21:59,  8.44s/it, gpt_loss=0.38, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████   | 4571/6434 [10:43:32<4:21:59,  8.44s/it, gpt_loss=0.368, loss_mean=0.315][A
+Train step of epoch 0:  71%|███████   | 4572/6434 [10:43:32<4:16:21,  8.26s/it, gpt_loss=0.368, loss_mean=0.315][A
+Train step of epoch 0:  71%|███████   | 4572/6434 [10:43:41<4:16:21,  8.26s/it, gpt_loss=0.252, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████   | 4573/6434 [10:43:41<4:20:06,  8.39s/it, gpt_loss=0.252, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████   | 4573/6434 [10:43:50<4:20:06,  8.39s/it, gpt_loss=0.353, loss_mean=0.313][A
+Train step of epoch 0:  71%|███████   | 4574/6434 [10:43:50<4:25:15,  8.56s/it, gpt_loss=0.353, loss_mean=0.313][A
+Train step of epoch 0:  71%|███████   | 4574/6434 [10:43:59<4:25:15,  8.56s/it, gpt_loss=0.265, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████   | 4575/6434 [10:43:59<4:34:50,  8.87s/it, gpt_loss=0.265, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████   | 4575/6434 [10:44:07<4:34:50,  8.87s/it, gpt_loss=0.277, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████   | 4576/6434 [10:44:07<4:25:46,  8.58s/it, gpt_loss=0.277, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████   | 4576/6434 [10:44:15<4:25:46,  8.58s/it, gpt_loss=0.34, loss_mean=0.309] [A
+Train step of epoch 0:  71%|███████   | 4577/6434 [10:44:15<4:15:30,  8.26s/it, gpt_loss=0.34, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████   | 4577/6434 [10:44:24<4:15:30,  8.26s/it, gpt_loss=0.35, loss_mean=0.313][A
+Train step of epoch 0:  71%|███████   | 4578/6434 [10:44:24<4:21:56,  8.47s/it, gpt_loss=0.35, loss_mean=0.313][A
+Train step of epoch 0:  71%|███████   | 4578/6434 [10:44:32<4:21:56,  8.47s/it, gpt_loss=0.323, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4579/6434 [10:44:32<4:22:40,  8.50s/it, gpt_loss=0.323, loss_mean=0.314][A
+[LID Router Debug] Step: 4580
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [2, 2, 1, 5, 9, 4, 2, 2, 4, 5]
+Active Experts in Batch: {1, 2, 4, 5, 9}
+
+Train step of epoch 0:  71%|███████   | 4579/6434 [10:44:41<4:22:40,  8.50s/it, gpt_loss=0.314, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4580/6434 [10:44:41<4:24:55,  8.57s/it, gpt_loss=0.314, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4580/6434 [10:44:50<4:24:55,  8.57s/it, gpt_loss=0.348, loss_mean=0.317][A
+Train step of epoch 0:  71%|███████   | 4581/6434 [10:44:50<4:33:27,  8.85s/it, gpt_loss=0.348, loss_mean=0.317][A
+Train step of epoch 0:  71%|███████   | 4581/6434 [10:44:59<4:33:27,  8.85s/it, gpt_loss=0.286, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4582/6434 [10:44:59<4:29:14,  8.72s/it, gpt_loss=0.286, loss_mean=0.314][A
+Train step of epoch 0:  71%|███████   | 4582/6434 [10:45:08<4:29:14,  8.72s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████   | 4583/6434 [10:45:08<4:31:11,  8.79s/it, gpt_loss=0.282, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████   | 4583/6434 [10:45:17<4:31:11,  8.79s/it, gpt_loss=0.212, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████   | 4584/6434 [10:45:17<4:36:36,  8.97s/it, gpt_loss=0.212, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████   | 4584/6434 [10:45:25<4:36:36,  8.97s/it, gpt_loss=0.37, loss_mean=0.308] [A
+Train step of epoch 0:  71%|███████▏  | 4585/6434 [10:45:25<4:26:13,  8.64s/it, gpt_loss=0.37, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████▏  | 4585/6434 [10:45:33<4:26:13,  8.64s/it, gpt_loss=0.305, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████▏  | 4586/6434 [10:45:33<4:23:22,  8.55s/it, gpt_loss=0.305, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████▏  | 4586/6434 [10:45:41<4:23:22,  8.55s/it, gpt_loss=0.29, loss_mean=0.306] [A
+Train step of epoch 0:  71%|███████▏  | 4587/6434 [10:45:41<4:16:33,  8.33s/it, gpt_loss=0.29, loss_mean=0.306][A
+Train step of epoch 0:  71%|███████▏  | 4587/6434 [10:45:49<4:16:33,  8.33s/it, gpt_loss=0.356, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████▏  | 4588/6434 [10:45:49<4:16:34,  8.34s/it, gpt_loss=0.356, loss_mean=0.311][A
+Train step of epoch 0:  71%|███████▏  | 4588/6434 [10:45:57<4:16:34,  8.34s/it, gpt_loss=0.294, loss_mean=0.309][A
+Train step of epoch 0:  71%|███████▏  | 4589/6434 [10:45:57<4:11:26,  8.18s/it, gpt_loss=0.294, loss_mean=0.309][A
+[LID Router Debug] Step: 4590
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [1, 3, 4, 1, 3, 4, 2, 3, 9, 9]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+
+Train step of epoch 0:  71%|███████▏  | 4589/6434 [10:46:06<4:11:26,  8.18s/it, gpt_loss=0.295, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████▏  | 4590/6434 [10:46:06<4:14:30,  8.28s/it, gpt_loss=0.295, loss_mean=0.308][A
+Train step of epoch 0:  71%|███████▏  | 4590/6434 [10:46:15<4:14:30,  8.28s/it, gpt_loss=0.248, loss_mean=0.302][A
+Train step of epoch 0:  71%|███████▏  | 4591/6434 [10:46:15<4:20:32,  8.48s/it, gpt_loss=0.248, loss_mean=0.302][A
+Train step of epoch 0:  71%|███████▏  | 4591/6434 [10:46:24<4:20:32,  8.48s/it, gpt_loss=0.239, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4592/6434 [10:46:24<4:24:54,  8.63s/it, gpt_loss=0.239, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4592/6434 [10:46:31<4:24:54,  8.63s/it, gpt_loss=0.353, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████▏  | 4593/6434 [10:46:31<4:10:22,  8.16s/it, gpt_loss=0.353, loss_mean=0.301][A
+Train step of epoch 0:  71%|███████▏  | 4593/6434 [10:46:40<4:10:22,  8.16s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████▏  | 4594/6434 [10:46:40<4:18:56,  8.44s/it, gpt_loss=0.253, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████▏  | 4594/6434 [10:46:48<4:18:56,  8.44s/it, gpt_loss=0.28, loss_mean=0.295] [A
+Train step of epoch 0:  71%|███████▏  | 4595/6434 [10:46:48<4:15:54,  8.35s/it, gpt_loss=0.28, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4595/6434 [10:46:57<4:15:54,  8.35s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████▏  | 4596/6434 [10:46:57<4:20:48,  8.51s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0:  71%|███████▏  | 4596/6434 [10:47:06<4:20:48,  8.51s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  71%|███████▏  | 4597/6434 [10:47:06<4:21:58,  8.56s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  71%|███████▏  | 4597/6434 [10:47:15<4:21:58,  8.56s/it, gpt_loss=0.332, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4598/6434 [10:47:15<4:30:28,  8.84s/it, gpt_loss=0.332, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4598/6434 [10:47:22<4:30:28,  8.84s/it, gpt_loss=0.295, loss_mean=0.295][A
+Train step of epoch 0:  71%|███████▏  | 4599/6434 [10:47:22<4:16:13,  8.38s/it, gpt_loss=0.295, loss_mean=0.295][A
+[LID Router Debug] Step: 4600
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [10, 5, 3, 1, 5, 2, 9, 1, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9, 10}
+[2026-02-07 02:43:34,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=0, lr=[1.7488964043936042e-05, 1.7488964043936042e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 02:43:34,801] [INFO] [timer.py:260:stop] epoch=0/micro_step=4600/global_step=2300, RunningAvgSamplesPerSec=4.746288722577603, CurrSamplesPerSec=5.253629582830168, MemAllocated=12.84GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  71%|███████▏  | 4599/6434 [10:47:30<4:16:13,  8.38s/it, gpt_loss=0.392, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████▏  | 4600/6434 [10:47:30<4:12:12,  8.25s/it, gpt_loss=0.392, loss_mean=0.305][A
+Train step of epoch 0:  71%|███████▏  | 4600/6434 [10:47:39<4:12:12,  8.25s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4601/6434 [10:47:39<4:14:07,  8.32s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4601/6434 [10:47:47<4:14:07,  8.32s/it, gpt_loss=0.257, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4602/6434 [10:47:47<4:08:42,  8.15s/it, gpt_loss=0.257, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4602/6434 [10:47:56<4:08:42,  8.15s/it, gpt_loss=0.317, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4603/6434 [10:47:56<4:19:48,  8.51s/it, gpt_loss=0.317, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4603/6434 [10:48:05<4:19:48,  8.51s/it, gpt_loss=0.212, loss_mean=0.294][A
+Train step of epoch 0:  72%|███████▏  | 4604/6434 [10:48:05<4:23:08,  8.63s/it, gpt_loss=0.212, loss_mean=0.294][A
+Train step of epoch 0:  72%|███████▏  | 4604/6434 [10:48:14<4:23:08,  8.63s/it, gpt_loss=0.313, loss_mean=0.296][A
+Train step of epoch 0:  72%|███████▏  | 4605/6434 [10:48:14<4:26:44,  8.75s/it, gpt_loss=0.313, loss_mean=0.296][A
+Train step of epoch 0:  72%|███████▏  | 4605/6434 [10:48:23<4:26:44,  8.75s/it, gpt_loss=0.306, loss_mean=0.297][A
+Train step of epoch 0:  72%|███████▏  | 4606/6434 [10:48:23<4:26:36,  8.75s/it, gpt_loss=0.306, loss_mean=0.297][A
+Train step of epoch 0:  72%|███████▏  | 4606/6434 [10:48:30<4:26:36,  8.75s/it, gpt_loss=0.289, loss_mean=0.296][A
+Train step of epoch 0:  72%|███████▏  | 4607/6434 [10:48:30<4:16:28,  8.42s/it, gpt_loss=0.289, loss_mean=0.296][A
+Train step of epoch 0:  72%|███████▏  | 4607/6434 [10:48:39<4:16:28,  8.42s/it, gpt_loss=0.265, loss_mean=0.293][A
+Train step of epoch 0:  72%|███████▏  | 4608/6434 [10:48:39<4:15:44,  8.40s/it, gpt_loss=0.265, loss_mean=0.293][A
+Train step of epoch 0:  72%|███████▏  | 4608/6434 [10:48:47<4:15:44,  8.40s/it, gpt_loss=0.308, loss_mean=0.295][A
+Train step of epoch 0:  72%|███████▏  | 4609/6434 [10:48:47<4:16:52,  8.45s/it, gpt_loss=0.308, loss_mean=0.295][A
+[LID Router Debug] Step: 4610
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [3, 2, 3, 9, 9, 0, 9, 9, 9, 6]
+Active Experts in Batch: {0, 2, 3, 6, 9}
+
+Train step of epoch 0:  72%|███████▏  | 4609/6434 [10:48:56<4:16:52,  8.45s/it, gpt_loss=0.33, loss_mean=0.298] [A
+Train step of epoch 0:  72%|███████▏  | 4610/6434 [10:48:56<4:21:21,  8.60s/it, gpt_loss=0.33, loss_mean=0.298][A
+Train step of epoch 0:  72%|███████▏  | 4610/6434 [10:49:04<4:21:21,  8.60s/it, gpt_loss=0.349, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4611/6434 [10:49:04<4:15:17,  8.40s/it, gpt_loss=0.349, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4611/6434 [10:49:13<4:15:17,  8.40s/it, gpt_loss=0.264, loss_mean=0.299][A
+Train step of epoch 0:  72%|███████▏  | 4612/6434 [10:49:13<4:17:58,  8.50s/it, gpt_loss=0.264, loss_mean=0.299][A
+Train step of epoch 0:  72%|███████▏  | 4612/6434 [10:49:21<4:17:58,  8.50s/it, gpt_loss=0.323, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4613/6434 [10:49:21<4:12:09,  8.31s/it, gpt_loss=0.323, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4613/6434 [10:49:29<4:12:09,  8.31s/it, gpt_loss=0.335, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4614/6434 [10:49:29<4:08:44,  8.20s/it, gpt_loss=0.335, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4614/6434 [10:49:38<4:08:44,  8.20s/it, gpt_loss=0.258, loss_mean=0.3]  [A
+Train step of epoch 0:  72%|███████▏  | 4615/6434 [10:49:38<4:15:46,  8.44s/it, gpt_loss=0.258, loss_mean=0.3][A
+Train step of epoch 0:  72%|███████▏  | 4615/6434 [10:49:46<4:15:46,  8.44s/it, gpt_loss=0.345, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4616/6434 [10:49:46<4:15:18,  8.43s/it, gpt_loss=0.345, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4616/6434 [10:49:55<4:15:18,  8.43s/it, gpt_loss=0.39, loss_mean=0.313] [A
+Train step of epoch 0:  72%|███████▏  | 4617/6434 [10:49:55<4:22:48,  8.68s/it, gpt_loss=0.39, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4617/6434 [10:50:05<4:22:48,  8.68s/it, gpt_loss=0.341, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4618/6434 [10:50:05<4:27:58,  8.85s/it, gpt_loss=0.341, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4618/6434 [10:50:13<4:27:58,  8.85s/it, gpt_loss=0.294, loss_mean=0.314][A
+Train step of epoch 0:  72%|███████▏  | 4619/6434 [10:50:13<4:21:51,  8.66s/it, gpt_loss=0.294, loss_mean=0.314][A
+[LID Router Debug] Step: 4620
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [4, 5, 0, 4, 3, 3, 0, 4, 4, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+
+Train step of epoch 0:  72%|███████▏  | 4619/6434 [10:50:22<4:21:51,  8.66s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  72%|███████▏  | 4620/6434 [10:50:22<4:29:20,  8.91s/it, gpt_loss=0.251, loss_mean=0.308][A
+Train step of epoch 0:  72%|███████▏  | 4620/6434 [10:50:31<4:29:20,  8.91s/it, gpt_loss=0.384, loss_mean=0.315][A
+Train step of epoch 0:  72%|███████▏  | 4621/6434 [10:50:31<4:29:10,  8.91s/it, gpt_loss=0.384, loss_mean=0.315][A
+Train step of epoch 0:  72%|███████▏  | 4621/6434 [10:50:40<4:29:10,  8.91s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  72%|███████▏  | 4622/6434 [10:50:40<4:30:39,  8.96s/it, gpt_loss=0.287, loss_mean=0.312][A
+Train step of epoch 0:  72%|███████▏  | 4622/6434 [10:50:49<4:30:39,  8.96s/it, gpt_loss=0.384, loss_mean=0.32] [A
+Train step of epoch 0:  72%|███████▏  | 4623/6434 [10:50:49<4:24:57,  8.78s/it, gpt_loss=0.384, loss_mean=0.32][A
+Train step of epoch 0:  72%|███████▏  | 4623/6434 [10:50:58<4:24:57,  8.78s/it, gpt_loss=0.251, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4624/6434 [10:50:58<4:26:51,  8.85s/it, gpt_loss=0.251, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4624/6434 [10:51:05<4:26:51,  8.85s/it, gpt_loss=0.346, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4625/6434 [10:51:05<4:18:26,  8.57s/it, gpt_loss=0.346, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4625/6434 [10:51:13<4:18:26,  8.57s/it, gpt_loss=0.337, loss_mean=0.318][A
+Train step of epoch 0:  72%|███████▏  | 4626/6434 [10:51:13<4:08:49,  8.26s/it, gpt_loss=0.337, loss_mean=0.318][A
+Train step of epoch 0:  72%|███████▏  | 4626/6434 [10:51:22<4:08:49,  8.26s/it, gpt_loss=0.293, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4627/6434 [10:51:22<4:10:47,  8.33s/it, gpt_loss=0.293, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4627/6434 [10:51:29<4:10:47,  8.33s/it, gpt_loss=0.323, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4628/6434 [10:51:29<3:59:14,  7.95s/it, gpt_loss=0.323, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4628/6434 [10:51:37<3:59:14,  7.95s/it, gpt_loss=0.223, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4629/6434 [10:51:37<4:01:48,  8.04s/it, gpt_loss=0.223, loss_mean=0.307][A
+[LID Router Debug] Step: 4630
+Batch Size: 10
+Audio Batch Size: 145
+LID Assignments: [3, 5, 0, 5, 2, 5, 3, 3, 2, 5]
+Active Experts in Batch: {0, 2, 3, 5}
+
+Train step of epoch 0:  72%|███████▏  | 4629/6434 [10:51:46<4:01:48,  8.04s/it, gpt_loss=0.266, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4630/6434 [10:51:46<4:09:33,  8.30s/it, gpt_loss=0.266, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4630/6434 [10:51:54<4:09:33,  8.30s/it, gpt_loss=0.402, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4631/6434 [10:51:54<4:05:17,  8.16s/it, gpt_loss=0.402, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4631/6434 [10:52:02<4:05:17,  8.16s/it, gpt_loss=0.294, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4632/6434 [10:52:02<4:06:15,  8.20s/it, gpt_loss=0.294, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4632/6434 [10:52:10<4:06:15,  8.20s/it, gpt_loss=0.296, loss_mean=0.309][A
+Train step of epoch 0:  72%|███████▏  | 4633/6434 [10:52:10<4:02:14,  8.07s/it, gpt_loss=0.296, loss_mean=0.309][A
+Train step of epoch 0:  72%|███████▏  | 4633/6434 [10:52:19<4:02:14,  8.07s/it, gpt_loss=0.243, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4634/6434 [10:52:19<4:11:42,  8.39s/it, gpt_loss=0.243, loss_mean=0.303][A
+Train step of epoch 0:  72%|███████▏  | 4634/6434 [10:52:26<4:11:42,  8.39s/it, gpt_loss=0.326, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4635/6434 [10:52:26<4:02:38,  8.09s/it, gpt_loss=0.326, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4635/6434 [10:52:35<4:02:38,  8.09s/it, gpt_loss=0.364, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4636/6434 [10:52:35<4:04:58,  8.17s/it, gpt_loss=0.364, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4636/6434 [10:52:44<4:04:58,  8.17s/it, gpt_loss=0.343, loss_mean=0.314][A
+Train step of epoch 0:  72%|███████▏  | 4637/6434 [10:52:44<4:18:57,  8.65s/it, gpt_loss=0.343, loss_mean=0.314][A
+Train step of epoch 0:  72%|███████▏  | 4637/6434 [10:52:53<4:18:57,  8.65s/it, gpt_loss=0.419, loss_mean=0.325][A
+Train step of epoch 0:  72%|███████▏  | 4638/6434 [10:52:53<4:22:32,  8.77s/it, gpt_loss=0.419, loss_mean=0.325][A
+Train step of epoch 0:  72%|███████▏  | 4638/6434 [10:53:01<4:22:32,  8.77s/it, gpt_loss=0.292, loss_mean=0.321][A
+Train step of epoch 0:  72%|███████▏  | 4639/6434 [10:53:01<4:15:43,  8.55s/it, gpt_loss=0.292, loss_mean=0.321][A
+[LID Router Debug] Step: 4640
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [9, 2, 9, 9, 2, 2, 0, 1, 6, 3]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+Train step of epoch 0:  72%|███████▏  | 4639/6434 [10:53:09<4:15:43,  8.55s/it, gpt_loss=0.304, loss_mean=0.32] [A
+Train step of epoch 0:  72%|███████▏  | 4640/6434 [10:53:09<4:07:09,  8.27s/it, gpt_loss=0.304, loss_mean=0.32][A
+Train step of epoch 0:  72%|███████▏  | 4640/6434 [10:53:17<4:07:09,  8.27s/it, gpt_loss=0.294, loss_mean=0.317][A
+Train step of epoch 0:  72%|███████▏  | 4641/6434 [10:53:17<4:09:14,  8.34s/it, gpt_loss=0.294, loss_mean=0.317][A
+Train step of epoch 0:  72%|███████▏  | 4641/6434 [10:53:26<4:09:14,  8.34s/it, gpt_loss=0.259, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4642/6434 [10:53:26<4:11:41,  8.43s/it, gpt_loss=0.259, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4642/6434 [10:53:35<4:11:41,  8.43s/it, gpt_loss=0.307, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4643/6434 [10:53:35<4:15:20,  8.55s/it, gpt_loss=0.307, loss_mean=0.311][A
+Train step of epoch 0:  72%|███████▏  | 4643/6434 [10:53:43<4:15:20,  8.55s/it, gpt_loss=0.336, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4644/6434 [10:53:43<4:09:43,  8.37s/it, gpt_loss=0.336, loss_mean=0.313][A
+Train step of epoch 0:  72%|███████▏  | 4644/6434 [10:53:53<4:09:43,  8.37s/it, gpt_loss=0.338, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4645/6434 [10:53:53<4:20:38,  8.74s/it, gpt_loss=0.338, loss_mean=0.316][A
+Train step of epoch 0:  72%|███████▏  | 4645/6434 [10:54:01<4:20:38,  8.74s/it, gpt_loss=0.225, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4646/6434 [10:54:01<4:17:15,  8.63s/it, gpt_loss=0.225, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4646/6434 [10:54:10<4:17:15,  8.63s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4647/6434 [10:54:10<4:21:04,  8.77s/it, gpt_loss=0.308, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4647/6434 [10:54:19<4:21:04,  8.77s/it, gpt_loss=0.29, loss_mean=0.305] [A
+Train step of epoch 0:  72%|███████▏  | 4648/6434 [10:54:19<4:20:09,  8.74s/it, gpt_loss=0.29, loss_mean=0.305][A
+Train step of epoch 0:  72%|███████▏  | 4648/6434 [10:54:26<4:20:09,  8.74s/it, gpt_loss=0.423, loss_mean=0.317][A
+Train step of epoch 0:  72%|███████▏  | 4649/6434 [10:54:26<4:10:40,  8.43s/it, gpt_loss=0.423, loss_mean=0.317][A
+[LID Router Debug] Step: 4650
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [1, 6, 0, 1, 1, 0, 0, 5, 3, 6]
+Active Experts in Batch: {0, 1, 3, 5, 6}
+
+Train step of epoch 0:  72%|███████▏  | 4649/6434 [10:54:33<4:10:40,  8.43s/it, gpt_loss=0.247, loss_mean=0.31] [A
+Train step of epoch 0:  72%|███████▏  | 4650/6434 [10:54:33<3:56:57,  7.97s/it, gpt_loss=0.247, loss_mean=0.31][A
+Train step of epoch 0:  72%|███████▏  | 4650/6434 [10:54:41<3:56:57,  7.97s/it, gpt_loss=0.359, loss_mean=0.315][A
+Train step of epoch 0:  72%|███████▏  | 4651/6434 [10:54:41<3:58:14,  8.02s/it, gpt_loss=0.359, loss_mean=0.315][A
+Train step of epoch 0:  72%|███████▏  | 4651/6434 [10:54:50<3:58:14,  8.02s/it, gpt_loss=0.356, loss_mean=0.319][A
+Train step of epoch 0:  72%|███████▏  | 4652/6434 [10:54:50<4:01:33,  8.13s/it, gpt_loss=0.356, loss_mean=0.319][A
+Train step of epoch 0:  72%|███████▏  | 4652/6434 [10:54:58<4:01:33,  8.13s/it, gpt_loss=0.269, loss_mean=0.314][A
+Train step of epoch 0:  72%|███████▏  | 4653/6434 [10:54:58<4:00:16,  8.09s/it, gpt_loss=0.269, loss_mean=0.314][A
+Train step of epoch 0:  72%|███████▏  | 4653/6434 [10:55:08<4:00:16,  8.09s/it, gpt_loss=0.24, loss_mean=0.306] [A
+Train step of epoch 0:  72%|███████▏  | 4654/6434 [10:55:08<4:19:46,  8.76s/it, gpt_loss=0.24, loss_mean=0.306][A
+Train step of epoch 0:  72%|███████▏  | 4654/6434 [10:55:15<4:19:46,  8.76s/it, gpt_loss=0.254, loss_mean=0.301][A
+Train step of epoch 0:  72%|███████▏  | 4655/6434 [10:55:15<4:05:00,  8.26s/it, gpt_loss=0.254, loss_mean=0.301][A
+Train step of epoch 0:  72%|███████▏  | 4655/6434 [10:55:23<4:05:00,  8.26s/it, gpt_loss=0.362, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4656/6434 [10:55:23<4:04:25,  8.25s/it, gpt_loss=0.362, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4656/6434 [10:55:33<4:04:25,  8.25s/it, gpt_loss=0.306, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4657/6434 [10:55:33<4:15:12,  8.62s/it, gpt_loss=0.306, loss_mean=0.307][A
+Train step of epoch 0:  72%|███████▏  | 4657/6434 [10:55:41<4:15:12,  8.62s/it, gpt_loss=0.24, loss_mean=0.3]   [A
+Train step of epoch 0:  72%|███████▏  | 4658/6434 [10:55:41<4:09:36,  8.43s/it, gpt_loss=0.24, loss_mean=0.3][A
+Train step of epoch 0:  72%|███████▏  | 4658/6434 [10:55:49<4:09:36,  8.43s/it, gpt_loss=0.262, loss_mean=0.297][A
+Train step of epoch 0:  72%|███████▏  | 4659/6434 [10:55:49<4:05:30,  8.30s/it, gpt_loss=0.262, loss_mean=0.297][A
+[LID Router Debug] Step: 4660
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [1, 4, 2, 9, 3, 0, 7, 6, 4, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 7, 9}
+
+Train step of epoch 0:  72%|███████▏  | 4659/6434 [10:55:57<4:05:30,  8.30s/it, gpt_loss=0.346, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4660/6434 [10:55:57<4:02:47,  8.21s/it, gpt_loss=0.346, loss_mean=0.302][A
+Train step of epoch 0:  72%|███████▏  | 4660/6434 [10:56:05<4:02:47,  8.21s/it, gpt_loss=0.326, loss_mean=0.304][A
+Train step of epoch 0:  72%|███████▏  | 4661/6434 [10:56:05<4:03:25,  8.24s/it, gpt_loss=0.326, loss_mean=0.304][A
+Train step of epoch 0:  72%|███████▏  | 4661/6434 [10:56:14<4:03:25,  8.24s/it, gpt_loss=0.24, loss_mean=0.298] [A
+Train step of epoch 0:  72%|███████▏  | 4662/6434 [10:56:14<4:07:35,  8.38s/it, gpt_loss=0.24, loss_mean=0.298][A
+Train step of epoch 0:  72%|███████▏  | 4662/6434 [10:56:23<4:07:35,  8.38s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  72%|███████▏  | 4663/6434 [10:56:23<4:10:33,  8.49s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  72%|███████▏  | 4663/6434 [10:56:30<4:10:33,  8.49s/it, gpt_loss=0.251, loss_mean=0.293][A
+Train step of epoch 0:  72%|███████▏  | 4664/6434 [10:56:30<4:02:25,  8.22s/it, gpt_loss=0.251, loss_mean=0.293][A
+Train step of epoch 0:  72%|███████▏  | 4664/6434 [10:56:39<4:02:25,  8.22s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4665/6434 [10:56:39<4:09:39,  8.47s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4665/6434 [10:56:48<4:09:39,  8.47s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4666/6434 [10:56:48<4:11:32,  8.54s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4666/6434 [10:56:56<4:11:32,  8.54s/it, gpt_loss=0.332, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4667/6434 [10:56:56<4:07:20,  8.40s/it, gpt_loss=0.332, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4667/6434 [10:57:04<4:07:20,  8.40s/it, gpt_loss=0.412, loss_mean=0.307][A
+Train step of epoch 0:  73%|███████▎  | 4668/6434 [10:57:04<4:07:23,  8.41s/it, gpt_loss=0.412, loss_mean=0.307][A
+Train step of epoch 0:  73%|███████▎  | 4668/6434 [10:57:13<4:07:23,  8.41s/it, gpt_loss=0.287, loss_mean=0.305][A
+Train step of epoch 0:  73%|███████▎  | 4669/6434 [10:57:13<4:05:58,  8.36s/it, gpt_loss=0.287, loss_mean=0.305][A
+[LID Router Debug] Step: 4670
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 1, 6, 1, 3, 3, 1, 0, 0, 0]
+Active Experts in Batch: {0, 1, 3, 6, 9}
+
+Train step of epoch 0:  73%|███████▎  | 4669/6434 [10:57:21<4:05:58,  8.36s/it, gpt_loss=0.317, loss_mean=0.306][A
+Train step of epoch 0:  73%|███████▎  | 4670/6434 [10:57:21<4:00:51,  8.19s/it, gpt_loss=0.317, loss_mean=0.306][A
+Train step of epoch 0:  73%|███████▎  | 4670/6434 [10:57:29<4:00:51,  8.19s/it, gpt_loss=0.3, loss_mean=0.306]  [A
+Train step of epoch 0:  73%|███████▎  | 4671/6434 [10:57:29<4:02:56,  8.27s/it, gpt_loss=0.3, loss_mean=0.306][A
+Train step of epoch 0:  73%|███████▎  | 4671/6434 [10:57:37<4:02:56,  8.27s/it, gpt_loss=0.28, loss_mean=0.303][A
+Train step of epoch 0:  73%|███████▎  | 4672/6434 [10:57:37<4:03:46,  8.30s/it, gpt_loss=0.28, loss_mean=0.303][A
+Train step of epoch 0:  73%|███████▎  | 4672/6434 [10:57:47<4:03:46,  8.30s/it, gpt_loss=0.406, loss_mean=0.313][A
+Train step of epoch 0:  73%|███████▎  | 4673/6434 [10:57:47<4:17:12,  8.76s/it, gpt_loss=0.406, loss_mean=0.313][A
+Train step of epoch 0:  73%|███████▎  | 4673/6434 [10:57:57<4:17:12,  8.76s/it, gpt_loss=0.281, loss_mean=0.31] [A
+Train step of epoch 0:  73%|███████▎  | 4674/6434 [10:57:57<4:23:39,  8.99s/it, gpt_loss=0.281, loss_mean=0.31][A
+Train step of epoch 0:  73%|███████▎  | 4674/6434 [10:58:05<4:23:39,  8.99s/it, gpt_loss=0.344, loss_mean=0.313][A
+Train step of epoch 0:  73%|███████▎  | 4675/6434 [10:58:05<4:13:49,  8.66s/it, gpt_loss=0.344, loss_mean=0.313][A
+Train step of epoch 0:  73%|███████▎  | 4675/6434 [10:58:13<4:13:49,  8.66s/it, gpt_loss=0.24, loss_mean=0.306] [A
+Train step of epoch 0:  73%|███████▎  | 4676/6434 [10:58:13<4:13:43,  8.66s/it, gpt_loss=0.24, loss_mean=0.306][A
+Train step of epoch 0:  73%|███████▎  | 4676/6434 [10:58:21<4:13:43,  8.66s/it, gpt_loss=0.266, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4677/6434 [10:58:21<4:04:21,  8.34s/it, gpt_loss=0.266, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4677/6434 [10:58:29<4:04:21,  8.34s/it, gpt_loss=0.302, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4678/6434 [10:58:29<4:06:13,  8.41s/it, gpt_loss=0.302, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4678/6434 [10:58:37<4:06:13,  8.41s/it, gpt_loss=0.29, loss_mean=0.301] [A
+Train step of epoch 0:  73%|███████▎  | 4679/6434 [10:58:37<4:02:07,  8.28s/it, gpt_loss=0.29, loss_mean=0.301][A
+[LID Router Debug] Step: 4680
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [3, 9, 1, 2, 2, 3, 4, 1, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  73%|███████▎  | 4679/6434 [10:58:46<4:02:07,  8.28s/it, gpt_loss=0.315, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4680/6434 [10:58:46<4:00:44,  8.24s/it, gpt_loss=0.315, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4680/6434 [10:58:53<4:00:44,  8.24s/it, gpt_loss=0.374, loss_mean=0.309][A
+Train step of epoch 0:  73%|███████▎  | 4681/6434 [10:58:53<3:58:09,  8.15s/it, gpt_loss=0.374, loss_mean=0.309][A
+Train step of epoch 0:  73%|███████▎  | 4681/6434 [10:59:02<3:58:09,  8.15s/it, gpt_loss=0.296, loss_mean=0.308][A
+Train step of epoch 0:  73%|███████▎  | 4682/6434 [10:59:02<4:01:34,  8.27s/it, gpt_loss=0.296, loss_mean=0.308][A
+Train step of epoch 0:  73%|███████▎  | 4682/6434 [10:59:10<4:01:34,  8.27s/it, gpt_loss=0.301, loss_mean=0.307][A
+Train step of epoch 0:  73%|███████▎  | 4683/6434 [10:59:10<3:58:37,  8.18s/it, gpt_loss=0.301, loss_mean=0.307][A
+Train step of epoch 0:  73%|███████▎  | 4683/6434 [10:59:19<3:58:37,  8.18s/it, gpt_loss=0.33, loss_mean=0.309] [A
+Train step of epoch 0:  73%|███████▎  | 4684/6434 [10:59:19<4:06:32,  8.45s/it, gpt_loss=0.33, loss_mean=0.309][A
+Train step of epoch 0:  73%|███████▎  | 4684/6434 [10:59:27<4:06:32,  8.45s/it, gpt_loss=0.251, loss_mean=0.304][A
+Train step of epoch 0:  73%|███████▎  | 4685/6434 [10:59:27<4:05:52,  8.44s/it, gpt_loss=0.251, loss_mean=0.304][A
+Train step of epoch 0:  73%|███████▎  | 4685/6434 [10:59:36<4:05:52,  8.44s/it, gpt_loss=0.425, loss_mean=0.316][A
+Train step of epoch 0:  73%|███████▎  | 4686/6434 [10:59:36<4:10:17,  8.59s/it, gpt_loss=0.425, loss_mean=0.316][A
+Train step of epoch 0:  73%|███████▎  | 4686/6434 [10:59:45<4:10:17,  8.59s/it, gpt_loss=0.347, loss_mean=0.319][A
+Train step of epoch 0:  73%|███████▎  | 4687/6434 [10:59:45<4:07:04,  8.49s/it, gpt_loss=0.347, loss_mean=0.319][A
+Train step of epoch 0:  73%|███████▎  | 4687/6434 [10:59:53<4:07:04,  8.49s/it, gpt_loss=0.323, loss_mean=0.319][A
+Train step of epoch 0:  73%|███████▎  | 4688/6434 [10:59:53<4:08:46,  8.55s/it, gpt_loss=0.323, loss_mean=0.319][A
+Train step of epoch 0:  73%|███████▎  | 4688/6434 [11:00:02<4:08:46,  8.55s/it, gpt_loss=0.312, loss_mean=0.319][A
+Train step of epoch 0:  73%|███████▎  | 4689/6434 [11:00:02<4:07:08,  8.50s/it, gpt_loss=0.312, loss_mean=0.319][A
+[LID Router Debug] Step: 4690
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [1, 3, 3, 4, 5, 5, 5, 6, 1, 5]
+Active Experts in Batch: {1, 3, 4, 5, 6}
+
+Train step of epoch 0:  73%|███████▎  | 4689/6434 [11:00:10<4:07:08,  8.50s/it, gpt_loss=0.23, loss_mean=0.31]  [A
+Train step of epoch 0:  73%|███████▎  | 4690/6434 [11:00:10<4:04:58,  8.43s/it, gpt_loss=0.23, loss_mean=0.31][A
+Train step of epoch 0:  73%|███████▎  | 4690/6434 [11:00:19<4:04:58,  8.43s/it, gpt_loss=0.298, loss_mean=0.309][A
+Train step of epoch 0:  73%|███████▎  | 4691/6434 [11:00:19<4:09:13,  8.58s/it, gpt_loss=0.298, loss_mean=0.309][A
+Train step of epoch 0:  73%|███████▎  | 4691/6434 [11:00:27<4:09:13,  8.58s/it, gpt_loss=0.273, loss_mean=0.305][A
+Train step of epoch 0:  73%|███████▎  | 4692/6434 [11:00:27<4:07:26,  8.52s/it, gpt_loss=0.273, loss_mean=0.305][A
+Train step of epoch 0:  73%|███████▎  | 4692/6434 [11:00:37<4:07:26,  8.52s/it, gpt_loss=0.303, loss_mean=0.305][A
+Train step of epoch 0:  73%|███████▎  | 4693/6434 [11:00:37<4:17:21,  8.87s/it, gpt_loss=0.303, loss_mean=0.305][A
+Train step of epoch 0:  73%|███████▎  | 4693/6434 [11:00:46<4:17:21,  8.87s/it, gpt_loss=0.27, loss_mean=0.301] [A
+Train step of epoch 0:  73%|███████▎  | 4694/6434 [11:00:46<4:15:44,  8.82s/it, gpt_loss=0.27, loss_mean=0.301][A
+Train step of epoch 0:  73%|███████▎  | 4694/6434 [11:00:53<4:15:44,  8.82s/it, gpt_loss=0.259, loss_mean=0.297][A
+Train step of epoch 0:  73%|███████▎  | 4695/6434 [11:00:53<4:06:22,  8.50s/it, gpt_loss=0.259, loss_mean=0.297][A
+Train step of epoch 0:  73%|███████▎  | 4695/6434 [11:01:02<4:06:22,  8.50s/it, gpt_loss=0.288, loss_mean=0.296][A
+Train step of epoch 0:  73%|███████▎  | 4696/6434 [11:01:02<4:10:12,  8.64s/it, gpt_loss=0.288, loss_mean=0.296][A
+Train step of epoch 0:  73%|███████▎  | 4696/6434 [11:01:10<4:10:12,  8.64s/it, gpt_loss=0.354, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4697/6434 [11:01:10<4:01:51,  8.35s/it, gpt_loss=0.354, loss_mean=0.302][A
+Train step of epoch 0:  73%|███████▎  | 4697/6434 [11:01:18<4:01:51,  8.35s/it, gpt_loss=0.228, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4698/6434 [11:01:18<3:58:33,  8.25s/it, gpt_loss=0.228, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4698/6434 [11:01:27<3:58:33,  8.25s/it, gpt_loss=0.231, loss_mean=0.288][A
+Train step of epoch 0:  73%|███████▎  | 4699/6434 [11:01:27<4:05:46,  8.50s/it, gpt_loss=0.231, loss_mean=0.288][A
+[LID Router Debug] Step: 4700
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [9, 0, 9, 8, 1, 2, 0, 4, 9, 6]
+Active Experts in Batch: {0, 1, 2, 4, 6, 8, 9}
+
+Train step of epoch 0:  73%|███████▎  | 4699/6434 [11:01:36<4:05:46,  8.50s/it, gpt_loss=0.331, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4700/6434 [11:01:36<4:04:02,  8.44s/it, gpt_loss=0.331, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4700/6434 [11:01:45<4:04:02,  8.44s/it, gpt_loss=0.319, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4701/6434 [11:01:45<4:11:50,  8.72s/it, gpt_loss=0.319, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4701/6434 [11:01:53<4:11:50,  8.72s/it, gpt_loss=0.345, loss_mean=0.3]  [A
+Train step of epoch 0:  73%|███████▎  | 4702/6434 [11:01:53<4:02:35,  8.40s/it, gpt_loss=0.345, loss_mean=0.3][A
+Train step of epoch 0:  73%|███████▎  | 4702/6434 [11:02:01<4:02:35,  8.40s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4703/6434 [11:02:01<4:00:50,  8.35s/it, gpt_loss=0.248, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4703/6434 [11:02:09<4:00:50,  8.35s/it, gpt_loss=0.26, loss_mean=0.291] [A
+Train step of epoch 0:  73%|███████▎  | 4704/6434 [11:02:09<4:03:40,  8.45s/it, gpt_loss=0.26, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4704/6434 [11:02:18<4:03:40,  8.45s/it, gpt_loss=0.26, loss_mean=0.288][A
+Train step of epoch 0:  73%|███████▎  | 4705/6434 [11:02:18<4:06:53,  8.57s/it, gpt_loss=0.26, loss_mean=0.288][A
+Train step of epoch 0:  73%|███████▎  | 4705/6434 [11:02:27<4:06:53,  8.57s/it, gpt_loss=0.316, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4706/6434 [11:02:27<4:03:42,  8.46s/it, gpt_loss=0.316, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4706/6434 [11:02:34<4:03:42,  8.46s/it, gpt_loss=0.27, loss_mean=0.289] [A
+Train step of epoch 0:  73%|███████▎  | 4707/6434 [11:02:34<3:55:12,  8.17s/it, gpt_loss=0.27, loss_mean=0.289][A
+Train step of epoch 0:  73%|███████▎  | 4707/6434 [11:02:42<3:55:12,  8.17s/it, gpt_loss=0.402, loss_mean=0.3] [A
+Train step of epoch 0:  73%|███████▎  | 4708/6434 [11:02:42<3:50:13,  8.00s/it, gpt_loss=0.402, loss_mean=0.3][A
+Train step of epoch 0:  73%|███████▎  | 4708/6434 [11:02:51<3:50:13,  8.00s/it, gpt_loss=0.228, loss_mean=0.293][A
+Train step of epoch 0:  73%|███████▎  | 4709/6434 [11:02:51<4:02:06,  8.42s/it, gpt_loss=0.228, loss_mean=0.293][A
+[LID Router Debug] Step: 4710
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 1, 3, 4, 9, 9, 4, 6, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  73%|███████▎  | 4709/6434 [11:02:59<4:02:06,  8.42s/it, gpt_loss=0.313, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4710/6434 [11:02:59<4:01:00,  8.39s/it, gpt_loss=0.313, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4710/6434 [11:03:07<4:01:00,  8.39s/it, gpt_loss=0.32, loss_mean=0.297] [A
+Train step of epoch 0:  73%|███████▎  | 4711/6434 [11:03:07<3:57:02,  8.25s/it, gpt_loss=0.32, loss_mean=0.297][A
+Train step of epoch 0:  73%|███████▎  | 4711/6434 [11:03:17<3:57:02,  8.25s/it, gpt_loss=0.307, loss_mean=0.298][A
+Train step of epoch 0:  73%|███████▎  | 4712/6434 [11:03:17<4:06:53,  8.60s/it, gpt_loss=0.307, loss_mean=0.298][A
+Train step of epoch 0:  73%|███████▎  | 4712/6434 [11:03:25<4:06:53,  8.60s/it, gpt_loss=0.239, loss_mean=0.293][A
+Train step of epoch 0:  73%|███████▎  | 4713/6434 [11:03:25<4:03:16,  8.48s/it, gpt_loss=0.239, loss_mean=0.293][A
+Train step of epoch 0:  73%|███████▎  | 4713/6434 [11:03:35<4:03:16,  8.48s/it, gpt_loss=0.284, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4714/6434 [11:03:35<4:16:09,  8.94s/it, gpt_loss=0.284, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4714/6434 [11:03:43<4:16:09,  8.94s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  73%|███████▎  | 4715/6434 [11:03:43<4:11:20,  8.77s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  73%|███████▎  | 4715/6434 [11:03:51<4:11:20,  8.77s/it, gpt_loss=0.27, loss_mean=0.286] [A
+Train step of epoch 0:  73%|███████▎  | 4716/6434 [11:03:51<4:02:53,  8.48s/it, gpt_loss=0.27, loss_mean=0.286][A
+Train step of epoch 0:  73%|███████▎  | 4716/6434 [11:04:00<4:02:53,  8.48s/it, gpt_loss=0.281, loss_mean=0.286][A
+Train step of epoch 0:  73%|███████▎  | 4717/6434 [11:04:00<4:09:58,  8.74s/it, gpt_loss=0.281, loss_mean=0.286][A
+Train step of epoch 0:  73%|███████▎  | 4717/6434 [11:04:10<4:09:58,  8.74s/it, gpt_loss=0.334, loss_mean=0.29] [A
+Train step of epoch 0:  73%|███████▎  | 4718/6434 [11:04:10<4:15:50,  8.95s/it, gpt_loss=0.334, loss_mean=0.29][A
+Train step of epoch 0:  73%|███████▎  | 4718/6434 [11:04:18<4:15:50,  8.95s/it, gpt_loss=0.393, loss_mean=0.301][A
+Train step of epoch 0:  73%|███████▎  | 4719/6434 [11:04:18<4:07:39,  8.66s/it, gpt_loss=0.393, loss_mean=0.301][A
+[LID Router Debug] Step: 4720
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 1, 5, 1, 1, 3, 1, 0, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5}
+
+Train step of epoch 0:  73%|███████▎  | 4719/6434 [11:04:26<4:07:39,  8.66s/it, gpt_loss=0.294, loss_mean=0.3]  [A
+Train step of epoch 0:  73%|███████▎  | 4720/6434 [11:04:26<4:05:33,  8.60s/it, gpt_loss=0.294, loss_mean=0.3][A
+Train step of epoch 0:  73%|███████▎  | 4720/6434 [11:04:34<4:05:33,  8.60s/it, gpt_loss=0.237, loss_mean=0.294][A
+Train step of epoch 0:  73%|███████▎  | 4721/6434 [11:04:34<3:54:34,  8.22s/it, gpt_loss=0.237, loss_mean=0.294][A
+Train step of epoch 0:  73%|███████▎  | 4721/6434 [11:04:41<3:54:34,  8.22s/it, gpt_loss=0.317, loss_mean=0.296][A
+Train step of epoch 0:  73%|███████▎  | 4722/6434 [11:04:41<3:48:48,  8.02s/it, gpt_loss=0.317, loss_mean=0.296][A
+Train step of epoch 0:  73%|███████▎  | 4722/6434 [11:04:50<3:48:48,  8.02s/it, gpt_loss=0.287, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4723/6434 [11:04:50<3:57:32,  8.33s/it, gpt_loss=0.287, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4723/6434 [11:04:58<3:57:32,  8.33s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4724/6434 [11:04:58<3:53:06,  8.18s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4724/6434 [11:05:07<3:53:06,  8.18s/it, gpt_loss=0.297, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4725/6434 [11:05:07<3:57:26,  8.34s/it, gpt_loss=0.297, loss_mean=0.291][A
+Train step of epoch 0:  73%|███████▎  | 4725/6434 [11:05:16<3:57:26,  8.34s/it, gpt_loss=0.318, loss_mean=0.294][A
+Train step of epoch 0:  73%|███████▎  | 4726/6434 [11:05:16<4:06:48,  8.67s/it, gpt_loss=0.318, loss_mean=0.294][A
+Train step of epoch 0:  73%|███████▎  | 4726/6434 [11:05:23<4:06:48,  8.67s/it, gpt_loss=0.302, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4727/6434 [11:05:23<3:54:14,  8.23s/it, gpt_loss=0.302, loss_mean=0.295][A
+Train step of epoch 0:  73%|███████▎  | 4727/6434 [11:05:32<3:54:14,  8.23s/it, gpt_loss=0.272, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4728/6434 [11:05:32<3:53:37,  8.22s/it, gpt_loss=0.272, loss_mean=0.292][A
+Train step of epoch 0:  73%|███████▎  | 4728/6434 [11:05:40<3:53:37,  8.22s/it, gpt_loss=0.291, loss_mean=0.292][A
+Train step of epoch 0:  74%|███████▎  | 4729/6434 [11:05:40<3:53:56,  8.23s/it, gpt_loss=0.291, loss_mean=0.292][A
+[LID Router Debug] Step: 4730
+Batch Size: 10
+Audio Batch Size: 135
+LID Assignments: [4, 2, 2, 6, 2, 4, 3, 2, 2, 2]
+Active Experts in Batch: {2, 3, 4, 6}
+
+Train step of epoch 0:  74%|███████▎  | 4729/6434 [11:05:49<3:53:56,  8.23s/it, gpt_loss=0.317, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▎  | 4730/6434 [11:05:49<3:59:24,  8.43s/it, gpt_loss=0.317, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▎  | 4730/6434 [11:05:57<3:59:24,  8.43s/it, gpt_loss=0.328, loss_mean=0.298][A
+Train step of epoch 0:  74%|███████▎  | 4731/6434 [11:05:57<3:54:55,  8.28s/it, gpt_loss=0.328, loss_mean=0.298][A
+Train step of epoch 0:  74%|███████▎  | 4731/6434 [11:06:05<3:54:55,  8.28s/it, gpt_loss=0.313, loss_mean=0.3]  [A
+Train step of epoch 0:  74%|███████▎  | 4732/6434 [11:06:05<3:51:08,  8.15s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  74%|███████▎  | 4732/6434 [11:06:13<3:51:08,  8.15s/it, gpt_loss=0.325, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▎  | 4733/6434 [11:06:13<3:52:55,  8.22s/it, gpt_loss=0.325, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▎  | 4733/6434 [11:06:22<3:52:55,  8.22s/it, gpt_loss=0.38, loss_mean=0.31]  [A
+Train step of epoch 0:  74%|███████▎  | 4734/6434 [11:06:22<3:56:43,  8.36s/it, gpt_loss=0.38, loss_mean=0.31][A
+Train step of epoch 0:  74%|███████▎  | 4734/6434 [11:06:30<3:56:43,  8.36s/it, gpt_loss=0.287, loss_mean=0.308][A
+Train step of epoch 0:  74%|███████▎  | 4735/6434 [11:06:30<3:56:16,  8.34s/it, gpt_loss=0.287, loss_mean=0.308][A
+Train step of epoch 0:  74%|███████▎  | 4735/6434 [11:06:38<3:56:16,  8.34s/it, gpt_loss=0.278, loss_mean=0.305][A
+Train step of epoch 0:  74%|███████▎  | 4736/6434 [11:06:38<3:55:24,  8.32s/it, gpt_loss=0.278, loss_mean=0.305][A
+Train step of epoch 0:  74%|███████▎  | 4736/6434 [11:06:46<3:55:24,  8.32s/it, gpt_loss=0.28, loss_mean=0.302] [A
+Train step of epoch 0:  74%|███████▎  | 4737/6434 [11:06:46<3:52:53,  8.23s/it, gpt_loss=0.28, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▎  | 4737/6434 [11:06:54<3:52:53,  8.23s/it, gpt_loss=0.317, loss_mean=0.304][A
+Train step of epoch 0:  74%|███████▎  | 4738/6434 [11:06:54<3:53:07,  8.25s/it, gpt_loss=0.317, loss_mean=0.304][A
+Train step of epoch 0:  74%|███████▎  | 4738/6434 [11:07:03<3:53:07,  8.25s/it, gpt_loss=0.225, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▎  | 4739/6434 [11:07:03<3:57:12,  8.40s/it, gpt_loss=0.225, loss_mean=0.296][A
+[LID Router Debug] Step: 4740
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 2, 9, 2, 9, 9, 9, 9, 2, 1]
+Active Experts in Batch: {9, 2, 4, 1}
+
+Train step of epoch 0:  74%|███████▎  | 4739/6434 [11:07:11<3:57:12,  8.40s/it, gpt_loss=0.25, loss_mean=0.291] [A
+Train step of epoch 0:  74%|███████▎  | 4740/6434 [11:07:11<3:55:39,  8.35s/it, gpt_loss=0.25, loss_mean=0.291][A
+Train step of epoch 0:  74%|███████▎  | 4740/6434 [11:07:20<3:55:39,  8.35s/it, gpt_loss=0.307, loss_mean=0.293][A
+Train step of epoch 0:  74%|███████▎  | 4741/6434 [11:07:20<3:56:34,  8.38s/it, gpt_loss=0.307, loss_mean=0.293][A
+Train step of epoch 0:  74%|███████▎  | 4741/6434 [11:07:29<3:56:34,  8.38s/it, gpt_loss=0.329, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▎  | 4742/6434 [11:07:29<4:00:51,  8.54s/it, gpt_loss=0.329, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▎  | 4742/6434 [11:07:37<4:00:51,  8.54s/it, gpt_loss=0.294, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▎  | 4743/6434 [11:07:37<3:59:44,  8.51s/it, gpt_loss=0.294, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▎  | 4743/6434 [11:07:45<3:59:44,  8.51s/it, gpt_loss=0.34, loss_mean=0.301] [A
+Train step of epoch 0:  74%|███████▎  | 4744/6434 [11:07:45<3:53:03,  8.27s/it, gpt_loss=0.34, loss_mean=0.301][A
+Train step of epoch 0:  74%|███████▎  | 4744/6434 [11:07:54<3:53:03,  8.27s/it, gpt_loss=0.313, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▎  | 4745/6434 [11:07:54<4:02:16,  8.61s/it, gpt_loss=0.313, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▎  | 4745/6434 [11:08:02<4:02:16,  8.61s/it, gpt_loss=0.325, loss_mean=0.304][A
+Train step of epoch 0:  74%|███████▍  | 4746/6434 [11:08:02<3:55:35,  8.37s/it, gpt_loss=0.325, loss_mean=0.304][A
+Train step of epoch 0:  74%|███████▍  | 4746/6434 [11:08:11<3:55:35,  8.37s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▍  | 4747/6434 [11:08:11<3:55:38,  8.38s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  74%|███████▍  | 4747/6434 [11:08:19<3:55:38,  8.38s/it, gpt_loss=0.295, loss_mean=0.301][A
+Train step of epoch 0:  74%|███████▍  | 4748/6434 [11:08:19<3:59:21,  8.52s/it, gpt_loss=0.295, loss_mean=0.301][A
+Train step of epoch 0:  74%|███████▍  | 4748/6434 [11:08:27<3:59:21,  8.52s/it, gpt_loss=0.25, loss_mean=0.296] [A
+Train step of epoch 0:  74%|███████▍  | 4749/6434 [11:08:27<3:51:59,  8.26s/it, gpt_loss=0.25, loss_mean=0.296][A
+[LID Router Debug] Step: 4750
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [4, 6, 9, 9, 3, 2, 3, 0, 4, 6]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  74%|███████▍  | 4749/6434 [11:08:36<3:51:59,  8.26s/it, gpt_loss=0.307, loss_mean=0.297][A
+Train step of epoch 0:  74%|███████▍  | 4750/6434 [11:08:36<3:54:49,  8.37s/it, gpt_loss=0.307, loss_mean=0.297][A
+Train step of epoch 0:  74%|███████▍  | 4750/6434 [11:08:44<3:54:49,  8.37s/it, gpt_loss=0.226, loss_mean=0.29] [A
+Train step of epoch 0:  74%|███████▍  | 4751/6434 [11:08:44<3:54:01,  8.34s/it, gpt_loss=0.226, loss_mean=0.29][A
+Train step of epoch 0:  74%|███████▍  | 4751/6434 [11:08:53<3:54:01,  8.34s/it, gpt_loss=0.244, loss_mean=0.285][A
+Train step of epoch 0:  74%|███████▍  | 4752/6434 [11:08:53<3:57:51,  8.48s/it, gpt_loss=0.244, loss_mean=0.285][A
+Train step of epoch 0:  74%|███████▍  | 4752/6434 [11:09:01<3:57:51,  8.48s/it, gpt_loss=0.211, loss_mean=0.278][A
+Train step of epoch 0:  74%|███████▍  | 4753/6434 [11:09:01<3:54:27,  8.37s/it, gpt_loss=0.211, loss_mean=0.278][A
+Train step of epoch 0:  74%|███████▍  | 4753/6434 [11:09:10<3:54:27,  8.37s/it, gpt_loss=0.274, loss_mean=0.277][A
+Train step of epoch 0:  74%|███████▍  | 4754/6434 [11:09:10<3:56:30,  8.45s/it, gpt_loss=0.274, loss_mean=0.277][A
+Train step of epoch 0:  74%|███████▍  | 4754/6434 [11:09:18<3:56:30,  8.45s/it, gpt_loss=0.303, loss_mean=0.28] [A
+Train step of epoch 0:  74%|███████▍  | 4755/6434 [11:09:18<3:52:18,  8.30s/it, gpt_loss=0.303, loss_mean=0.28][A
+Train step of epoch 0:  74%|███████▍  | 4755/6434 [11:09:26<3:52:18,  8.30s/it, gpt_loss=0.275, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4756/6434 [11:09:26<3:50:52,  8.26s/it, gpt_loss=0.275, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4756/6434 [11:09:35<3:50:52,  8.26s/it, gpt_loss=0.279, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4757/6434 [11:09:35<4:00:35,  8.61s/it, gpt_loss=0.279, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4757/6434 [11:09:44<4:00:35,  8.61s/it, gpt_loss=0.247, loss_mean=0.276][A
+Train step of epoch 0:  74%|███████▍  | 4758/6434 [11:09:44<4:01:04,  8.63s/it, gpt_loss=0.247, loss_mean=0.276][A
+Train step of epoch 0:  74%|███████▍  | 4758/6434 [11:09:52<4:01:04,  8.63s/it, gpt_loss=0.247, loss_mean=0.273][A
+Train step of epoch 0:  74%|███████▍  | 4759/6434 [11:09:52<4:01:14,  8.64s/it, gpt_loss=0.247, loss_mean=0.273][A
+[LID Router Debug] Step: 4760
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 3, 9, 1, 9, 9, 4, 4, 5, 4]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  74%|███████▍  | 4759/6434 [11:10:01<4:01:14,  8.64s/it, gpt_loss=0.304, loss_mean=0.276][A
+Train step of epoch 0:  74%|███████▍  | 4760/6434 [11:10:01<4:00:19,  8.61s/it, gpt_loss=0.304, loss_mean=0.276][A
+Train step of epoch 0:  74%|███████▍  | 4760/6434 [11:10:09<4:00:19,  8.61s/it, gpt_loss=0.218, loss_mean=0.27] [A
+Train step of epoch 0:  74%|███████▍  | 4761/6434 [11:10:09<3:56:21,  8.48s/it, gpt_loss=0.218, loss_mean=0.27][A
+Train step of epoch 0:  74%|███████▍  | 4761/6434 [11:10:17<3:56:21,  8.48s/it, gpt_loss=0.359, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4762/6434 [11:10:17<3:46:53,  8.14s/it, gpt_loss=0.359, loss_mean=0.279][A
+Train step of epoch 0:  74%|███████▍  | 4762/6434 [11:10:25<3:46:53,  8.14s/it, gpt_loss=0.261, loss_mean=0.278][A
+Train step of epoch 0:  74%|███████▍  | 4763/6434 [11:10:25<3:48:38,  8.21s/it, gpt_loss=0.261, loss_mean=0.278][A
+Train step of epoch 0:  74%|███████▍  | 4763/6434 [11:10:33<3:48:38,  8.21s/it, gpt_loss=0.36, loss_mean=0.286] [A
+Train step of epoch 0:  74%|███████▍  | 4764/6434 [11:10:33<3:47:42,  8.18s/it, gpt_loss=0.36, loss_mean=0.286][A
+Train step of epoch 0:  74%|███████▍  | 4764/6434 [11:10:41<3:47:42,  8.18s/it, gpt_loss=0.318, loss_mean=0.289][A
+Train step of epoch 0:  74%|███████▍  | 4765/6434 [11:10:41<3:44:02,  8.05s/it, gpt_loss=0.318, loss_mean=0.289][A
+Train step of epoch 0:  74%|███████▍  | 4765/6434 [11:10:49<3:44:02,  8.05s/it, gpt_loss=0.316, loss_mean=0.292][A
+Train step of epoch 0:  74%|███████▍  | 4766/6434 [11:10:49<3:43:24,  8.04s/it, gpt_loss=0.316, loss_mean=0.292][A
+Train step of epoch 0:  74%|███████▍  | 4766/6434 [11:10:58<3:43:24,  8.04s/it, gpt_loss=0.332, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4767/6434 [11:10:58<3:49:20,  8.25s/it, gpt_loss=0.332, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4767/6434 [11:11:07<3:49:20,  8.25s/it, gpt_loss=0.201, loss_mean=0.286][A
+Train step of epoch 0:  74%|███████▍  | 4768/6434 [11:11:07<3:56:02,  8.50s/it, gpt_loss=0.201, loss_mean=0.286][A
+Train step of epoch 0:  74%|███████▍  | 4768/6434 [11:11:15<3:56:02,  8.50s/it, gpt_loss=0.296, loss_mean=0.287][A
+Train step of epoch 0:  74%|███████▍  | 4769/6434 [11:11:15<3:55:57,  8.50s/it, gpt_loss=0.296, loss_mean=0.287][A
+[LID Router Debug] Step: 4770
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [9, 6, 0, 0, 5, 3, 1, 5, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  74%|███████▍  | 4769/6434 [11:11:24<3:55:57,  8.50s/it, gpt_loss=0.304, loss_mean=0.289][A
+Train step of epoch 0:  74%|███████▍  | 4770/6434 [11:11:24<3:58:57,  8.62s/it, gpt_loss=0.304, loss_mean=0.289][A
+Train step of epoch 0:  74%|███████▍  | 4770/6434 [11:11:33<3:58:57,  8.62s/it, gpt_loss=0.302, loss_mean=0.29] [A
+Train step of epoch 0:  74%|███████▍  | 4771/6434 [11:11:33<3:58:07,  8.59s/it, gpt_loss=0.302, loss_mean=0.29][A
+Train step of epoch 0:  74%|███████▍  | 4771/6434 [11:11:40<3:58:07,  8.59s/it, gpt_loss=0.271, loss_mean=0.288][A
+Train step of epoch 0:  74%|███████▍  | 4772/6434 [11:11:40<3:49:59,  8.30s/it, gpt_loss=0.271, loss_mean=0.288][A
+Train step of epoch 0:  74%|███████▍  | 4772/6434 [11:11:48<3:49:59,  8.30s/it, gpt_loss=0.351, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4773/6434 [11:11:48<3:44:08,  8.10s/it, gpt_loss=0.351, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4773/6434 [11:11:56<3:44:08,  8.10s/it, gpt_loss=0.362, loss_mean=0.301][A
+Train step of epoch 0:  74%|███████▍  | 4774/6434 [11:11:56<3:41:37,  8.01s/it, gpt_loss=0.362, loss_mean=0.301][A
+Train step of epoch 0:  74%|███████▍  | 4774/6434 [11:12:05<3:41:37,  8.01s/it, gpt_loss=0.281, loss_mean=0.299][A
+Train step of epoch 0:  74%|███████▍  | 4775/6434 [11:12:05<3:50:18,  8.33s/it, gpt_loss=0.281, loss_mean=0.299][A
+Train step of epoch 0:  74%|███████▍  | 4775/6434 [11:12:15<3:50:18,  8.33s/it, gpt_loss=0.3, loss_mean=0.299]  [A
+Train step of epoch 0:  74%|███████▍  | 4776/6434 [11:12:15<4:03:59,  8.83s/it, gpt_loss=0.3, loss_mean=0.299][A
+Train step of epoch 0:  74%|███████▍  | 4776/6434 [11:12:22<4:03:59,  8.83s/it, gpt_loss=0.244, loss_mean=0.294][A
+Train step of epoch 0:  74%|███████▍  | 4777/6434 [11:12:22<3:54:23,  8.49s/it, gpt_loss=0.244, loss_mean=0.294][A
+Train step of epoch 0:  74%|███████▍  | 4777/6434 [11:12:30<3:54:23,  8.49s/it, gpt_loss=0.271, loss_mean=0.292][A
+Train step of epoch 0:  74%|███████▍  | 4778/6434 [11:12:30<3:44:29,  8.13s/it, gpt_loss=0.271, loss_mean=0.292][A
+Train step of epoch 0:  74%|███████▍  | 4778/6434 [11:12:38<3:44:29,  8.13s/it, gpt_loss=0.275, loss_mean=0.29] [A
+Train step of epoch 0:  74%|███████▍  | 4779/6434 [11:12:38<3:48:10,  8.27s/it, gpt_loss=0.275, loss_mean=0.29][A
+[LID Router Debug] Step: 4780
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [9, 1, 3, 3, 5, 4, 4, 4, 9, 5]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  74%|███████▍  | 4779/6434 [11:12:45<3:48:10,  8.27s/it, gpt_loss=0.341, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4780/6434 [11:12:45<3:37:46,  7.90s/it, gpt_loss=0.341, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4780/6434 [11:12:55<3:37:46,  7.90s/it, gpt_loss=0.208, loss_mean=0.286][A
+Train step of epoch 0:  74%|███████▍  | 4781/6434 [11:12:55<3:48:58,  8.31s/it, gpt_loss=0.208, loss_mean=0.286][A
+Train step of epoch 0:  74%|███████▍  | 4781/6434 [11:13:04<3:48:58,  8.31s/it, gpt_loss=0.241, loss_mean=0.282][A
+Train step of epoch 0:  74%|███████▍  | 4782/6434 [11:13:04<3:55:34,  8.56s/it, gpt_loss=0.241, loss_mean=0.282][A
+Train step of epoch 0:  74%|███████▍  | 4782/6434 [11:13:11<3:55:34,  8.56s/it, gpt_loss=0.303, loss_mean=0.284][A
+Train step of epoch 0:  74%|███████▍  | 4783/6434 [11:13:11<3:48:35,  8.31s/it, gpt_loss=0.303, loss_mean=0.284][A
+Train step of epoch 0:  74%|███████▍  | 4783/6434 [11:13:19<3:48:35,  8.31s/it, gpt_loss=0.404, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4784/6434 [11:13:19<3:45:16,  8.19s/it, gpt_loss=0.404, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4784/6434 [11:13:28<3:45:16,  8.19s/it, gpt_loss=0.218, loss_mean=0.288][A
+Train step of epoch 0:  74%|███████▍  | 4785/6434 [11:13:28<3:53:12,  8.49s/it, gpt_loss=0.218, loss_mean=0.288][A
+Train step of epoch 0:  74%|███████▍  | 4785/6434 [11:13:36<3:53:12,  8.49s/it, gpt_loss=0.302, loss_mean=0.29] [A
+Train step of epoch 0:  74%|███████▍  | 4786/6434 [11:13:36<3:49:11,  8.34s/it, gpt_loss=0.302, loss_mean=0.29][A
+Train step of epoch 0:  74%|███████▍  | 4786/6434 [11:13:44<3:49:11,  8.34s/it, gpt_loss=0.305, loss_mean=0.291][A
+Train step of epoch 0:  74%|███████▍  | 4787/6434 [11:13:44<3:42:21,  8.10s/it, gpt_loss=0.305, loss_mean=0.291][A
+Train step of epoch 0:  74%|███████▍  | 4787/6434 [11:13:53<3:42:21,  8.10s/it, gpt_loss=0.329, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4788/6434 [11:13:53<3:46:48,  8.27s/it, gpt_loss=0.329, loss_mean=0.295][A
+Train step of epoch 0:  74%|███████▍  | 4788/6434 [11:14:01<3:46:48,  8.27s/it, gpt_loss=0.282, loss_mean=0.294][A
+Train step of epoch 0:  74%|███████▍  | 4789/6434 [11:14:01<3:43:29,  8.15s/it, gpt_loss=0.282, loss_mean=0.294][A
+[LID Router Debug] Step: 4790
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [2, 5, 9, 9, 5, 1, 3, 2, 1, 8]
+Active Experts in Batch: {1, 2, 3, 5, 8, 9}
+
+Train step of epoch 0:  74%|███████▍  | 4789/6434 [11:14:09<3:43:29,  8.15s/it, gpt_loss=0.301, loss_mean=0.294][A
+Train step of epoch 0:  74%|███████▍  | 4790/6434 [11:14:09<3:47:19,  8.30s/it, gpt_loss=0.301, loss_mean=0.294][A
+Train step of epoch 0:  74%|███████▍  | 4790/6434 [11:14:17<3:47:19,  8.30s/it, gpt_loss=0.379, loss_mean=0.303][A
+Train step of epoch 0:  74%|███████▍  | 4791/6434 [11:14:17<3:46:00,  8.25s/it, gpt_loss=0.379, loss_mean=0.303][A
+Train step of epoch 0:  74%|███████▍  | 4791/6434 [11:14:25<3:46:00,  8.25s/it, gpt_loss=0.254, loss_mean=0.298][A
+Train step of epoch 0:  74%|███████▍  | 4792/6434 [11:14:25<3:42:51,  8.14s/it, gpt_loss=0.254, loss_mean=0.298][A
+Train step of epoch 0:  74%|███████▍  | 4792/6434 [11:14:33<3:42:51,  8.14s/it, gpt_loss=0.279, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4793/6434 [11:14:33<3:41:57,  8.12s/it, gpt_loss=0.279, loss_mean=0.296][A
+Train step of epoch 0:  74%|███████▍  | 4793/6434 [11:14:42<3:41:57,  8.12s/it, gpt_loss=0.325, loss_mean=0.299][A
+Train step of epoch 0:  75%|███████▍  | 4794/6434 [11:14:42<3:44:09,  8.20s/it, gpt_loss=0.325, loss_mean=0.299][A
+Train step of epoch 0:  75%|███████▍  | 4794/6434 [11:14:50<3:44:09,  8.20s/it, gpt_loss=0.324, loss_mean=0.302][A
+Train step of epoch 0:  75%|███████▍  | 4795/6434 [11:14:50<3:45:19,  8.25s/it, gpt_loss=0.324, loss_mean=0.302][A
+Train step of epoch 0:  75%|███████▍  | 4795/6434 [11:15:00<3:45:19,  8.25s/it, gpt_loss=0.24, loss_mean=0.295] [A
+Train step of epoch 0:  75%|███████▍  | 4796/6434 [11:15:00<3:56:41,  8.67s/it, gpt_loss=0.24, loss_mean=0.295][A
+Train step of epoch 0:  75%|███████▍  | 4796/6434 [11:15:09<3:56:41,  8.67s/it, gpt_loss=0.311, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4797/6434 [11:15:09<4:02:34,  8.89s/it, gpt_loss=0.311, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4797/6434 [11:15:17<4:02:34,  8.89s/it, gpt_loss=0.341, loss_mean=0.301][A
+Train step of epoch 0:  75%|███████▍  | 4798/6434 [11:15:17<3:55:57,  8.65s/it, gpt_loss=0.341, loss_mean=0.301][A
+Train step of epoch 0:  75%|███████▍  | 4798/6434 [11:15:25<3:55:57,  8.65s/it, gpt_loss=0.393, loss_mean=0.31] [A
+Train step of epoch 0:  75%|███████▍  | 4799/6434 [11:15:25<3:47:22,  8.34s/it, gpt_loss=0.393, loss_mean=0.31][A
+[LID Router Debug] Step: 4800
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [5, 2, 9, 3, 3, 9, 3, 9, 5, 6]
+Active Experts in Batch: {2, 3, 5, 6, 9}
+[2026-02-07 03:11:37,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=0, lr=[1.7267046940593047e-05, 1.7267046940593047e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 03:11:37,652] [INFO] [timer.py:260:stop] epoch=0/micro_step=4800/global_step=2400, RunningAvgSamplesPerSec=4.746958809850261, CurrSamplesPerSec=5.016687289928737, MemAllocated=12.74GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  75%|███████▍  | 4799/6434 [11:15:33<3:47:22,  8.34s/it, gpt_loss=0.293, loss_mean=0.309][A
+Train step of epoch 0:  75%|███████▍  | 4800/6434 [11:15:33<3:47:19,  8.35s/it, gpt_loss=0.293, loss_mean=0.309][A
+Train step of epoch 0:  75%|███████▍  | 4800/6434 [11:15:41<3:47:19,  8.35s/it, gpt_loss=0.271, loss_mean=0.305][A
+Train step of epoch 0:  75%|███████▍  | 4801/6434 [11:15:41<3:44:11,  8.24s/it, gpt_loss=0.271, loss_mean=0.305][A
+Train step of epoch 0:  75%|███████▍  | 4801/6434 [11:15:49<3:44:11,  8.24s/it, gpt_loss=0.256, loss_mean=0.3]  [A
+Train step of epoch 0:  75%|███████▍  | 4802/6434 [11:15:49<3:41:21,  8.14s/it, gpt_loss=0.256, loss_mean=0.3][A
+Train step of epoch 0:  75%|███████▍  | 4802/6434 [11:15:57<3:41:21,  8.14s/it, gpt_loss=0.262, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4803/6434 [11:15:57<3:42:44,  8.19s/it, gpt_loss=0.262, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4803/6434 [11:16:06<3:42:44,  8.19s/it, gpt_loss=0.257, loss_mean=0.292][A
+Train step of epoch 0:  75%|███████▍  | 4804/6434 [11:16:06<3:47:09,  8.36s/it, gpt_loss=0.257, loss_mean=0.292][A
+Train step of epoch 0:  75%|███████▍  | 4804/6434 [11:16:15<3:47:09,  8.36s/it, gpt_loss=0.325, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4805/6434 [11:16:15<3:47:53,  8.39s/it, gpt_loss=0.325, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4805/6434 [11:16:23<3:47:53,  8.39s/it, gpt_loss=0.358, loss_mean=0.302][A
+Train step of epoch 0:  75%|███████▍  | 4806/6434 [11:16:23<3:50:53,  8.51s/it, gpt_loss=0.358, loss_mean=0.302][A
+Train step of epoch 0:  75%|███████▍  | 4806/6434 [11:16:32<3:50:53,  8.51s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4807/6434 [11:16:32<3:54:37,  8.65s/it, gpt_loss=0.258, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4807/6434 [11:16:41<3:54:37,  8.65s/it, gpt_loss=0.289, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4808/6434 [11:16:41<3:51:19,  8.54s/it, gpt_loss=0.289, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4808/6434 [11:16:49<3:51:19,  8.54s/it, gpt_loss=0.244, loss_mean=0.291][A
+Train step of epoch 0:  75%|███████▍  | 4809/6434 [11:16:49<3:45:41,  8.33s/it, gpt_loss=0.244, loss_mean=0.291][A
+[LID Router Debug] Step: 4810
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [1, 2, 5, 1, 2, 6, 1, 3, 9, 9]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  75%|███████▍  | 4809/6434 [11:16:57<3:45:41,  8.33s/it, gpt_loss=0.307, loss_mean=0.293][A
+Train step of epoch 0:  75%|███████▍  | 4810/6434 [11:16:57<3:46:37,  8.37s/it, gpt_loss=0.307, loss_mean=0.293][A
+Train step of epoch 0:  75%|███████▍  | 4810/6434 [11:17:06<3:46:37,  8.37s/it, gpt_loss=0.326, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4811/6434 [11:17:06<3:49:53,  8.50s/it, gpt_loss=0.326, loss_mean=0.296][A
+Train step of epoch 0:  75%|███████▍  | 4811/6434 [11:17:13<3:49:53,  8.50s/it, gpt_loss=0.304, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4812/6434 [11:17:13<3:42:56,  8.25s/it, gpt_loss=0.304, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4812/6434 [11:17:23<3:42:56,  8.25s/it, gpt_loss=0.338, loss_mean=0.301][A
+Train step of epoch 0:  75%|███████▍  | 4813/6434 [11:17:23<3:49:59,  8.51s/it, gpt_loss=0.338, loss_mean=0.301][A
+Train step of epoch 0:  75%|███████▍  | 4813/6434 [11:17:30<3:49:59,  8.51s/it, gpt_loss=0.293, loss_mean=0.3]  [A
+Train step of epoch 0:  75%|███████▍  | 4814/6434 [11:17:30<3:44:20,  8.31s/it, gpt_loss=0.293, loss_mean=0.3][A
+Train step of epoch 0:  75%|███████▍  | 4814/6434 [11:17:40<3:44:20,  8.31s/it, gpt_loss=0.279, loss_mean=0.298][A
+Train step of epoch 0:  75%|███████▍  | 4815/6434 [11:17:40<3:52:54,  8.63s/it, gpt_loss=0.279, loss_mean=0.298][A
+Train step of epoch 0:  75%|███████▍  | 4815/6434 [11:17:49<3:52:54,  8.63s/it, gpt_loss=0.285, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4816/6434 [11:17:49<3:58:54,  8.86s/it, gpt_loss=0.285, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4816/6434 [11:17:58<3:58:54,  8.86s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4817/6434 [11:17:58<3:57:28,  8.81s/it, gpt_loss=0.296, loss_mean=0.297][A
+Train step of epoch 0:  75%|███████▍  | 4817/6434 [11:18:07<3:57:28,  8.81s/it, gpt_loss=0.222, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▍  | 4818/6434 [11:18:07<3:56:00,  8.76s/it, gpt_loss=0.222, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▍  | 4818/6434 [11:18:15<3:56:00,  8.76s/it, gpt_loss=0.261, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▍  | 4819/6434 [11:18:15<3:54:23,  8.71s/it, gpt_loss=0.261, loss_mean=0.286][A
+[LID Router Debug] Step: 4820
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [5, 1, 1, 1, 1, 3, 9, 9, 1, 3]
+Active Experts in Batch: {1, 3, 5, 9}
+
+Train step of epoch 0:  75%|███████▍  | 4819/6434 [11:18:23<3:54:23,  8.71s/it, gpt_loss=0.288, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▍  | 4820/6434 [11:18:23<3:50:23,  8.56s/it, gpt_loss=0.288, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▍  | 4820/6434 [11:18:32<3:50:23,  8.56s/it, gpt_loss=0.28, loss_mean=0.286] [A
+Train step of epoch 0:  75%|███████▍  | 4821/6434 [11:18:32<3:47:42,  8.47s/it, gpt_loss=0.28, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▍  | 4821/6434 [11:18:40<3:47:42,  8.47s/it, gpt_loss=0.249, loss_mean=0.282][A
+Train step of epoch 0:  75%|███████▍  | 4822/6434 [11:18:40<3:45:38,  8.40s/it, gpt_loss=0.249, loss_mean=0.282][A
+Train step of epoch 0:  75%|███████▍  | 4822/6434 [11:18:49<3:45:38,  8.40s/it, gpt_loss=0.279, loss_mean=0.282][A
+Train step of epoch 0:  75%|███████▍  | 4823/6434 [11:18:49<3:51:38,  8.63s/it, gpt_loss=0.279, loss_mean=0.282][A
+Train step of epoch 0:  75%|███████▍  | 4823/6434 [11:18:57<3:51:38,  8.63s/it, gpt_loss=0.33, loss_mean=0.287] [A
+Train step of epoch 0:  75%|███████▍  | 4824/6434 [11:18:57<3:44:55,  8.38s/it, gpt_loss=0.33, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▍  | 4824/6434 [11:19:04<3:44:55,  8.38s/it, gpt_loss=0.255, loss_mean=0.284][A
+Train step of epoch 0:  75%|███████▍  | 4825/6434 [11:19:04<3:38:47,  8.16s/it, gpt_loss=0.255, loss_mean=0.284][A
+Train step of epoch 0:  75%|███████▍  | 4825/6434 [11:19:13<3:38:47,  8.16s/it, gpt_loss=0.295, loss_mean=0.285][A
+Train step of epoch 0:  75%|███████▌  | 4826/6434 [11:19:13<3:40:29,  8.23s/it, gpt_loss=0.295, loss_mean=0.285][A
+Train step of epoch 0:  75%|███████▌  | 4826/6434 [11:19:21<3:40:29,  8.23s/it, gpt_loss=0.313, loss_mean=0.288][A
+Train step of epoch 0:  75%|███████▌  | 4827/6434 [11:19:21<3:41:47,  8.28s/it, gpt_loss=0.313, loss_mean=0.288][A
+Train step of epoch 0:  75%|███████▌  | 4827/6434 [11:19:30<3:41:47,  8.28s/it, gpt_loss=0.237, loss_mean=0.283][A
+Train step of epoch 0:  75%|███████▌  | 4828/6434 [11:19:30<3:47:51,  8.51s/it, gpt_loss=0.237, loss_mean=0.283][A
+Train step of epoch 0:  75%|███████▌  | 4828/6434 [11:19:38<3:47:51,  8.51s/it, gpt_loss=0.346, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▌  | 4829/6434 [11:19:38<3:44:05,  8.38s/it, gpt_loss=0.346, loss_mean=0.289][A
+[LID Router Debug] Step: 4830
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [2, 5, 2, 3, 3, 5, 6, 0, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  75%|███████▌  | 4829/6434 [11:19:46<3:44:05,  8.38s/it, gpt_loss=0.302, loss_mean=0.29] [A
+Train step of epoch 0:  75%|███████▌  | 4830/6434 [11:19:46<3:38:52,  8.19s/it, gpt_loss=0.302, loss_mean=0.29][A
+Train step of epoch 0:  75%|███████▌  | 4830/6434 [11:19:57<3:38:52,  8.19s/it, gpt_loss=0.322, loss_mean=0.293][A
+Train step of epoch 0:  75%|███████▌  | 4831/6434 [11:19:57<3:57:16,  8.88s/it, gpt_loss=0.322, loss_mean=0.293][A
+Train step of epoch 0:  75%|███████▌  | 4831/6434 [11:20:06<3:57:16,  8.88s/it, gpt_loss=0.253, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▌  | 4832/6434 [11:20:06<3:58:36,  8.94s/it, gpt_loss=0.253, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▌  | 4832/6434 [11:20:14<3:58:36,  8.94s/it, gpt_loss=0.238, loss_mean=0.284][A
+Train step of epoch 0:  75%|███████▌  | 4833/6434 [11:20:14<3:54:42,  8.80s/it, gpt_loss=0.238, loss_mean=0.284][A
+Train step of epoch 0:  75%|███████▌  | 4833/6434 [11:20:22<3:54:42,  8.80s/it, gpt_loss=0.218, loss_mean=0.278][A
+Train step of epoch 0:  75%|███████▌  | 4834/6434 [11:20:22<3:49:48,  8.62s/it, gpt_loss=0.218, loss_mean=0.278][A
+Train step of epoch 0:  75%|███████▌  | 4834/6434 [11:20:31<3:49:48,  8.62s/it, gpt_loss=0.25, loss_mean=0.275] [A
+Train step of epoch 0:  75%|███████▌  | 4835/6434 [11:20:31<3:53:59,  8.78s/it, gpt_loss=0.25, loss_mean=0.275][A
+Train step of epoch 0:  75%|███████▌  | 4835/6434 [11:20:41<3:53:59,  8.78s/it, gpt_loss=0.291, loss_mean=0.277][A
+Train step of epoch 0:  75%|███████▌  | 4836/6434 [11:20:41<3:57:22,  8.91s/it, gpt_loss=0.291, loss_mean=0.277][A
+Train step of epoch 0:  75%|███████▌  | 4836/6434 [11:20:48<3:57:22,  8.91s/it, gpt_loss=0.308, loss_mean=0.28] [A
+Train step of epoch 0:  75%|███████▌  | 4837/6434 [11:20:48<3:43:48,  8.41s/it, gpt_loss=0.308, loss_mean=0.28][A
+Train step of epoch 0:  75%|███████▌  | 4837/6434 [11:20:56<3:43:48,  8.41s/it, gpt_loss=0.342, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4838/6434 [11:20:56<3:41:50,  8.34s/it, gpt_loss=0.342, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4838/6434 [11:21:04<3:41:50,  8.34s/it, gpt_loss=0.292, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4839/6434 [11:21:04<3:40:09,  8.28s/it, gpt_loss=0.292, loss_mean=0.287][A
+[LID Router Debug] Step: 4840
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [2, 3, 2, 9, 0, 2, 5, 0, 9, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  75%|███████▌  | 4839/6434 [11:21:12<3:40:09,  8.28s/it, gpt_loss=0.274, loss_mean=0.285][A
+Train step of epoch 0:  75%|███████▌  | 4840/6434 [11:21:12<3:39:15,  8.25s/it, gpt_loss=0.274, loss_mean=0.285][A
+Train step of epoch 0:  75%|███████▌  | 4840/6434 [11:21:21<3:39:15,  8.25s/it, gpt_loss=0.292, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4841/6434 [11:21:21<3:41:52,  8.36s/it, gpt_loss=0.292, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4841/6434 [11:21:29<3:41:52,  8.36s/it, gpt_loss=0.303, loss_mean=0.288][A
+Train step of epoch 0:  75%|███████▌  | 4842/6434 [11:21:29<3:39:16,  8.26s/it, gpt_loss=0.303, loss_mean=0.288][A
+Train step of epoch 0:  75%|███████▌  | 4842/6434 [11:21:39<3:39:16,  8.26s/it, gpt_loss=0.273, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4843/6434 [11:21:39<3:48:23,  8.61s/it, gpt_loss=0.273, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4843/6434 [11:21:47<3:48:23,  8.61s/it, gpt_loss=0.295, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4844/6434 [11:21:47<3:48:42,  8.63s/it, gpt_loss=0.295, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4844/6434 [11:21:55<3:48:42,  8.63s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4845/6434 [11:21:55<3:43:55,  8.46s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4845/6434 [11:22:03<3:43:55,  8.46s/it, gpt_loss=0.205, loss_mean=0.279][A
+Train step of epoch 0:  75%|███████▌  | 4846/6434 [11:22:03<3:41:57,  8.39s/it, gpt_loss=0.205, loss_mean=0.279][A
+Train step of epoch 0:  75%|███████▌  | 4846/6434 [11:22:12<3:41:57,  8.39s/it, gpt_loss=0.263, loss_mean=0.277][A
+Train step of epoch 0:  75%|███████▌  | 4847/6434 [11:22:12<3:42:16,  8.40s/it, gpt_loss=0.263, loss_mean=0.277][A
+Train step of epoch 0:  75%|███████▌  | 4847/6434 [11:22:21<3:42:16,  8.40s/it, gpt_loss=0.264, loss_mean=0.276][A
+Train step of epoch 0:  75%|███████▌  | 4848/6434 [11:22:21<3:46:22,  8.56s/it, gpt_loss=0.264, loss_mean=0.276][A
+Train step of epoch 0:  75%|███████▌  | 4848/6434 [11:22:30<3:46:22,  8.56s/it, gpt_loss=0.364, loss_mean=0.285][A
+Train step of epoch 0:  75%|███████▌  | 4849/6434 [11:22:30<3:49:12,  8.68s/it, gpt_loss=0.364, loss_mean=0.285][A
+[LID Router Debug] Step: 4850
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [9, 3, 2, 5, 1, 6, 1, 4, 0, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  75%|███████▌  | 4849/6434 [11:22:38<3:49:12,  8.68s/it, gpt_loss=0.305, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4850/6434 [11:22:38<3:44:26,  8.50s/it, gpt_loss=0.305, loss_mean=0.287][A
+Train step of epoch 0:  75%|███████▌  | 4850/6434 [11:22:46<3:44:26,  8.50s/it, gpt_loss=0.316, loss_mean=0.29] [A
+Train step of epoch 0:  75%|███████▌  | 4851/6434 [11:22:46<3:39:03,  8.30s/it, gpt_loss=0.316, loss_mean=0.29][A
+Train step of epoch 0:  75%|███████▌  | 4851/6434 [11:22:52<3:39:03,  8.30s/it, gpt_loss=0.25, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4852/6434 [11:22:52<3:25:48,  7.81s/it, gpt_loss=0.25, loss_mean=0.286][A
+Train step of epoch 0:  75%|███████▌  | 4852/6434 [11:23:01<3:25:48,  7.81s/it, gpt_loss=0.328, loss_mean=0.29][A
+Train step of epoch 0:  75%|███████▌  | 4853/6434 [11:23:01<3:29:49,  7.96s/it, gpt_loss=0.328, loss_mean=0.29][A
+Train step of epoch 0:  75%|███████▌  | 4853/6434 [11:23:09<3:29:49,  7.96s/it, gpt_loss=0.296, loss_mean=0.291][A
+Train step of epoch 0:  75%|███████▌  | 4854/6434 [11:23:09<3:34:55,  8.16s/it, gpt_loss=0.296, loss_mean=0.291][A
+Train step of epoch 0:  75%|███████▌  | 4854/6434 [11:23:18<3:34:55,  8.16s/it, gpt_loss=0.278, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▌  | 4855/6434 [11:23:18<3:35:52,  8.20s/it, gpt_loss=0.278, loss_mean=0.289][A
+Train step of epoch 0:  75%|███████▌  | 4855/6434 [11:23:26<3:35:52,  8.20s/it, gpt_loss=0.312, loss_mean=0.291][A
+Train step of epoch 0:  75%|███████▌  | 4856/6434 [11:23:26<3:34:17,  8.15s/it, gpt_loss=0.312, loss_mean=0.291][A
+Train step of epoch 0:  75%|███████▌  | 4856/6434 [11:23:33<3:34:17,  8.15s/it, gpt_loss=0.302, loss_mean=0.292][A
+Train step of epoch 0:  75%|███████▌  | 4857/6434 [11:23:33<3:30:05,  7.99s/it, gpt_loss=0.302, loss_mean=0.292][A
+Train step of epoch 0:  75%|███████▌  | 4857/6434 [11:23:42<3:30:05,  7.99s/it, gpt_loss=0.387, loss_mean=0.302][A
+Train step of epoch 0:  76%|███████▌  | 4858/6434 [11:23:42<3:34:37,  8.17s/it, gpt_loss=0.387, loss_mean=0.302][A
+Train step of epoch 0:  76%|███████▌  | 4858/6434 [11:23:50<3:34:37,  8.17s/it, gpt_loss=0.231, loss_mean=0.295][A
+Train step of epoch 0:  76%|███████▌  | 4859/6434 [11:23:50<3:38:06,  8.31s/it, gpt_loss=0.231, loss_mean=0.295][A
+[LID Router Debug] Step: 4860
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 1, 1, 6, 6, 9, 5, 4, 2, 9]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  76%|███████▌  | 4859/6434 [11:23:59<3:38:06,  8.31s/it, gpt_loss=0.269, loss_mean=0.292][A
+Train step of epoch 0:  76%|███████▌  | 4860/6434 [11:23:59<3:39:19,  8.36s/it, gpt_loss=0.269, loss_mean=0.292][A
+Train step of epoch 0:  76%|███████▌  | 4860/6434 [11:24:07<3:39:19,  8.36s/it, gpt_loss=0.254, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▌  | 4861/6434 [11:24:07<3:39:37,  8.38s/it, gpt_loss=0.254, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▌  | 4861/6434 [11:24:16<3:39:37,  8.38s/it, gpt_loss=0.332, loss_mean=0.293][A
+Train step of epoch 0:  76%|███████▌  | 4862/6434 [11:24:16<3:41:11,  8.44s/it, gpt_loss=0.332, loss_mean=0.293][A
+Train step of epoch 0:  76%|███████▌  | 4862/6434 [11:24:24<3:41:11,  8.44s/it, gpt_loss=0.29, loss_mean=0.292] [A
+Train step of epoch 0:  76%|███████▌  | 4863/6434 [11:24:24<3:33:59,  8.17s/it, gpt_loss=0.29, loss_mean=0.292][A
+Train step of epoch 0:  76%|███████▌  | 4863/6434 [11:24:32<3:33:59,  8.17s/it, gpt_loss=0.354, loss_mean=0.299][A
+Train step of epoch 0:  76%|███████▌  | 4864/6434 [11:24:32<3:37:22,  8.31s/it, gpt_loss=0.354, loss_mean=0.299][A
+Train step of epoch 0:  76%|███████▌  | 4864/6434 [11:24:41<3:37:22,  8.31s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  76%|███████▌  | 4865/6434 [11:24:41<3:39:57,  8.41s/it, gpt_loss=0.271, loss_mean=0.296][A
+Train step of epoch 0:  76%|███████▌  | 4865/6434 [11:24:48<3:39:57,  8.41s/it, gpt_loss=0.234, loss_mean=0.29] [A
+Train step of epoch 0:  76%|███████▌  | 4866/6434 [11:24:48<3:28:57,  8.00s/it, gpt_loss=0.234, loss_mean=0.29][A
+Train step of epoch 0:  76%|███████▌  | 4866/6434 [11:24:56<3:28:57,  8.00s/it, gpt_loss=0.287, loss_mean=0.289][A
+Train step of epoch 0:  76%|███████▌  | 4867/6434 [11:24:56<3:30:56,  8.08s/it, gpt_loss=0.287, loss_mean=0.289][A
+Train step of epoch 0:  76%|███████▌  | 4867/6434 [11:25:05<3:30:56,  8.08s/it, gpt_loss=0.295, loss_mean=0.29] [A
+Train step of epoch 0:  76%|███████▌  | 4868/6434 [11:25:05<3:36:32,  8.30s/it, gpt_loss=0.295, loss_mean=0.29][A
+Train step of epoch 0:  76%|███████▌  | 4868/6434 [11:25:12<3:36:32,  8.30s/it, gpt_loss=0.252, loss_mean=0.286][A
+Train step of epoch 0:  76%|███████▌  | 4869/6434 [11:25:12<3:27:43,  7.96s/it, gpt_loss=0.252, loss_mean=0.286][A
+[LID Router Debug] Step: 4870
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 5, 1, 4, 1, 5, 0, 0, 0, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+Train step of epoch 0:  76%|███████▌  | 4869/6434 [11:25:20<3:27:43,  7.96s/it, gpt_loss=0.223, loss_mean=0.28] [A
+Train step of epoch 0:  76%|███████▌  | 4870/6434 [11:25:20<3:30:32,  8.08s/it, gpt_loss=0.223, loss_mean=0.28][A
+Train step of epoch 0:  76%|███████▌  | 4870/6434 [11:25:30<3:30:32,  8.08s/it, gpt_loss=0.315, loss_mean=0.283][A
+Train step of epoch 0:  76%|███████▌  | 4871/6434 [11:25:30<3:40:16,  8.46s/it, gpt_loss=0.315, loss_mean=0.283][A
+Train step of epoch 0:  76%|███████▌  | 4871/6434 [11:25:38<3:40:16,  8.46s/it, gpt_loss=0.347, loss_mean=0.29] [A
+Train step of epoch 0:  76%|███████▌  | 4872/6434 [11:25:38<3:40:12,  8.46s/it, gpt_loss=0.347, loss_mean=0.29][A
+Train step of epoch 0:  76%|███████▌  | 4872/6434 [11:25:47<3:40:12,  8.46s/it, gpt_loss=0.275, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▌  | 4873/6434 [11:25:47<3:43:20,  8.58s/it, gpt_loss=0.275, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▌  | 4873/6434 [11:25:54<3:43:20,  8.58s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▌  | 4874/6434 [11:25:54<3:33:08,  8.20s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▌  | 4874/6434 [11:26:03<3:33:08,  8.20s/it, gpt_loss=0.276, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▌  | 4875/6434 [11:26:03<3:33:25,  8.21s/it, gpt_loss=0.276, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▌  | 4875/6434 [11:26:10<3:33:25,  8.21s/it, gpt_loss=0.314, loss_mean=0.287][A
+Train step of epoch 0:  76%|███████▌  | 4876/6434 [11:26:10<3:29:08,  8.05s/it, gpt_loss=0.314, loss_mean=0.287][A
+Train step of epoch 0:  76%|███████▌  | 4876/6434 [11:26:18<3:29:08,  8.05s/it, gpt_loss=0.327, loss_mean=0.291][A
+Train step of epoch 0:  76%|███████▌  | 4877/6434 [11:26:18<3:22:28,  7.80s/it, gpt_loss=0.327, loss_mean=0.291][A
+Train step of epoch 0:  76%|███████▌  | 4877/6434 [11:26:24<3:22:28,  7.80s/it, gpt_loss=0.456, loss_mean=0.307][A
+Train step of epoch 0:  76%|███████▌  | 4878/6434 [11:26:24<3:14:59,  7.52s/it, gpt_loss=0.456, loss_mean=0.307][A
+Train step of epoch 0:  76%|███████▌  | 4878/6434 [11:26:32<3:14:59,  7.52s/it, gpt_loss=0.451, loss_mean=0.322][A
+Train step of epoch 0:  76%|███████▌  | 4879/6434 [11:26:32<3:16:05,  7.57s/it, gpt_loss=0.451, loss_mean=0.322][A
+[LID Router Debug] Step: 4880
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [9, 9, 0, 5, 9, 4, 5, 3, 9, 9]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+Train step of epoch 0:  76%|███████▌  | 4879/6434 [11:26:41<3:16:05,  7.57s/it, gpt_loss=0.306, loss_mean=0.32] [A
+Train step of epoch 0:  76%|███████▌  | 4880/6434 [11:26:41<3:28:23,  8.05s/it, gpt_loss=0.306, loss_mean=0.32][A
+Train step of epoch 0:  76%|███████▌  | 4880/6434 [11:26:50<3:28:23,  8.05s/it, gpt_loss=0.346, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4881/6434 [11:26:50<3:36:45,  8.37s/it, gpt_loss=0.346, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4881/6434 [11:26:58<3:36:45,  8.37s/it, gpt_loss=0.243, loss_mean=0.315][A
+Train step of epoch 0:  76%|███████▌  | 4882/6434 [11:26:58<3:31:54,  8.19s/it, gpt_loss=0.243, loss_mean=0.315][A
+Train step of epoch 0:  76%|███████▌  | 4882/6434 [11:27:08<3:31:54,  8.19s/it, gpt_loss=0.32, loss_mean=0.315] [A
+Train step of epoch 0:  76%|███████▌  | 4883/6434 [11:27:08<3:40:54,  8.55s/it, gpt_loss=0.32, loss_mean=0.315][A
+Train step of epoch 0:  76%|███████▌  | 4883/6434 [11:27:15<3:40:54,  8.55s/it, gpt_loss=0.267, loss_mean=0.311][A
+Train step of epoch 0:  76%|███████▌  | 4884/6434 [11:27:15<3:32:37,  8.23s/it, gpt_loss=0.267, loss_mean=0.311][A
+Train step of epoch 0:  76%|███████▌  | 4884/6434 [11:27:23<3:32:37,  8.23s/it, gpt_loss=0.396, loss_mean=0.319][A
+Train step of epoch 0:  76%|███████▌  | 4885/6434 [11:27:23<3:26:55,  8.01s/it, gpt_loss=0.396, loss_mean=0.319][A
+Train step of epoch 0:  76%|███████▌  | 4885/6434 [11:27:30<3:26:55,  8.01s/it, gpt_loss=0.314, loss_mean=0.319][A
+Train step of epoch 0:  76%|███████▌  | 4886/6434 [11:27:30<3:21:46,  7.82s/it, gpt_loss=0.314, loss_mean=0.319][A
+Train step of epoch 0:  76%|███████▌  | 4886/6434 [11:27:38<3:21:46,  7.82s/it, gpt_loss=0.334, loss_mean=0.32] [A
+Train step of epoch 0:  76%|███████▌  | 4887/6434 [11:27:38<3:25:43,  7.98s/it, gpt_loss=0.334, loss_mean=0.32][A
+Train step of epoch 0:  76%|███████▌  | 4887/6434 [11:27:46<3:25:43,  7.98s/it, gpt_loss=0.387, loss_mean=0.327][A
+Train step of epoch 0:  76%|███████▌  | 4888/6434 [11:27:46<3:26:35,  8.02s/it, gpt_loss=0.387, loss_mean=0.327][A
+Train step of epoch 0:  76%|███████▌  | 4888/6434 [11:27:55<3:26:35,  8.02s/it, gpt_loss=0.371, loss_mean=0.331][A
+Train step of epoch 0:  76%|███████▌  | 4889/6434 [11:27:55<3:31:08,  8.20s/it, gpt_loss=0.371, loss_mean=0.331][A
+[LID Router Debug] Step: 4890
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [0, 3, 7, 2, 0, 0, 5, 4, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 7}
+
+Train step of epoch 0:  76%|███████▌  | 4889/6434 [11:28:03<3:31:08,  8.20s/it, gpt_loss=0.289, loss_mean=0.327][A
+Train step of epoch 0:  76%|███████▌  | 4890/6434 [11:28:03<3:28:45,  8.11s/it, gpt_loss=0.289, loss_mean=0.327][A
+Train step of epoch 0:  76%|███████▌  | 4890/6434 [11:28:13<3:28:45,  8.11s/it, gpt_loss=0.286, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4891/6434 [11:28:13<3:43:24,  8.69s/it, gpt_loss=0.286, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4891/6434 [11:28:20<3:43:24,  8.69s/it, gpt_loss=0.374, loss_mean=0.328][A
+Train step of epoch 0:  76%|███████▌  | 4892/6434 [11:28:20<3:33:36,  8.31s/it, gpt_loss=0.374, loss_mean=0.328][A
+Train step of epoch 0:  76%|███████▌  | 4892/6434 [11:28:30<3:33:36,  8.31s/it, gpt_loss=0.346, loss_mean=0.33] [A
+Train step of epoch 0:  76%|███████▌  | 4893/6434 [11:28:30<3:40:42,  8.59s/it, gpt_loss=0.346, loss_mean=0.33][A
+Train step of epoch 0:  76%|███████▌  | 4893/6434 [11:28:38<3:40:42,  8.59s/it, gpt_loss=0.439, loss_mean=0.341][A
+Train step of epoch 0:  76%|███████▌  | 4894/6434 [11:28:38<3:37:15,  8.46s/it, gpt_loss=0.439, loss_mean=0.341][A
+Train step of epoch 0:  76%|███████▌  | 4894/6434 [11:28:45<3:37:15,  8.46s/it, gpt_loss=0.287, loss_mean=0.335][A
+Train step of epoch 0:  76%|███████▌  | 4895/6434 [11:28:45<3:29:48,  8.18s/it, gpt_loss=0.287, loss_mean=0.335][A
+Train step of epoch 0:  76%|███████▌  | 4895/6434 [11:28:55<3:29:48,  8.18s/it, gpt_loss=0.266, loss_mean=0.328][A
+Train step of epoch 0:  76%|███████▌  | 4896/6434 [11:28:55<3:44:47,  8.77s/it, gpt_loss=0.266, loss_mean=0.328][A
+Train step of epoch 0:  76%|███████▌  | 4896/6434 [11:29:03<3:44:47,  8.77s/it, gpt_loss=0.275, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4897/6434 [11:29:03<3:37:29,  8.49s/it, gpt_loss=0.275, loss_mean=0.323][A
+Train step of epoch 0:  76%|███████▌  | 4897/6434 [11:29:11<3:37:29,  8.49s/it, gpt_loss=0.298, loss_mean=0.321][A
+Train step of epoch 0:  76%|███████▌  | 4898/6434 [11:29:11<3:31:10,  8.25s/it, gpt_loss=0.298, loss_mean=0.321][A
+Train step of epoch 0:  76%|███████▌  | 4898/6434 [11:29:20<3:31:10,  8.25s/it, gpt_loss=0.285, loss_mean=0.317][A
+Train step of epoch 0:  76%|███████▌  | 4899/6434 [11:29:20<3:39:36,  8.58s/it, gpt_loss=0.285, loss_mean=0.317][A
+[LID Router Debug] Step: 4900
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 1, 2, 9, 2, 4, 9, 4, 9, 2]
+Active Experts in Batch: {1, 2, 4, 9}
+
+Train step of epoch 0:  76%|███████▌  | 4899/6434 [11:29:29<3:39:36,  8.58s/it, gpt_loss=0.29, loss_mean=0.314] [A
+Train step of epoch 0:  76%|███████▌  | 4900/6434 [11:29:29<3:43:38,  8.75s/it, gpt_loss=0.29, loss_mean=0.314][A
+Train step of epoch 0:  76%|███████▌  | 4900/6434 [11:29:38<3:43:38,  8.75s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  76%|███████▌  | 4901/6434 [11:29:38<3:39:35,  8.59s/it, gpt_loss=0.326, loss_mean=0.316][A
+Train step of epoch 0:  76%|███████▌  | 4901/6434 [11:29:47<3:39:35,  8.59s/it, gpt_loss=0.328, loss_mean=0.317][A
+Train step of epoch 0:  76%|███████▌  | 4902/6434 [11:29:47<3:42:19,  8.71s/it, gpt_loss=0.328, loss_mean=0.317][A
+Train step of epoch 0:  76%|███████▌  | 4902/6434 [11:29:54<3:42:19,  8.71s/it, gpt_loss=0.273, loss_mean=0.312][A
+Train step of epoch 0:  76%|███████▌  | 4903/6434 [11:29:54<3:34:58,  8.43s/it, gpt_loss=0.273, loss_mean=0.312][A
+Train step of epoch 0:  76%|███████▌  | 4903/6434 [11:30:03<3:34:58,  8.43s/it, gpt_loss=0.328, loss_mean=0.314][A
+Train step of epoch 0:  76%|███████▌  | 4904/6434 [11:30:03<3:32:33,  8.34s/it, gpt_loss=0.328, loss_mean=0.314][A
+Train step of epoch 0:  76%|███████▌  | 4904/6434 [11:30:10<3:32:33,  8.34s/it, gpt_loss=0.27, loss_mean=0.31]  [A
+Train step of epoch 0:  76%|███████▌  | 4905/6434 [11:30:10<3:24:29,  8.02s/it, gpt_loss=0.27, loss_mean=0.31][A
+Train step of epoch 0:  76%|███████▌  | 4905/6434 [11:30:18<3:24:29,  8.02s/it, gpt_loss=0.293, loss_mean=0.308][A
+Train step of epoch 0:  76%|███████▋  | 4906/6434 [11:30:18<3:23:24,  7.99s/it, gpt_loss=0.293, loss_mean=0.308][A
+Train step of epoch 0:  76%|███████▋  | 4906/6434 [11:30:25<3:23:24,  7.99s/it, gpt_loss=0.279, loss_mean=0.305][A
+Train step of epoch 0:  76%|███████▋  | 4907/6434 [11:30:25<3:20:32,  7.88s/it, gpt_loss=0.279, loss_mean=0.305][A
+Train step of epoch 0:  76%|███████▋  | 4907/6434 [11:30:34<3:20:32,  7.88s/it, gpt_loss=0.321, loss_mean=0.307][A
+Train step of epoch 0:  76%|███████▋  | 4908/6434 [11:30:34<3:29:29,  8.24s/it, gpt_loss=0.321, loss_mean=0.307][A
+Train step of epoch 0:  76%|███████▋  | 4908/6434 [11:30:41<3:29:29,  8.24s/it, gpt_loss=0.284, loss_mean=0.304][A
+Train step of epoch 0:  76%|███████▋  | 4909/6434 [11:30:41<3:20:13,  7.88s/it, gpt_loss=0.284, loss_mean=0.304][A
+[LID Router Debug] Step: 4910
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [5, 0, 9, 5, 6, 6, 0, 0, 0, 5]
+Active Experts in Batch: {0, 9, 5, 6}
+
+Train step of epoch 0:  76%|███████▋  | 4909/6434 [11:30:50<3:20:13,  7.88s/it, gpt_loss=0.295, loss_mean=0.303][A
+Train step of epoch 0:  76%|███████▋  | 4910/6434 [11:30:50<3:23:59,  8.03s/it, gpt_loss=0.295, loss_mean=0.303][A
+Train step of epoch 0:  76%|███████▋  | 4910/6434 [11:30:58<3:23:59,  8.03s/it, gpt_loss=0.243, loss_mean=0.297][A
+Train step of epoch 0:  76%|███████▋  | 4911/6434 [11:30:58<3:22:51,  7.99s/it, gpt_loss=0.243, loss_mean=0.297][A
+Train step of epoch 0:  76%|███████▋  | 4911/6434 [11:31:05<3:22:51,  7.99s/it, gpt_loss=0.264, loss_mean=0.294][A
+Train step of epoch 0:  76%|███████▋  | 4912/6434 [11:31:05<3:17:43,  7.79s/it, gpt_loss=0.264, loss_mean=0.294][A
+Train step of epoch 0:  76%|███████▋  | 4912/6434 [11:31:14<3:17:43,  7.79s/it, gpt_loss=0.259, loss_mean=0.291][A
+Train step of epoch 0:  76%|███████▋  | 4913/6434 [11:31:14<3:23:16,  8.02s/it, gpt_loss=0.259, loss_mean=0.291][A
+Train step of epoch 0:  76%|███████▋  | 4913/6434 [11:31:22<3:23:16,  8.02s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▋  | 4914/6434 [11:31:22<3:23:21,  8.03s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  76%|███████▋  | 4914/6434 [11:31:30<3:23:21,  8.03s/it, gpt_loss=0.225, loss_mean=0.282][A
+Train step of epoch 0:  76%|███████▋  | 4915/6434 [11:31:30<3:27:29,  8.20s/it, gpt_loss=0.225, loss_mean=0.282][A
+Train step of epoch 0:  76%|███████▋  | 4915/6434 [11:31:39<3:27:29,  8.20s/it, gpt_loss=0.303, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▋  | 4916/6434 [11:31:39<3:33:38,  8.44s/it, gpt_loss=0.303, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▋  | 4916/6434 [11:31:48<3:33:38,  8.44s/it, gpt_loss=0.212, loss_mean=0.277][A
+Train step of epoch 0:  76%|███████▋  | 4917/6434 [11:31:48<3:32:28,  8.40s/it, gpt_loss=0.212, loss_mean=0.277][A
+Train step of epoch 0:  76%|███████▋  | 4917/6434 [11:31:56<3:32:28,  8.40s/it, gpt_loss=0.358, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▋  | 4918/6434 [11:31:56<3:31:26,  8.37s/it, gpt_loss=0.358, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▋  | 4918/6434 [11:32:04<3:31:26,  8.37s/it, gpt_loss=0.322, loss_mean=0.289][A
+Train step of epoch 0:  76%|███████▋  | 4919/6434 [11:32:04<3:30:54,  8.35s/it, gpt_loss=0.322, loss_mean=0.289][A
+[LID Router Debug] Step: 4920
+Batch Size: 10
+Audio Batch Size: 142
+LID Assignments: [0, 3, 9, 3, 2, 0, 4, 3, 4, 9]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+Train step of epoch 0:  76%|███████▋  | 4919/6434 [11:32:13<3:30:54,  8.35s/it, gpt_loss=0.249, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▋  | 4920/6434 [11:32:13<3:35:25,  8.54s/it, gpt_loss=0.249, loss_mean=0.285][A
+Train step of epoch 0:  76%|███████▋  | 4920/6434 [11:32:21<3:35:25,  8.54s/it, gpt_loss=0.282, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▋  | 4921/6434 [11:32:21<3:27:38,  8.23s/it, gpt_loss=0.282, loss_mean=0.284][A
+Train step of epoch 0:  76%|███████▋  | 4921/6434 [11:32:30<3:27:38,  8.23s/it, gpt_loss=0.299, loss_mean=0.286][A
+Train step of epoch 0:  76%|███████▋  | 4922/6434 [11:32:30<3:33:39,  8.48s/it, gpt_loss=0.299, loss_mean=0.286][A
+Train step of epoch 0:  76%|███████▋  | 4922/6434 [11:32:38<3:33:39,  8.48s/it, gpt_loss=0.394, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4923/6434 [11:32:38<3:35:05,  8.54s/it, gpt_loss=0.394, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4923/6434 [11:32:47<3:35:05,  8.54s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4924/6434 [11:32:47<3:37:55,  8.66s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4924/6434 [11:32:56<3:37:55,  8.66s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4925/6434 [11:32:56<3:38:47,  8.70s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4925/6434 [11:33:04<3:38:47,  8.70s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4926/6434 [11:33:04<3:31:59,  8.43s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4926/6434 [11:33:12<3:31:59,  8.43s/it, gpt_loss=0.263, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4927/6434 [11:33:12<3:26:53,  8.24s/it, gpt_loss=0.263, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4927/6434 [11:33:21<3:26:53,  8.24s/it, gpt_loss=0.244, loss_mean=0.286][A
+Train step of epoch 0:  77%|███████▋  | 4928/6434 [11:33:21<3:31:42,  8.43s/it, gpt_loss=0.244, loss_mean=0.286][A
+Train step of epoch 0:  77%|███████▋  | 4928/6434 [11:33:31<3:31:42,  8.43s/it, gpt_loss=0.238, loss_mean=0.281][A
+Train step of epoch 0:  77%|███████▋  | 4929/6434 [11:33:31<3:48:10,  9.10s/it, gpt_loss=0.238, loss_mean=0.281][A
+[LID Router Debug] Step: 4930
+Batch Size: 10
+Audio Batch Size: 73
+LID Assignments: [5, 1, 4, 4, 0, 1, 2, 4, 6, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+Train step of epoch 0:  77%|███████▋  | 4929/6434 [11:33:38<3:48:10,  9.10s/it, gpt_loss=0.378, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4930/6434 [11:33:38<3:32:45,  8.49s/it, gpt_loss=0.378, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4930/6434 [11:33:48<3:32:45,  8.49s/it, gpt_loss=0.303, loss_mean=0.292][A
+Train step of epoch 0:  77%|███████▋  | 4931/6434 [11:33:48<3:38:57,  8.74s/it, gpt_loss=0.303, loss_mean=0.292][A
+Train step of epoch 0:  77%|███████▋  | 4931/6434 [11:33:56<3:38:57,  8.74s/it, gpt_loss=0.266, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4932/6434 [11:33:56<3:38:56,  8.75s/it, gpt_loss=0.266, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4932/6434 [11:34:05<3:38:56,  8.75s/it, gpt_loss=0.296, loss_mean=0.29] [A
+Train step of epoch 0:  77%|███████▋  | 4933/6434 [11:34:05<3:34:25,  8.57s/it, gpt_loss=0.296, loss_mean=0.29][A
+Train step of epoch 0:  77%|███████▋  | 4933/6434 [11:34:13<3:34:25,  8.57s/it, gpt_loss=0.263, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4934/6434 [11:34:13<3:30:32,  8.42s/it, gpt_loss=0.263, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4934/6434 [11:34:21<3:30:32,  8.42s/it, gpt_loss=0.35, loss_mean=0.294] [A
+Train step of epoch 0:  77%|███████▋  | 4935/6434 [11:34:21<3:31:54,  8.48s/it, gpt_loss=0.35, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4935/6434 [11:34:29<3:31:54,  8.48s/it, gpt_loss=0.406, loss_mean=0.305][A
+Train step of epoch 0:  77%|███████▋  | 4936/6434 [11:34:29<3:27:29,  8.31s/it, gpt_loss=0.406, loss_mean=0.305][A
+Train step of epoch 0:  77%|███████▋  | 4936/6434 [11:34:37<3:27:29,  8.31s/it, gpt_loss=0.345, loss_mean=0.309][A
+Train step of epoch 0:  77%|███████▋  | 4937/6434 [11:34:37<3:25:32,  8.24s/it, gpt_loss=0.345, loss_mean=0.309][A
+Train step of epoch 0:  77%|███████▋  | 4937/6434 [11:34:47<3:25:32,  8.24s/it, gpt_loss=0.33, loss_mean=0.311] [A
+Train step of epoch 0:  77%|███████▋  | 4938/6434 [11:34:47<3:38:34,  8.77s/it, gpt_loss=0.33, loss_mean=0.311][A
+Train step of epoch 0:  77%|███████▋  | 4938/6434 [11:34:55<3:38:34,  8.77s/it, gpt_loss=0.354, loss_mean=0.315][A
+Train step of epoch 0:  77%|███████▋  | 4939/6434 [11:34:55<3:33:17,  8.56s/it, gpt_loss=0.354, loss_mean=0.315][A
+[LID Router Debug] Step: 4940
+Batch Size: 10
+Audio Batch Size: 80
+LID Assignments: [4, 0, 4, 5, 6, 0, 5, 5, 6, 9]
+Active Experts in Batch: {0, 4, 5, 6, 9}
+
+Train step of epoch 0:  77%|███████▋  | 4939/6434 [11:35:03<3:33:17,  8.56s/it, gpt_loss=0.365, loss_mean=0.32] [A
+Train step of epoch 0:  77%|███████▋  | 4940/6434 [11:35:03<3:28:35,  8.38s/it, gpt_loss=0.365, loss_mean=0.32][A
+Train step of epoch 0:  77%|███████▋  | 4940/6434 [11:35:12<3:28:35,  8.38s/it, gpt_loss=0.353, loss_mean=0.324][A
+Train step of epoch 0:  77%|███████▋  | 4941/6434 [11:35:12<3:29:07,  8.40s/it, gpt_loss=0.353, loss_mean=0.324][A
+Train step of epoch 0:  77%|███████▋  | 4941/6434 [11:35:20<3:29:07,  8.40s/it, gpt_loss=0.317, loss_mean=0.323][A
+Train step of epoch 0:  77%|███████▋  | 4942/6434 [11:35:20<3:26:30,  8.30s/it, gpt_loss=0.317, loss_mean=0.323][A
+Train step of epoch 0:  77%|███████▋  | 4942/6434 [11:35:29<3:26:30,  8.30s/it, gpt_loss=0.318, loss_mean=0.322][A
+Train step of epoch 0:  77%|███████▋  | 4943/6434 [11:35:29<3:34:37,  8.64s/it, gpt_loss=0.318, loss_mean=0.322][A
+Train step of epoch 0:  77%|███████▋  | 4943/6434 [11:35:38<3:34:37,  8.64s/it, gpt_loss=0.39, loss_mean=0.329] [A
+Train step of epoch 0:  77%|███████▋  | 4944/6434 [11:35:38<3:33:53,  8.61s/it, gpt_loss=0.39, loss_mean=0.329][A
+Train step of epoch 0:  77%|███████▋  | 4944/6434 [11:35:46<3:33:53,  8.61s/it, gpt_loss=0.334, loss_mean=0.33][A
+Train step of epoch 0:  77%|███████▋  | 4945/6434 [11:35:46<3:30:06,  8.47s/it, gpt_loss=0.334, loss_mean=0.33][A
+Train step of epoch 0:  77%|███████▋  | 4945/6434 [11:35:53<3:30:06,  8.47s/it, gpt_loss=0.341, loss_mean=0.331][A
+Train step of epoch 0:  77%|███████▋  | 4946/6434 [11:35:53<3:23:05,  8.19s/it, gpt_loss=0.341, loss_mean=0.331][A
+Train step of epoch 0:  77%|███████▋  | 4946/6434 [11:36:03<3:23:05,  8.19s/it, gpt_loss=0.308, loss_mean=0.328][A
+Train step of epoch 0:  77%|███████▋  | 4947/6434 [11:36:03<3:32:27,  8.57s/it, gpt_loss=0.308, loss_mean=0.328][A
+Train step of epoch 0:  77%|███████▋  | 4947/6434 [11:36:12<3:32:27,  8.57s/it, gpt_loss=0.278, loss_mean=0.323][A
+Train step of epoch 0:  77%|███████▋  | 4948/6434 [11:36:12<3:32:13,  8.57s/it, gpt_loss=0.278, loss_mean=0.323][A
+Train step of epoch 0:  77%|███████▋  | 4948/6434 [11:36:21<3:32:13,  8.57s/it, gpt_loss=0.354, loss_mean=0.326][A
+Train step of epoch 0:  77%|███████▋  | 4949/6434 [11:36:21<3:35:44,  8.72s/it, gpt_loss=0.354, loss_mean=0.326][A
+[LID Router Debug] Step: 4950
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [3, 4, 3, 3, 2, 1, 9, 3, 9, 4]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+
+Train step of epoch 0:  77%|███████▋  | 4949/6434 [11:36:29<3:35:44,  8.72s/it, gpt_loss=0.298, loss_mean=0.324][A
+Train step of epoch 0:  77%|███████▋  | 4950/6434 [11:36:29<3:36:31,  8.75s/it, gpt_loss=0.298, loss_mean=0.324][A
+Train step of epoch 0:  77%|███████▋  | 4950/6434 [11:36:38<3:36:31,  8.75s/it, gpt_loss=0.303, loss_mean=0.322][A
+Train step of epoch 0:  77%|███████▋  | 4951/6434 [11:36:38<3:32:41,  8.61s/it, gpt_loss=0.303, loss_mean=0.322][A
+Train step of epoch 0:  77%|███████▋  | 4951/6434 [11:36:46<3:32:41,  8.61s/it, gpt_loss=0.287, loss_mean=0.318][A
+Train step of epoch 0:  77%|███████▋  | 4952/6434 [11:36:46<3:29:10,  8.47s/it, gpt_loss=0.287, loss_mean=0.318][A
+Train step of epoch 0:  77%|███████▋  | 4952/6434 [11:36:55<3:29:10,  8.47s/it, gpt_loss=0.335, loss_mean=0.32] [A
+Train step of epoch 0:  77%|███████▋  | 4953/6434 [11:36:55<3:32:33,  8.61s/it, gpt_loss=0.335, loss_mean=0.32][A
+Train step of epoch 0:  77%|███████▋  | 4953/6434 [11:37:03<3:32:33,  8.61s/it, gpt_loss=0.269, loss_mean=0.315][A
+Train step of epoch 0:  77%|███████▋  | 4954/6434 [11:37:03<3:31:31,  8.58s/it, gpt_loss=0.269, loss_mean=0.315][A
+Train step of epoch 0:  77%|███████▋  | 4954/6434 [11:37:12<3:31:31,  8.58s/it, gpt_loss=0.297, loss_mean=0.313][A
+Train step of epoch 0:  77%|███████▋  | 4955/6434 [11:37:12<3:34:36,  8.71s/it, gpt_loss=0.297, loss_mean=0.313][A
+Train step of epoch 0:  77%|███████▋  | 4955/6434 [11:37:19<3:34:36,  8.71s/it, gpt_loss=0.285, loss_mean=0.31] [A
+Train step of epoch 0:  77%|███████▋  | 4956/6434 [11:37:19<3:20:36,  8.14s/it, gpt_loss=0.285, loss_mean=0.31][A
+Train step of epoch 0:  77%|███████▋  | 4956/6434 [11:37:28<3:20:36,  8.14s/it, gpt_loss=0.265, loss_mean=0.306][A
+Train step of epoch 0:  77%|███████▋  | 4957/6434 [11:37:28<3:24:01,  8.29s/it, gpt_loss=0.265, loss_mean=0.306][A
+Train step of epoch 0:  77%|███████▋  | 4957/6434 [11:37:38<3:24:01,  8.29s/it, gpt_loss=0.267, loss_mean=0.302][A
+Train step of epoch 0:  77%|███████▋  | 4958/6434 [11:37:38<3:34:45,  8.73s/it, gpt_loss=0.267, loss_mean=0.302][A
+Train step of epoch 0:  77%|███████▋  | 4958/6434 [11:37:45<3:34:45,  8.73s/it, gpt_loss=0.237, loss_mean=0.295][A
+Train step of epoch 0:  77%|███████▋  | 4959/6434 [11:37:45<3:27:53,  8.46s/it, gpt_loss=0.237, loss_mean=0.295][A
+[LID Router Debug] Step: 4960
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [3, 0, 3, 9, 0, 4, 0, 9, 4, 0]
+Active Experts in Batch: {0, 9, 3, 4}
+
+Train step of epoch 0:  77%|███████▋  | 4959/6434 [11:37:53<3:27:53,  8.46s/it, gpt_loss=0.322, loss_mean=0.298][A
+Train step of epoch 0:  77%|███████▋  | 4960/6434 [11:37:53<3:24:29,  8.32s/it, gpt_loss=0.322, loss_mean=0.298][A
+Train step of epoch 0:  77%|███████▋  | 4960/6434 [11:38:01<3:24:29,  8.32s/it, gpt_loss=0.278, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4961/6434 [11:38:01<3:21:22,  8.20s/it, gpt_loss=0.278, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4961/6434 [11:38:09<3:21:22,  8.20s/it, gpt_loss=0.25, loss_mean=0.291] [A
+Train step of epoch 0:  77%|███████▋  | 4962/6434 [11:38:09<3:18:32,  8.09s/it, gpt_loss=0.25, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4962/6434 [11:38:17<3:18:32,  8.09s/it, gpt_loss=0.252, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4963/6434 [11:38:17<3:18:35,  8.10s/it, gpt_loss=0.252, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4963/6434 [11:38:27<3:18:35,  8.10s/it, gpt_loss=0.3, loss_mean=0.289]  [A
+Train step of epoch 0:  77%|███████▋  | 4964/6434 [11:38:27<3:27:28,  8.47s/it, gpt_loss=0.3, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4964/6434 [11:38:34<3:27:28,  8.47s/it, gpt_loss=0.304, loss_mean=0.29][A
+Train step of epoch 0:  77%|███████▋  | 4965/6434 [11:38:34<3:20:49,  8.20s/it, gpt_loss=0.304, loss_mean=0.29][A
+Train step of epoch 0:  77%|███████▋  | 4965/6434 [11:38:42<3:20:49,  8.20s/it, gpt_loss=0.352, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4966/6434 [11:38:42<3:20:28,  8.19s/it, gpt_loss=0.352, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4966/6434 [11:38:50<3:20:28,  8.19s/it, gpt_loss=0.403, loss_mean=0.307][A
+Train step of epoch 0:  77%|███████▋  | 4967/6434 [11:38:50<3:17:05,  8.06s/it, gpt_loss=0.403, loss_mean=0.307][A
+Train step of epoch 0:  77%|███████▋  | 4967/6434 [11:38:58<3:17:05,  8.06s/it, gpt_loss=0.243, loss_mean=0.301][A
+Train step of epoch 0:  77%|███████▋  | 4968/6434 [11:38:58<3:18:55,  8.14s/it, gpt_loss=0.243, loss_mean=0.301][A
+Train step of epoch 0:  77%|███████▋  | 4968/6434 [11:39:06<3:18:55,  8.14s/it, gpt_loss=0.326, loss_mean=0.303][A
+Train step of epoch 0:  77%|███████▋  | 4969/6434 [11:39:06<3:16:52,  8.06s/it, gpt_loss=0.326, loss_mean=0.303][A
+[LID Router Debug] Step: 4970
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [6, 5, 1, 3, 5, 2, 1, 3, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  77%|███████▋  | 4969/6434 [11:39:15<3:16:52,  8.06s/it, gpt_loss=0.228, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4970/6434 [11:39:15<3:21:07,  8.24s/it, gpt_loss=0.228, loss_mean=0.296][A
+Train step of epoch 0:  77%|███████▋  | 4970/6434 [11:39:23<3:21:07,  8.24s/it, gpt_loss=0.278, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4971/6434 [11:39:23<3:23:10,  8.33s/it, gpt_loss=0.278, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4971/6434 [11:39:31<3:23:10,  8.33s/it, gpt_loss=0.324, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4972/6434 [11:39:31<3:18:35,  8.15s/it, gpt_loss=0.324, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4972/6434 [11:39:39<3:18:35,  8.15s/it, gpt_loss=0.229, loss_mean=0.29] [A
+Train step of epoch 0:  77%|███████▋  | 4973/6434 [11:39:39<3:14:08,  7.97s/it, gpt_loss=0.229, loss_mean=0.29][A
+Train step of epoch 0:  77%|███████▋  | 4973/6434 [11:39:47<3:14:08,  7.97s/it, gpt_loss=0.267, loss_mean=0.288][A
+Train step of epoch 0:  77%|███████▋  | 4974/6434 [11:39:47<3:12:43,  7.92s/it, gpt_loss=0.267, loss_mean=0.288][A
+Train step of epoch 0:  77%|███████▋  | 4974/6434 [11:39:55<3:12:43,  7.92s/it, gpt_loss=0.275, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4975/6434 [11:39:55<3:14:45,  8.01s/it, gpt_loss=0.275, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4975/6434 [11:40:03<3:14:45,  8.01s/it, gpt_loss=0.357, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4976/6434 [11:40:03<3:15:25,  8.04s/it, gpt_loss=0.357, loss_mean=0.294][A
+Train step of epoch 0:  77%|███████▋  | 4976/6434 [11:40:11<3:15:25,  8.04s/it, gpt_loss=0.222, loss_mean=0.286][A
+Train step of epoch 0:  77%|███████▋  | 4977/6434 [11:40:11<3:19:22,  8.21s/it, gpt_loss=0.222, loss_mean=0.286][A
+Train step of epoch 0:  77%|███████▋  | 4977/6434 [11:40:20<3:19:22,  8.21s/it, gpt_loss=0.24, loss_mean=0.282] [A
+Train step of epoch 0:  77%|███████▋  | 4978/6434 [11:40:20<3:20:20,  8.26s/it, gpt_loss=0.24, loss_mean=0.282][A
+Train step of epoch 0:  77%|███████▋  | 4978/6434 [11:40:28<3:20:20,  8.26s/it, gpt_loss=0.338, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4979/6434 [11:40:28<3:20:37,  8.27s/it, gpt_loss=0.338, loss_mean=0.287][A
+[LID Router Debug] Step: 4980
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [0, 9, 0, 3, 3, 2, 4, 1, 9, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  77%|███████▋  | 4979/6434 [11:40:37<3:20:37,  8.27s/it, gpt_loss=0.379, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4980/6434 [11:40:37<3:24:33,  8.44s/it, gpt_loss=0.379, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4980/6434 [11:40:45<3:24:33,  8.44s/it, gpt_loss=0.256, loss_mean=0.293][A
+Train step of epoch 0:  77%|███████▋  | 4981/6434 [11:40:45<3:21:48,  8.33s/it, gpt_loss=0.256, loss_mean=0.293][A
+Train step of epoch 0:  77%|███████▋  | 4981/6434 [11:40:53<3:21:48,  8.33s/it, gpt_loss=0.256, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4982/6434 [11:40:53<3:21:44,  8.34s/it, gpt_loss=0.256, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4982/6434 [11:41:01<3:21:44,  8.34s/it, gpt_loss=0.272, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4983/6434 [11:41:01<3:14:09,  8.03s/it, gpt_loss=0.272, loss_mean=0.287][A
+Train step of epoch 0:  77%|███████▋  | 4983/6434 [11:41:09<3:14:09,  8.03s/it, gpt_loss=0.308, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4984/6434 [11:41:09<3:16:13,  8.12s/it, gpt_loss=0.308, loss_mean=0.289][A
+Train step of epoch 0:  77%|███████▋  | 4984/6434 [11:41:17<3:16:13,  8.12s/it, gpt_loss=0.371, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4985/6434 [11:41:17<3:13:39,  8.02s/it, gpt_loss=0.371, loss_mean=0.297][A
+Train step of epoch 0:  77%|███████▋  | 4985/6434 [11:41:24<3:13:39,  8.02s/it, gpt_loss=0.229, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4986/6434 [11:41:24<3:09:44,  7.86s/it, gpt_loss=0.229, loss_mean=0.291][A
+Train step of epoch 0:  77%|███████▋  | 4986/6434 [11:41:34<3:09:44,  7.86s/it, gpt_loss=0.256, loss_mean=0.287][A
+Train step of epoch 0:  78%|███████▊  | 4987/6434 [11:41:34<3:21:49,  8.37s/it, gpt_loss=0.256, loss_mean=0.287][A
+Train step of epoch 0:  78%|███████▊  | 4987/6434 [11:41:41<3:21:49,  8.37s/it, gpt_loss=0.253, loss_mean=0.284][A
+Train step of epoch 0:  78%|███████▊  | 4988/6434 [11:41:41<3:14:30,  8.07s/it, gpt_loss=0.253, loss_mean=0.284][A
+Train step of epoch 0:  78%|███████▊  | 4988/6434 [11:41:50<3:14:30,  8.07s/it, gpt_loss=0.29, loss_mean=0.284] [A
+Train step of epoch 0:  78%|███████▊  | 4989/6434 [11:41:50<3:18:43,  8.25s/it, gpt_loss=0.29, loss_mean=0.284][A
+[LID Router Debug] Step: 4990
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [9, 4, 4, 9, 2, 3, 2, 4, 6, 5]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  78%|███████▊  | 4989/6434 [11:41:59<3:18:43,  8.25s/it, gpt_loss=0.246, loss_mean=0.281][A
+Train step of epoch 0:  78%|███████▊  | 4990/6434 [11:41:59<3:23:47,  8.47s/it, gpt_loss=0.246, loss_mean=0.281][A
+Train step of epoch 0:  78%|███████▊  | 4990/6434 [11:42:07<3:23:47,  8.47s/it, gpt_loss=0.355, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 4991/6434 [11:42:07<3:20:18,  8.33s/it, gpt_loss=0.355, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 4991/6434 [11:42:17<3:20:18,  8.33s/it, gpt_loss=0.313, loss_mean=0.291][A
+Train step of epoch 0:  78%|███████▊  | 4992/6434 [11:42:17<3:31:00,  8.78s/it, gpt_loss=0.313, loss_mean=0.291][A
+Train step of epoch 0:  78%|███████▊  | 4992/6434 [11:42:25<3:31:00,  8.78s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  78%|███████▊  | 4993/6434 [11:42:25<3:26:08,  8.58s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  78%|███████▊  | 4993/6434 [11:42:33<3:26:08,  8.58s/it, gpt_loss=0.377, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 4994/6434 [11:42:33<3:26:18,  8.60s/it, gpt_loss=0.377, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 4994/6434 [11:42:41<3:26:18,  8.60s/it, gpt_loss=0.324, loss_mean=0.3]  [A
+Train step of epoch 0:  78%|███████▊  | 4995/6434 [11:42:41<3:14:48,  8.12s/it, gpt_loss=0.324, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 4995/6434 [11:42:49<3:14:48,  8.12s/it, gpt_loss=0.36, loss_mean=0.306][A
+Train step of epoch 0:  78%|███████▊  | 4996/6434 [11:42:49<3:16:43,  8.21s/it, gpt_loss=0.36, loss_mean=0.306][A
+Train step of epoch 0:  78%|███████▊  | 4996/6434 [11:42:57<3:16:43,  8.21s/it, gpt_loss=0.214, loss_mean=0.297][A
+Train step of epoch 0:  78%|███████▊  | 4997/6434 [11:42:57<3:16:33,  8.21s/it, gpt_loss=0.214, loss_mean=0.297][A
+Train step of epoch 0:  78%|███████▊  | 4997/6434 [11:43:04<3:16:33,  8.21s/it, gpt_loss=0.308, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 4998/6434 [11:43:04<3:10:04,  7.94s/it, gpt_loss=0.308, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 4998/6434 [11:43:14<3:10:04,  7.94s/it, gpt_loss=0.3, loss_mean=0.298]  [A
+Train step of epoch 0:  78%|███████▊  | 4999/6434 [11:43:14<3:21:21,  8.42s/it, gpt_loss=0.3, loss_mean=0.298][A
+[LID Router Debug] Step: 5000
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [6, 1, 4, 6, 3, 4, 1, 5, 1, 1]
+Active Experts in Batch: {1, 3, 4, 5, 6}
+[2026-02-07 03:39:26,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=0, lr=[1.7037272993713084e-05, 1.7037272993713084e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 03:39:26,772] [INFO] [timer.py:260:stop] epoch=0/micro_step=5000/global_step=2500, RunningAvgSamplesPerSec=4.749148699406654, CurrSamplesPerSec=4.488112626876263, MemAllocated=12.47GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  78%|███████▊  | 4999/6434 [11:43:22<3:21:21,  8.42s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5000/6434 [11:43:22<3:20:32,  8.39s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5000/6434 [11:43:30<3:20:32,  8.39s/it, gpt_loss=0.333, loss_mean=0.303][A
+Train step of epoch 0:  78%|███████▊  | 5001/6434 [11:43:30<3:16:09,  8.21s/it, gpt_loss=0.333, loss_mean=0.303][A
+Train step of epoch 0:  78%|███████▊  | 5001/6434 [11:43:38<3:16:09,  8.21s/it, gpt_loss=0.224, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5002/6434 [11:43:38<3:14:21,  8.14s/it, gpt_loss=0.224, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5002/6434 [11:43:48<3:14:21,  8.14s/it, gpt_loss=0.25, loss_mean=0.291] [A
+Train step of epoch 0:  78%|███████▊  | 5003/6434 [11:43:48<3:24:25,  8.57s/it, gpt_loss=0.25, loss_mean=0.291][A
+Train step of epoch 0:  78%|███████▊  | 5003/6434 [11:43:56<3:24:25,  8.57s/it, gpt_loss=0.289, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5004/6434 [11:43:56<3:25:19,  8.61s/it, gpt_loss=0.289, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5004/6434 [11:44:05<3:25:19,  8.61s/it, gpt_loss=0.283, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5005/6434 [11:44:05<3:22:33,  8.50s/it, gpt_loss=0.283, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5005/6434 [11:44:13<3:22:33,  8.50s/it, gpt_loss=0.245, loss_mean=0.285][A
+Train step of epoch 0:  78%|███████▊  | 5006/6434 [11:44:13<3:19:04,  8.36s/it, gpt_loss=0.245, loss_mean=0.285][A
+Train step of epoch 0:  78%|███████▊  | 5006/6434 [11:44:21<3:19:04,  8.36s/it, gpt_loss=0.332, loss_mean=0.29] [A
+Train step of epoch 0:  78%|███████▊  | 5007/6434 [11:44:21<3:17:14,  8.29s/it, gpt_loss=0.332, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5007/6434 [11:44:28<3:17:14,  8.29s/it, gpt_loss=0.292, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5008/6434 [11:44:28<3:12:06,  8.08s/it, gpt_loss=0.292, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5008/6434 [11:44:37<3:12:06,  8.08s/it, gpt_loss=0.293, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5009/6434 [11:44:37<3:18:54,  8.37s/it, gpt_loss=0.293, loss_mean=0.29][A
+[LID Router Debug] Step: 5010
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [0, 6, 9, 4, 7, 5, 9, 1, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 7, 9}
+
+Train step of epoch 0:  78%|███████▊  | 5009/6434 [11:44:45<3:18:54,  8.37s/it, gpt_loss=0.548, loss_mean=0.316][A
+Train step of epoch 0:  78%|███████▊  | 5010/6434 [11:44:45<3:16:22,  8.27s/it, gpt_loss=0.548, loss_mean=0.316][A
+Train step of epoch 0:  78%|███████▊  | 5010/6434 [11:44:54<3:16:22,  8.27s/it, gpt_loss=0.253, loss_mean=0.31] [A
+Train step of epoch 0:  78%|███████▊  | 5011/6434 [11:44:54<3:18:37,  8.37s/it, gpt_loss=0.253, loss_mean=0.31][A
+Train step of epoch 0:  78%|███████▊  | 5011/6434 [11:45:03<3:18:37,  8.37s/it, gpt_loss=0.22, loss_mean=0.301][A
+Train step of epoch 0:  78%|███████▊  | 5012/6434 [11:45:03<3:21:30,  8.50s/it, gpt_loss=0.22, loss_mean=0.301][A
+Train step of epoch 0:  78%|███████▊  | 5012/6434 [11:45:12<3:21:30,  8.50s/it, gpt_loss=0.24, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5013/6434 [11:45:12<3:23:02,  8.57s/it, gpt_loss=0.24, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5013/6434 [11:45:21<3:23:02,  8.57s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5014/6434 [11:45:21<3:29:48,  8.87s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5014/6434 [11:45:30<3:29:48,  8.87s/it, gpt_loss=0.266, loss_mean=0.29] [A
+Train step of epoch 0:  78%|███████▊  | 5015/6434 [11:45:30<3:26:50,  8.75s/it, gpt_loss=0.266, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5015/6434 [11:45:38<3:26:50,  8.75s/it, gpt_loss=0.412, loss_mean=0.302][A
+Train step of epoch 0:  78%|███████▊  | 5016/6434 [11:45:38<3:22:51,  8.58s/it, gpt_loss=0.412, loss_mean=0.302][A
+Train step of epoch 0:  78%|███████▊  | 5016/6434 [11:45:46<3:22:51,  8.58s/it, gpt_loss=0.223, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5017/6434 [11:45:46<3:17:44,  8.37s/it, gpt_loss=0.223, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5017/6434 [11:45:54<3:17:44,  8.37s/it, gpt_loss=0.377, loss_mean=0.303][A
+Train step of epoch 0:  78%|███████▊  | 5018/6434 [11:45:54<3:13:28,  8.20s/it, gpt_loss=0.377, loss_mean=0.303][A
+Train step of epoch 0:  78%|███████▊  | 5018/6434 [11:46:02<3:13:28,  8.20s/it, gpt_loss=0.295, loss_mean=0.302][A
+Train step of epoch 0:  78%|███████▊  | 5019/6434 [11:46:02<3:15:26,  8.29s/it, gpt_loss=0.295, loss_mean=0.302][A
+[LID Router Debug] Step: 5020
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [2, 2, 0, 5, 0, 9, 5, 3, 9, 2]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  78%|███████▊  | 5019/6434 [11:46:10<3:15:26,  8.29s/it, gpt_loss=0.286, loss_mean=0.3]  [A
+Train step of epoch 0:  78%|███████▊  | 5020/6434 [11:46:10<3:15:15,  8.29s/it, gpt_loss=0.286, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5020/6434 [11:46:19<3:15:15,  8.29s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5021/6434 [11:46:19<3:17:05,  8.37s/it, gpt_loss=0.297, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5021/6434 [11:46:27<3:17:05,  8.37s/it, gpt_loss=0.237, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5022/6434 [11:46:27<3:17:04,  8.37s/it, gpt_loss=0.237, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5022/6434 [11:46:35<3:17:04,  8.37s/it, gpt_loss=0.308, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5023/6434 [11:46:35<3:10:19,  8.09s/it, gpt_loss=0.308, loss_mean=0.295][A
+Train step of epoch 0:  78%|███████▊  | 5023/6434 [11:46:44<3:10:19,  8.09s/it, gpt_loss=0.304, loss_mean=0.296][A
+Train step of epoch 0:  78%|███████▊  | 5024/6434 [11:46:44<3:18:13,  8.44s/it, gpt_loss=0.304, loss_mean=0.296][A
+Train step of epoch 0:  78%|███████▊  | 5024/6434 [11:46:53<3:18:13,  8.44s/it, gpt_loss=0.251, loss_mean=0.292][A
+Train step of epoch 0:  78%|███████▊  | 5025/6434 [11:46:53<3:21:42,  8.59s/it, gpt_loss=0.251, loss_mean=0.292][A
+Train step of epoch 0:  78%|███████▊  | 5025/6434 [11:47:01<3:21:42,  8.59s/it, gpt_loss=0.305, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5026/6434 [11:47:01<3:17:54,  8.43s/it, gpt_loss=0.305, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5026/6434 [11:47:09<3:17:54,  8.43s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5027/6434 [11:47:09<3:13:39,  8.26s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5027/6434 [11:47:18<3:13:39,  8.26s/it, gpt_loss=0.248, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 5028/6434 [11:47:18<3:16:59,  8.41s/it, gpt_loss=0.248, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 5028/6434 [11:47:25<3:16:59,  8.41s/it, gpt_loss=0.307, loss_mean=0.29] [A
+Train step of epoch 0:  78%|███████▊  | 5029/6434 [11:47:25<3:11:01,  8.16s/it, gpt_loss=0.307, loss_mean=0.29][A
+[LID Router Debug] Step: 5030
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [3, 5, 9, 5, 0, 3, 3, 9, 4, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  78%|███████▊  | 5029/6434 [11:47:33<3:11:01,  8.16s/it, gpt_loss=0.386, loss_mean=0.3] [A
+Train step of epoch 0:  78%|███████▊  | 5030/6434 [11:47:33<3:10:10,  8.13s/it, gpt_loss=0.386, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5030/6434 [11:47:42<3:10:10,  8.13s/it, gpt_loss=0.241, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5031/6434 [11:47:42<3:16:19,  8.40s/it, gpt_loss=0.241, loss_mean=0.294][A
+Train step of epoch 0:  78%|███████▊  | 5031/6434 [11:47:50<3:16:19,  8.40s/it, gpt_loss=0.358, loss_mean=0.3]  [A
+Train step of epoch 0:  78%|███████▊  | 5032/6434 [11:47:50<3:12:09,  8.22s/it, gpt_loss=0.358, loss_mean=0.3][A
+Train step of epoch 0:  78%|███████▊  | 5032/6434 [11:47:58<3:12:09,  8.22s/it, gpt_loss=0.313, loss_mean=0.302][A
+Train step of epoch 0:  78%|███████▊  | 5033/6434 [11:47:58<3:09:13,  8.10s/it, gpt_loss=0.313, loss_mean=0.302][A
+Train step of epoch 0:  78%|███████▊  | 5033/6434 [11:48:07<3:09:13,  8.10s/it, gpt_loss=0.262, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 5034/6434 [11:48:07<3:13:50,  8.31s/it, gpt_loss=0.262, loss_mean=0.298][A
+Train step of epoch 0:  78%|███████▊  | 5034/6434 [11:48:15<3:13:50,  8.31s/it, gpt_loss=0.28, loss_mean=0.296] [A
+Train step of epoch 0:  78%|███████▊  | 5035/6434 [11:48:15<3:16:41,  8.44s/it, gpt_loss=0.28, loss_mean=0.296][A
+Train step of epoch 0:  78%|███████▊  | 5035/6434 [11:48:24<3:16:41,  8.44s/it, gpt_loss=0.269, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5036/6434 [11:48:24<3:16:52,  8.45s/it, gpt_loss=0.269, loss_mean=0.293][A
+Train step of epoch 0:  78%|███████▊  | 5036/6434 [11:48:32<3:16:52,  8.45s/it, gpt_loss=0.264, loss_mean=0.29] [A
+Train step of epoch 0:  78%|███████▊  | 5037/6434 [11:48:32<3:15:54,  8.41s/it, gpt_loss=0.264, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5037/6434 [11:48:41<3:15:54,  8.41s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  78%|███████▊  | 5038/6434 [11:48:41<3:17:53,  8.51s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  78%|███████▊  | 5038/6434 [11:48:47<3:17:53,  8.51s/it, gpt_loss=0.287, loss_mean=0.287][A
+Train step of epoch 0:  78%|███████▊  | 5039/6434 [11:48:47<3:03:51,  7.91s/it, gpt_loss=0.287, loss_mean=0.287][A
+[LID Router Debug] Step: 5040
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [0, 0, 5, 9, 2, 5, 2, 2, 2, 3]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  78%|███████▊  | 5039/6434 [11:48:56<3:03:51,  7.91s/it, gpt_loss=0.232, loss_mean=0.282][A
+Train step of epoch 0:  78%|███████▊  | 5040/6434 [11:48:56<3:11:02,  8.22s/it, gpt_loss=0.232, loss_mean=0.282][A
+Train step of epoch 0:  78%|███████▊  | 5040/6434 [11:49:04<3:11:02,  8.22s/it, gpt_loss=0.387, loss_mean=0.292][A
+Train step of epoch 0:  78%|███████▊  | 5041/6434 [11:49:04<3:08:49,  8.13s/it, gpt_loss=0.387, loss_mean=0.292][A
+Train step of epoch 0:  78%|███████▊  | 5041/6434 [11:49:12<3:08:49,  8.13s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  78%|███████▊  | 5042/6434 [11:49:12<3:09:13,  8.16s/it, gpt_loss=0.277, loss_mean=0.291][A
+Train step of epoch 0:  78%|███████▊  | 5042/6434 [11:49:20<3:09:13,  8.16s/it, gpt_loss=0.28, loss_mean=0.29]  [A
+Train step of epoch 0:  78%|███████▊  | 5043/6434 [11:49:20<3:04:23,  7.95s/it, gpt_loss=0.28, loss_mean=0.29][A
+Train step of epoch 0:  78%|███████▊  | 5043/6434 [11:49:28<3:04:23,  7.95s/it, gpt_loss=0.277, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 5044/6434 [11:49:28<3:02:56,  7.90s/it, gpt_loss=0.277, loss_mean=0.288][A
+Train step of epoch 0:  78%|███████▊  | 5044/6434 [11:49:37<3:02:56,  7.90s/it, gpt_loss=0.216, loss_mean=0.281][A
+Train step of epoch 0:  78%|███████▊  | 5045/6434 [11:49:37<3:14:51,  8.42s/it, gpt_loss=0.216, loss_mean=0.281][A
+Train step of epoch 0:  78%|███████▊  | 5045/6434 [11:49:47<3:14:51,  8.42s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  78%|███████▊  | 5046/6434 [11:49:47<3:24:56,  8.86s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  78%|███████▊  | 5046/6434 [11:49:55<3:24:56,  8.86s/it, gpt_loss=0.305, loss_mean=0.284][A
+Train step of epoch 0:  78%|███████▊  | 5047/6434 [11:49:55<3:17:45,  8.55s/it, gpt_loss=0.305, loss_mean=0.284][A
+Train step of epoch 0:  78%|███████▊  | 5047/6434 [11:50:03<3:17:45,  8.55s/it, gpt_loss=0.296, loss_mean=0.285][A
+Train step of epoch 0:  78%|███████▊  | 5048/6434 [11:50:03<3:12:50,  8.35s/it, gpt_loss=0.296, loss_mean=0.285][A
+Train step of epoch 0:  78%|███████▊  | 5048/6434 [11:50:11<3:12:50,  8.35s/it, gpt_loss=0.234, loss_mean=0.28] [A
+Train step of epoch 0:  78%|███████▊  | 5049/6434 [11:50:11<3:11:49,  8.31s/it, gpt_loss=0.234, loss_mean=0.28][A
+[LID Router Debug] Step: 5050
+Batch Size: 10
+Audio Batch Size: 76
+LID Assignments: [9, 1, 9, 0, 6, 0, 5, 1, 9, 1]
+Active Experts in Batch: {0, 1, 5, 6, 9}
+
+Train step of epoch 0:  78%|███████▊  | 5049/6434 [11:50:19<3:11:49,  8.31s/it, gpt_loss=0.275, loss_mean=0.28][A
+Train step of epoch 0:  78%|███████▊  | 5050/6434 [11:50:19<3:10:31,  8.26s/it, gpt_loss=0.275, loss_mean=0.28][A
+Train step of epoch 0:  78%|███████▊  | 5050/6434 [11:50:27<3:10:31,  8.26s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  79%|███████▊  | 5051/6434 [11:50:27<3:04:16,  7.99s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  79%|███████▊  | 5051/6434 [11:50:36<3:04:16,  7.99s/it, gpt_loss=0.398, loss_mean=0.292][A
+Train step of epoch 0:  79%|███████▊  | 5052/6434 [11:50:36<3:12:20,  8.35s/it, gpt_loss=0.398, loss_mean=0.292][A
+Train step of epoch 0:  79%|███████▊  | 5052/6434 [11:50:45<3:12:20,  8.35s/it, gpt_loss=0.205, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▊  | 5053/6434 [11:50:45<3:15:08,  8.48s/it, gpt_loss=0.205, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▊  | 5053/6434 [11:50:53<3:15:08,  8.48s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  79%|███████▊  | 5054/6434 [11:50:53<3:14:12,  8.44s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  79%|███████▊  | 5054/6434 [11:51:01<3:14:12,  8.44s/it, gpt_loss=0.278, loss_mean=0.284][A
+Train step of epoch 0:  79%|███████▊  | 5055/6434 [11:51:01<3:10:35,  8.29s/it, gpt_loss=0.278, loss_mean=0.284][A
+Train step of epoch 0:  79%|███████▊  | 5055/6434 [11:51:08<3:10:35,  8.29s/it, gpt_loss=0.247, loss_mean=0.28] [A
+Train step of epoch 0:  79%|███████▊  | 5056/6434 [11:51:08<3:04:36,  8.04s/it, gpt_loss=0.247, loss_mean=0.28][A
+Train step of epoch 0:  79%|███████▊  | 5056/6434 [11:51:16<3:04:36,  8.04s/it, gpt_loss=0.39, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▊  | 5057/6434 [11:51:16<2:59:32,  7.82s/it, gpt_loss=0.39, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▊  | 5057/6434 [11:51:24<2:59:32,  7.82s/it, gpt_loss=0.381, loss_mean=0.3] [A
+Train step of epoch 0:  79%|███████▊  | 5058/6434 [11:51:24<3:03:46,  8.01s/it, gpt_loss=0.381, loss_mean=0.3][A
+Train step of epoch 0:  79%|███████▊  | 5058/6434 [11:51:33<3:03:46,  8.01s/it, gpt_loss=0.342, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▊  | 5059/6434 [11:51:33<3:06:34,  8.14s/it, gpt_loss=0.342, loss_mean=0.304][A
+[LID Router Debug] Step: 5060
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [5, 2, 0, 2, 1, 5, 4, 5, 4, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+Train step of epoch 0:  79%|███████▊  | 5059/6434 [11:51:42<3:06:34,  8.14s/it, gpt_loss=0.272, loss_mean=0.301][A
+Train step of epoch 0:  79%|███████▊  | 5060/6434 [11:51:42<3:11:50,  8.38s/it, gpt_loss=0.272, loss_mean=0.301][A
+Train step of epoch 0:  79%|███████▊  | 5060/6434 [11:51:50<3:11:50,  8.38s/it, gpt_loss=0.29, loss_mean=0.3]   [A
+Train step of epoch 0:  79%|███████▊  | 5061/6434 [11:51:50<3:14:50,  8.51s/it, gpt_loss=0.29, loss_mean=0.3][A
+Train step of epoch 0:  79%|███████▊  | 5061/6434 [11:51:59<3:14:50,  8.51s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▊  | 5062/6434 [11:51:59<3:14:49,  8.52s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▊  | 5062/6434 [11:52:09<3:14:49,  8.52s/it, gpt_loss=0.284, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▊  | 5063/6434 [11:52:09<3:23:39,  8.91s/it, gpt_loss=0.284, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▊  | 5063/6434 [11:52:18<3:23:39,  8.91s/it, gpt_loss=0.287, loss_mean=0.296][A
+Train step of epoch 0:  79%|███████▊  | 5064/6434 [11:52:18<3:26:09,  9.03s/it, gpt_loss=0.287, loss_mean=0.296][A
+Train step of epoch 0:  79%|███████▊  | 5064/6434 [11:52:27<3:26:09,  9.03s/it, gpt_loss=0.275, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▊  | 5065/6434 [11:52:27<3:24:07,  8.95s/it, gpt_loss=0.275, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▊  | 5065/6434 [11:52:37<3:24:07,  8.95s/it, gpt_loss=0.252, loss_mean=0.29] [A
+Train step of epoch 0:  79%|███████▊  | 5066/6434 [11:52:37<3:29:37,  9.19s/it, gpt_loss=0.252, loss_mean=0.29][A
+Train step of epoch 0:  79%|███████▊  | 5066/6434 [11:52:46<3:29:37,  9.19s/it, gpt_loss=0.342, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5067/6434 [11:52:46<3:28:37,  9.16s/it, gpt_loss=0.342, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5067/6434 [11:52:54<3:28:37,  9.16s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▉  | 5068/6434 [11:52:54<3:22:03,  8.88s/it, gpt_loss=0.251, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▉  | 5068/6434 [11:53:03<3:22:03,  8.88s/it, gpt_loss=0.309, loss_mean=0.293][A
+Train step of epoch 0:  79%|███████▉  | 5069/6434 [11:53:03<3:20:19,  8.81s/it, gpt_loss=0.309, loss_mean=0.293][A
+[LID Router Debug] Step: 5070
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [1, 0, 2, 1, 1, 9, 3, 1, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  79%|███████▉  | 5069/6434 [11:53:10<3:20:19,  8.81s/it, gpt_loss=0.34, loss_mean=0.297] [A
+Train step of epoch 0:  79%|███████▉  | 5070/6434 [11:53:10<3:14:03,  8.54s/it, gpt_loss=0.34, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▉  | 5070/6434 [11:53:20<3:14:03,  8.54s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▉  | 5071/6434 [11:53:20<3:19:12,  8.77s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▉  | 5071/6434 [11:53:28<3:19:12,  8.77s/it, gpt_loss=0.252, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5072/6434 [11:53:28<3:14:13,  8.56s/it, gpt_loss=0.252, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5072/6434 [11:53:36<3:14:13,  8.56s/it, gpt_loss=0.34, loss_mean=0.299] [A
+Train step of epoch 0:  79%|███████▉  | 5073/6434 [11:53:36<3:12:11,  8.47s/it, gpt_loss=0.34, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▉  | 5073/6434 [11:53:43<3:12:11,  8.47s/it, gpt_loss=0.305, loss_mean=0.3] [A
+Train step of epoch 0:  79%|███████▉  | 5074/6434 [11:53:43<3:03:56,  8.12s/it, gpt_loss=0.305, loss_mean=0.3][A
+Train step of epoch 0:  79%|███████▉  | 5074/6434 [11:53:53<3:03:56,  8.12s/it, gpt_loss=0.34, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5075/6434 [11:53:53<3:11:58,  8.48s/it, gpt_loss=0.34, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5075/6434 [11:54:01<3:11:58,  8.48s/it, gpt_loss=0.287, loss_mean=0.302][A
+Train step of epoch 0:  79%|███████▉  | 5076/6434 [11:54:01<3:12:31,  8.51s/it, gpt_loss=0.287, loss_mean=0.302][A
+Train step of epoch 0:  79%|███████▉  | 5076/6434 [11:54:10<3:12:31,  8.51s/it, gpt_loss=0.316, loss_mean=0.303][A
+Train step of epoch 0:  79%|███████▉  | 5077/6434 [11:54:10<3:14:55,  8.62s/it, gpt_loss=0.316, loss_mean=0.303][A
+Train step of epoch 0:  79%|███████▉  | 5077/6434 [11:54:18<3:14:55,  8.62s/it, gpt_loss=0.262, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▉  | 5078/6434 [11:54:18<3:08:17,  8.33s/it, gpt_loss=0.262, loss_mean=0.299][A
+Train step of epoch 0:  79%|███████▉  | 5078/6434 [11:54:27<3:08:17,  8.33s/it, gpt_loss=0.309, loss_mean=0.3]  [A
+Train step of epoch 0:  79%|███████▉  | 5079/6434 [11:54:27<3:16:53,  8.72s/it, gpt_loss=0.309, loss_mean=0.3][A
+[LID Router Debug] Step: 5080
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 3, 5, 2, 5, 2, 0, 2, 9, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  79%|███████▉  | 5079/6434 [11:54:35<3:16:53,  8.72s/it, gpt_loss=0.334, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5080/6434 [11:54:35<3:10:38,  8.45s/it, gpt_loss=0.334, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5080/6434 [11:54:43<3:10:38,  8.45s/it, gpt_loss=0.292, loss_mean=0.302][A
+Train step of epoch 0:  79%|███████▉  | 5081/6434 [11:54:43<3:05:11,  8.21s/it, gpt_loss=0.292, loss_mean=0.302][A
+Train step of epoch 0:  79%|███████▉  | 5081/6434 [11:54:52<3:05:11,  8.21s/it, gpt_loss=0.424, loss_mean=0.315][A
+Train step of epoch 0:  79%|███████▉  | 5082/6434 [11:54:52<3:07:58,  8.34s/it, gpt_loss=0.424, loss_mean=0.315][A
+Train step of epoch 0:  79%|███████▉  | 5082/6434 [11:54:59<3:07:58,  8.34s/it, gpt_loss=0.23, loss_mean=0.306] [A
+Train step of epoch 0:  79%|███████▉  | 5083/6434 [11:54:59<3:04:45,  8.21s/it, gpt_loss=0.23, loss_mean=0.306][A
+Train step of epoch 0:  79%|███████▉  | 5083/6434 [11:55:08<3:04:45,  8.21s/it, gpt_loss=0.283, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5084/6434 [11:55:08<3:04:51,  8.22s/it, gpt_loss=0.283, loss_mean=0.304][A
+Train step of epoch 0:  79%|███████▉  | 5084/6434 [11:55:15<3:04:51,  8.22s/it, gpt_loss=0.276, loss_mean=0.301][A
+Train step of epoch 0:  79%|███████▉  | 5085/6434 [11:55:15<3:02:00,  8.10s/it, gpt_loss=0.276, loss_mean=0.301][A
+Train step of epoch 0:  79%|███████▉  | 5085/6434 [11:55:25<3:02:00,  8.10s/it, gpt_loss=0.262, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▉  | 5086/6434 [11:55:25<3:10:32,  8.48s/it, gpt_loss=0.262, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▉  | 5086/6434 [11:55:32<3:10:32,  8.48s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▉  | 5087/6434 [11:55:32<3:02:26,  8.13s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▉  | 5087/6434 [11:55:40<3:02:26,  8.13s/it, gpt_loss=0.285, loss_mean=0.293][A
+Train step of epoch 0:  79%|███████▉  | 5088/6434 [11:55:40<2:57:40,  7.92s/it, gpt_loss=0.285, loss_mean=0.293][A
+Train step of epoch 0:  79%|███████▉  | 5088/6434 [11:55:48<2:57:40,  7.92s/it, gpt_loss=0.285, loss_mean=0.292][A
+Train step of epoch 0:  79%|███████▉  | 5089/6434 [11:55:48<3:00:11,  8.04s/it, gpt_loss=0.285, loss_mean=0.292][A
+[LID Router Debug] Step: 5090
+Batch Size: 10
+Audio Batch Size: 145
+LID Assignments: [9, 9, 5, 9, 4, 3, 9, 1, 0, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  79%|███████▉  | 5089/6434 [11:55:57<3:00:11,  8.04s/it, gpt_loss=0.289, loss_mean=0.292][A
+Train step of epoch 0:  79%|███████▉  | 5090/6434 [11:55:57<3:08:48,  8.43s/it, gpt_loss=0.289, loss_mean=0.292][A
+Train step of epoch 0:  79%|███████▉  | 5090/6434 [11:56:05<3:08:48,  8.43s/it, gpt_loss=0.229, loss_mean=0.286][A
+Train step of epoch 0:  79%|███████▉  | 5091/6434 [11:56:05<3:04:59,  8.26s/it, gpt_loss=0.229, loss_mean=0.286][A
+Train step of epoch 0:  79%|███████▉  | 5091/6434 [11:56:12<3:04:59,  8.26s/it, gpt_loss=0.271, loss_mean=0.284][A
+Train step of epoch 0:  79%|███████▉  | 5092/6434 [11:56:12<2:55:31,  7.85s/it, gpt_loss=0.271, loss_mean=0.284][A
+Train step of epoch 0:  79%|███████▉  | 5092/6434 [11:56:21<2:55:31,  7.85s/it, gpt_loss=0.256, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5093/6434 [11:56:21<3:02:25,  8.16s/it, gpt_loss=0.256, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5093/6434 [11:56:30<3:02:25,  8.16s/it, gpt_loss=0.218, loss_mean=0.275][A
+Train step of epoch 0:  79%|███████▉  | 5094/6434 [11:56:30<3:05:58,  8.33s/it, gpt_loss=0.218, loss_mean=0.275][A
+Train step of epoch 0:  79%|███████▉  | 5094/6434 [11:56:38<3:05:58,  8.33s/it, gpt_loss=0.281, loss_mean=0.276][A
+Train step of epoch 0:  79%|███████▉  | 5095/6434 [11:56:38<3:03:12,  8.21s/it, gpt_loss=0.281, loss_mean=0.276][A
+Train step of epoch 0:  79%|███████▉  | 5095/6434 [11:56:45<3:03:12,  8.21s/it, gpt_loss=0.399, loss_mean=0.288][A
+Train step of epoch 0:  79%|███████▉  | 5096/6434 [11:56:45<3:00:02,  8.07s/it, gpt_loss=0.399, loss_mean=0.288][A
+Train step of epoch 0:  79%|███████▉  | 5096/6434 [11:56:53<3:00:02,  8.07s/it, gpt_loss=0.316, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▉  | 5097/6434 [11:56:53<2:54:35,  7.83s/it, gpt_loss=0.316, loss_mean=0.291][A
+Train step of epoch 0:  79%|███████▉  | 5097/6434 [11:57:01<2:54:35,  7.83s/it, gpt_loss=0.262, loss_mean=0.288][A
+Train step of epoch 0:  79%|███████▉  | 5098/6434 [11:57:01<2:55:33,  7.88s/it, gpt_loss=0.262, loss_mean=0.288][A
+Train step of epoch 0:  79%|███████▉  | 5098/6434 [11:57:09<2:55:33,  7.88s/it, gpt_loss=0.181, loss_mean=0.277][A
+Train step of epoch 0:  79%|███████▉  | 5099/6434 [11:57:09<2:59:43,  8.08s/it, gpt_loss=0.181, loss_mean=0.277][A
+[LID Router Debug] Step: 5100
+Batch Size: 10
+Audio Batch Size: 74
+LID Assignments: [5, 1, 2, 6, 1, 1, 1, 2, 1, 2]
+Active Experts in Batch: {1, 2, 5, 6}
+
+Train step of epoch 0:  79%|███████▉  | 5099/6434 [11:57:17<2:59:43,  8.08s/it, gpt_loss=0.337, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▉  | 5100/6434 [11:57:17<2:56:06,  7.92s/it, gpt_loss=0.337, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▉  | 5100/6434 [11:57:26<2:56:06,  7.92s/it, gpt_loss=0.251, loss_mean=0.28] [A
+Train step of epoch 0:  79%|███████▉  | 5101/6434 [11:57:26<3:04:40,  8.31s/it, gpt_loss=0.251, loss_mean=0.28][A
+Train step of epoch 0:  79%|███████▉  | 5101/6434 [11:57:34<3:04:40,  8.31s/it, gpt_loss=0.303, loss_mean=0.282][A
+Train step of epoch 0:  79%|███████▉  | 5102/6434 [11:57:34<3:00:43,  8.14s/it, gpt_loss=0.303, loss_mean=0.282][A
+Train step of epoch 0:  79%|███████▉  | 5102/6434 [11:57:43<3:00:43,  8.14s/it, gpt_loss=0.272, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5103/6434 [11:57:43<3:06:52,  8.42s/it, gpt_loss=0.272, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5103/6434 [11:57:51<3:06:52,  8.42s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▉  | 5104/6434 [11:57:51<3:06:09,  8.40s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  79%|███████▉  | 5104/6434 [11:58:00<3:06:09,  8.40s/it, gpt_loss=0.262, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5105/6434 [11:58:00<3:07:51,  8.48s/it, gpt_loss=0.262, loss_mean=0.281][A
+Train step of epoch 0:  79%|███████▉  | 5105/6434 [11:58:08<3:07:51,  8.48s/it, gpt_loss=0.363, loss_mean=0.289][A
+Train step of epoch 0:  79%|███████▉  | 5106/6434 [11:58:08<3:08:09,  8.50s/it, gpt_loss=0.363, loss_mean=0.289][A
+Train step of epoch 0:  79%|███████▉  | 5106/6434 [11:58:16<3:08:09,  8.50s/it, gpt_loss=0.42, loss_mean=0.302] [A
+Train step of epoch 0:  79%|███████▉  | 5107/6434 [11:58:16<3:03:43,  8.31s/it, gpt_loss=0.42, loss_mean=0.302][A
+Train step of epoch 0:  79%|███████▉  | 5107/6434 [11:58:26<3:03:43,  8.31s/it, gpt_loss=0.256, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▉  | 5108/6434 [11:58:26<3:10:50,  8.64s/it, gpt_loss=0.256, loss_mean=0.297][A
+Train step of epoch 0:  79%|███████▉  | 5108/6434 [11:58:35<3:10:50,  8.64s/it, gpt_loss=0.308, loss_mean=0.298][A
+Train step of epoch 0:  79%|███████▉  | 5109/6434 [11:58:35<3:18:19,  8.98s/it, gpt_loss=0.308, loss_mean=0.298][A
+[LID Router Debug] Step: 5110
+Batch Size: 10
+Audio Batch Size: 149
+LID Assignments: [2, 1, 0, 9, 3, 9, 6, 2, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+Train step of epoch 0:  79%|███████▉  | 5109/6434 [11:58:44<3:18:19,  8.98s/it, gpt_loss=0.276, loss_mean=0.296][A
+Train step of epoch 0:  79%|███████▉  | 5110/6434 [11:58:44<3:18:42,  9.01s/it, gpt_loss=0.276, loss_mean=0.296][A
+Train step of epoch 0:  79%|███████▉  | 5110/6434 [11:58:53<3:18:42,  9.01s/it, gpt_loss=0.277, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▉  | 5111/6434 [11:58:53<3:12:56,  8.75s/it, gpt_loss=0.277, loss_mean=0.294][A
+Train step of epoch 0:  79%|███████▉  | 5111/6434 [11:59:01<3:12:56,  8.75s/it, gpt_loss=0.334, loss_mean=0.298][A
+Train step of epoch 0:  79%|███████▉  | 5112/6434 [11:59:01<3:09:30,  8.60s/it, gpt_loss=0.334, loss_mean=0.298][A
+Train step of epoch 0:  79%|███████▉  | 5112/6434 [11:59:10<3:09:30,  8.60s/it, gpt_loss=0.262, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5113/6434 [11:59:10<3:13:13,  8.78s/it, gpt_loss=0.262, loss_mean=0.295][A
+Train step of epoch 0:  79%|███████▉  | 5113/6434 [11:59:18<3:13:13,  8.78s/it, gpt_loss=0.217, loss_mean=0.287][A
+Train step of epoch 0:  79%|███████▉  | 5114/6434 [11:59:18<3:09:57,  8.63s/it, gpt_loss=0.217, loss_mean=0.287][A
+Train step of epoch 0:  79%|███████▉  | 5114/6434 [11:59:26<3:09:57,  8.63s/it, gpt_loss=0.323, loss_mean=0.29] [A
+Train step of epoch 0:  79%|███████▉  | 5115/6434 [11:59:26<3:04:14,  8.38s/it, gpt_loss=0.323, loss_mean=0.29][A
+Train step of epoch 0:  79%|███████▉  | 5115/6434 [11:59:35<3:04:14,  8.38s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  80%|███████▉  | 5116/6434 [11:59:35<3:06:14,  8.48s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  80%|███████▉  | 5116/6434 [11:59:42<3:06:14,  8.48s/it, gpt_loss=0.245, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5117/6434 [11:59:42<3:00:31,  8.22s/it, gpt_loss=0.245, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5117/6434 [11:59:52<3:00:31,  8.22s/it, gpt_loss=0.324, loss_mean=0.291][A
+Train step of epoch 0:  80%|███████▉  | 5118/6434 [11:59:52<3:08:05,  8.58s/it, gpt_loss=0.324, loss_mean=0.291][A
+Train step of epoch 0:  80%|███████▉  | 5118/6434 [12:00:00<3:08:05,  8.58s/it, gpt_loss=0.321, loss_mean=0.294][A
+Train step of epoch 0:  80%|███████▉  | 5119/6434 [12:00:00<3:05:02,  8.44s/it, gpt_loss=0.321, loss_mean=0.294][A
+[LID Router Debug] Step: 5120
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [5, 0, 1, 1, 3, 9, 2, 3, 4, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  80%|███████▉  | 5119/6434 [12:00:08<3:05:02,  8.44s/it, gpt_loss=0.236, loss_mean=0.288][A
+Train step of epoch 0:  80%|███████▉  | 5120/6434 [12:00:08<3:01:34,  8.29s/it, gpt_loss=0.236, loss_mean=0.288][A
+Train step of epoch 0:  80%|███████▉  | 5120/6434 [12:00:17<3:01:34,  8.29s/it, gpt_loss=0.26, loss_mean=0.285] [A
+Train step of epoch 0:  80%|███████▉  | 5121/6434 [12:00:17<3:06:31,  8.52s/it, gpt_loss=0.26, loss_mean=0.285][A
+Train step of epoch 0:  80%|███████▉  | 5121/6434 [12:00:24<3:06:31,  8.52s/it, gpt_loss=0.29, loss_mean=0.286][A
+Train step of epoch 0:  80%|███████▉  | 5122/6434 [12:00:24<3:00:08,  8.24s/it, gpt_loss=0.29, loss_mean=0.286][A
+Train step of epoch 0:  80%|███████▉  | 5122/6434 [12:00:32<3:00:08,  8.24s/it, gpt_loss=0.296, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5123/6434 [12:00:32<2:53:44,  7.95s/it, gpt_loss=0.296, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5123/6434 [12:00:40<2:53:44,  7.95s/it, gpt_loss=0.392, loss_mean=0.297][A
+Train step of epoch 0:  80%|███████▉  | 5124/6434 [12:00:40<2:57:28,  8.13s/it, gpt_loss=0.392, loss_mean=0.297][A
+Train step of epoch 0:  80%|███████▉  | 5124/6434 [12:00:50<2:57:28,  8.13s/it, gpt_loss=0.226, loss_mean=0.29] [A
+Train step of epoch 0:  80%|███████▉  | 5125/6434 [12:00:50<3:06:57,  8.57s/it, gpt_loss=0.226, loss_mean=0.29][A
+Train step of epoch 0:  80%|███████▉  | 5125/6434 [12:00:59<3:06:57,  8.57s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5126/6434 [12:00:59<3:09:46,  8.71s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5126/6434 [12:01:07<3:09:46,  8.71s/it, gpt_loss=0.313, loss_mean=0.29][A
+Train step of epoch 0:  80%|███████▉  | 5127/6434 [12:01:07<3:06:26,  8.56s/it, gpt_loss=0.313, loss_mean=0.29][A
+Train step of epoch 0:  80%|███████▉  | 5127/6434 [12:01:16<3:06:26,  8.56s/it, gpt_loss=0.289, loss_mean=0.29][A
+Train step of epoch 0:  80%|███████▉  | 5128/6434 [12:01:16<3:08:42,  8.67s/it, gpt_loss=0.289, loss_mean=0.29][A
+Train step of epoch 0:  80%|███████▉  | 5128/6434 [12:01:24<3:08:42,  8.67s/it, gpt_loss=0.375, loss_mean=0.298][A
+Train step of epoch 0:  80%|███████▉  | 5129/6434 [12:01:24<3:03:55,  8.46s/it, gpt_loss=0.375, loss_mean=0.298][A
+[LID Router Debug] Step: 5130
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [9, 9, 9, 1, 5, 0, 1, 6, 3, 6]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+Train step of epoch 0:  80%|███████▉  | 5129/6434 [12:01:33<3:03:55,  8.46s/it, gpt_loss=0.326, loss_mean=0.301][A
+Train step of epoch 0:  80%|███████▉  | 5130/6434 [12:01:33<3:10:06,  8.75s/it, gpt_loss=0.326, loss_mean=0.301][A
+Train step of epoch 0:  80%|███████▉  | 5130/6434 [12:01:42<3:10:06,  8.75s/it, gpt_loss=0.377, loss_mean=0.309][A
+Train step of epoch 0:  80%|███████▉  | 5131/6434 [12:01:42<3:08:31,  8.68s/it, gpt_loss=0.377, loss_mean=0.309][A
+Train step of epoch 0:  80%|███████▉  | 5131/6434 [12:01:52<3:08:31,  8.68s/it, gpt_loss=0.371, loss_mean=0.315][A
+Train step of epoch 0:  80%|███████▉  | 5132/6434 [12:01:52<3:18:32,  9.15s/it, gpt_loss=0.371, loss_mean=0.315][A
+Train step of epoch 0:  80%|███████▉  | 5132/6434 [12:02:00<3:18:32,  9.15s/it, gpt_loss=0.252, loss_mean=0.309][A
+Train step of epoch 0:  80%|███████▉  | 5133/6434 [12:02:00<3:11:44,  8.84s/it, gpt_loss=0.252, loss_mean=0.309][A
+Train step of epoch 0:  80%|███████▉  | 5133/6434 [12:02:09<3:11:44,  8.84s/it, gpt_loss=0.238, loss_mean=0.301][A
+Train step of epoch 0:  80%|███████▉  | 5134/6434 [12:02:09<3:12:38,  8.89s/it, gpt_loss=0.238, loss_mean=0.301][A
+Train step of epoch 0:  80%|███████▉  | 5134/6434 [12:02:18<3:12:38,  8.89s/it, gpt_loss=0.28, loss_mean=0.299] [A
+Train step of epoch 0:  80%|███████▉  | 5135/6434 [12:02:18<3:08:55,  8.73s/it, gpt_loss=0.28, loss_mean=0.299][A
+Train step of epoch 0:  80%|███████▉  | 5135/6434 [12:02:26<3:08:55,  8.73s/it, gpt_loss=0.236, loss_mean=0.293][A
+Train step of epoch 0:  80%|███████▉  | 5136/6434 [12:02:26<3:02:59,  8.46s/it, gpt_loss=0.236, loss_mean=0.293][A
+Train step of epoch 0:  80%|███████▉  | 5136/6434 [12:02:34<3:02:59,  8.46s/it, gpt_loss=0.353, loss_mean=0.299][A
+Train step of epoch 0:  80%|███████▉  | 5137/6434 [12:02:34<3:04:00,  8.51s/it, gpt_loss=0.353, loss_mean=0.299][A
+Train step of epoch 0:  80%|███████▉  | 5137/6434 [12:02:42<3:04:00,  8.51s/it, gpt_loss=0.232, loss_mean=0.292][A
+Train step of epoch 0:  80%|███████▉  | 5138/6434 [12:02:42<2:59:48,  8.32s/it, gpt_loss=0.232, loss_mean=0.292][A
+Train step of epoch 0:  80%|███████▉  | 5138/6434 [12:02:51<2:59:48,  8.32s/it, gpt_loss=0.237, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5139/6434 [12:02:51<3:03:23,  8.50s/it, gpt_loss=0.237, loss_mean=0.287][A
+[LID Router Debug] Step: 5140
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 0, 0, 4, 4, 2, 5, 9, 1, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  80%|███████▉  | 5139/6434 [12:02:59<3:03:23,  8.50s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5140/6434 [12:02:59<3:00:41,  8.38s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  80%|███████▉  | 5140/6434 [12:03:07<3:00:41,  8.38s/it, gpt_loss=0.306, loss_mean=0.289][A
+Train step of epoch 0:  80%|███████▉  | 5141/6434 [12:03:07<2:57:17,  8.23s/it, gpt_loss=0.306, loss_mean=0.289][A
+Train step of epoch 0:  80%|███████▉  | 5141/6434 [12:03:16<2:57:17,  8.23s/it, gpt_loss=0.248, loss_mean=0.285][A
+Train step of epoch 0:  80%|███████▉  | 5142/6434 [12:03:16<3:02:22,  8.47s/it, gpt_loss=0.248, loss_mean=0.285][A
+Train step of epoch 0:  80%|███████▉  | 5142/6434 [12:03:25<3:02:22,  8.47s/it, gpt_loss=0.235, loss_mean=0.28] [A
+Train step of epoch 0:  80%|███████▉  | 5143/6434 [12:03:25<3:07:21,  8.71s/it, gpt_loss=0.235, loss_mean=0.28][A
+Train step of epoch 0:  80%|███████▉  | 5143/6434 [12:03:34<3:07:21,  8.71s/it, gpt_loss=0.222, loss_mean=0.274][A
+Train step of epoch 0:  80%|███████▉  | 5144/6434 [12:03:34<3:06:55,  8.69s/it, gpt_loss=0.222, loss_mean=0.274][A
+Train step of epoch 0:  80%|███████▉  | 5144/6434 [12:03:42<3:06:55,  8.69s/it, gpt_loss=0.351, loss_mean=0.282][A
+Train step of epoch 0:  80%|███████▉  | 5145/6434 [12:03:42<3:05:26,  8.63s/it, gpt_loss=0.351, loss_mean=0.282][A
+Train step of epoch 0:  80%|███████▉  | 5145/6434 [12:03:51<3:05:26,  8.63s/it, gpt_loss=0.219, loss_mean=0.275][A
+Train step of epoch 0:  80%|███████▉  | 5146/6434 [12:03:51<3:06:15,  8.68s/it, gpt_loss=0.219, loss_mean=0.275][A
+Train step of epoch 0:  80%|███████▉  | 5146/6434 [12:04:00<3:06:15,  8.68s/it, gpt_loss=0.415, loss_mean=0.289][A
+Train step of epoch 0:  80%|███████▉  | 5147/6434 [12:04:00<3:08:31,  8.79s/it, gpt_loss=0.415, loss_mean=0.289][A
+Train step of epoch 0:  80%|███████▉  | 5147/6434 [12:04:08<3:08:31,  8.79s/it, gpt_loss=0.29, loss_mean=0.289] [A
+Train step of epoch 0:  80%|████████  | 5148/6434 [12:04:08<2:58:54,  8.35s/it, gpt_loss=0.29, loss_mean=0.289][A
+Train step of epoch 0:  80%|████████  | 5148/6434 [12:04:16<2:58:54,  8.35s/it, gpt_loss=0.285, loss_mean=0.289][A
+Train step of epoch 0:  80%|████████  | 5149/6434 [12:04:16<3:02:38,  8.53s/it, gpt_loss=0.285, loss_mean=0.289][A
+[LID Router Debug] Step: 5150
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [1, 1, 2, 1, 5, 5, 1, 6, 0, 2]
+Active Experts in Batch: {0, 1, 2, 5, 6}
+
+Train step of epoch 0:  80%|████████  | 5149/6434 [12:04:25<3:02:38,  8.53s/it, gpt_loss=0.375, loss_mean=0.297][A
+Train step of epoch 0:  80%|████████  | 5150/6434 [12:04:25<3:00:04,  8.41s/it, gpt_loss=0.375, loss_mean=0.297][A
+Train step of epoch 0:  80%|████████  | 5150/6434 [12:04:33<3:00:04,  8.41s/it, gpt_loss=0.302, loss_mean=0.298][A
+Train step of epoch 0:  80%|████████  | 5151/6434 [12:04:33<3:01:33,  8.49s/it, gpt_loss=0.302, loss_mean=0.298][A
+Train step of epoch 0:  80%|████████  | 5151/6434 [12:04:41<3:01:33,  8.49s/it, gpt_loss=0.267, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5152/6434 [12:04:41<2:55:20,  8.21s/it, gpt_loss=0.267, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5152/6434 [12:04:50<2:55:20,  8.21s/it, gpt_loss=0.322, loss_mean=0.297][A
+Train step of epoch 0:  80%|████████  | 5153/6434 [12:04:50<3:02:32,  8.55s/it, gpt_loss=0.322, loss_mean=0.297][A
+Train step of epoch 0:  80%|████████  | 5153/6434 [12:04:58<3:02:32,  8.55s/it, gpt_loss=0.22, loss_mean=0.29]  [A
+Train step of epoch 0:  80%|████████  | 5154/6434 [12:04:58<3:00:38,  8.47s/it, gpt_loss=0.22, loss_mean=0.29][A
+Train step of epoch 0:  80%|████████  | 5154/6434 [12:05:06<3:00:38,  8.47s/it, gpt_loss=0.324, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5155/6434 [12:05:06<2:57:02,  8.31s/it, gpt_loss=0.324, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5155/6434 [12:05:15<2:57:02,  8.31s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5156/6434 [12:05:15<2:59:04,  8.41s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5156/6434 [12:05:23<2:59:04,  8.41s/it, gpt_loss=0.383, loss_mean=0.304][A
+Train step of epoch 0:  80%|████████  | 5157/6434 [12:05:23<2:56:01,  8.27s/it, gpt_loss=0.383, loss_mean=0.304][A
+Train step of epoch 0:  80%|████████  | 5157/6434 [12:05:32<2:56:01,  8.27s/it, gpt_loss=0.318, loss_mean=0.305][A
+Train step of epoch 0:  80%|████████  | 5158/6434 [12:05:32<3:03:18,  8.62s/it, gpt_loss=0.318, loss_mean=0.305][A
+Train step of epoch 0:  80%|████████  | 5158/6434 [12:05:42<3:03:18,  8.62s/it, gpt_loss=0.306, loss_mean=0.305][A
+Train step of epoch 0:  80%|████████  | 5159/6434 [12:05:42<3:06:56,  8.80s/it, gpt_loss=0.306, loss_mean=0.305][A
+[LID Router Debug] Step: 5160
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [5, 1, 9, 2, 1, 1, 2, 2, 3, 2]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  80%|████████  | 5159/6434 [12:05:52<3:06:56,  8.80s/it, gpt_loss=0.244, loss_mean=0.299][A
+Train step of epoch 0:  80%|████████  | 5160/6434 [12:05:52<3:16:32,  9.26s/it, gpt_loss=0.244, loss_mean=0.299][A
+Train step of epoch 0:  80%|████████  | 5160/6434 [12:05:59<3:16:32,  9.26s/it, gpt_loss=0.353, loss_mean=0.305][A
+Train step of epoch 0:  80%|████████  | 5161/6434 [12:05:59<3:04:39,  8.70s/it, gpt_loss=0.353, loss_mean=0.305][A
+Train step of epoch 0:  80%|████████  | 5161/6434 [12:06:07<3:04:39,  8.70s/it, gpt_loss=0.267, loss_mean=0.301][A
+Train step of epoch 0:  80%|████████  | 5162/6434 [12:06:07<3:00:39,  8.52s/it, gpt_loss=0.267, loss_mean=0.301][A
+Train step of epoch 0:  80%|████████  | 5162/6434 [12:06:15<3:00:39,  8.52s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  80%|████████  | 5163/6434 [12:06:15<2:54:56,  8.26s/it, gpt_loss=0.324, loss_mean=0.303][A
+Train step of epoch 0:  80%|████████  | 5163/6434 [12:06:23<2:54:56,  8.26s/it, gpt_loss=0.359, loss_mean=0.309][A
+Train step of epoch 0:  80%|████████  | 5164/6434 [12:06:23<2:52:50,  8.17s/it, gpt_loss=0.359, loss_mean=0.309][A
+Train step of epoch 0:  80%|████████  | 5164/6434 [12:06:32<2:52:50,  8.17s/it, gpt_loss=0.202, loss_mean=0.298][A
+Train step of epoch 0:  80%|████████  | 5165/6434 [12:06:32<2:56:27,  8.34s/it, gpt_loss=0.202, loss_mean=0.298][A
+Train step of epoch 0:  80%|████████  | 5165/6434 [12:06:40<2:56:27,  8.34s/it, gpt_loss=0.239, loss_mean=0.292][A
+Train step of epoch 0:  80%|████████  | 5166/6434 [12:06:40<2:52:11,  8.15s/it, gpt_loss=0.239, loss_mean=0.292][A
+Train step of epoch 0:  80%|████████  | 5166/6434 [12:06:49<2:52:11,  8.15s/it, gpt_loss=0.32, loss_mean=0.295] [A
+Train step of epoch 0:  80%|████████  | 5167/6434 [12:06:49<3:00:15,  8.54s/it, gpt_loss=0.32, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5167/6434 [12:06:57<3:00:15,  8.54s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5168/6434 [12:06:57<2:59:23,  8.50s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  80%|████████  | 5168/6434 [12:07:06<2:59:23,  8.50s/it, gpt_loss=0.284, loss_mean=0.294][A
+Train step of epoch 0:  80%|████████  | 5169/6434 [12:07:06<3:01:09,  8.59s/it, gpt_loss=0.284, loss_mean=0.294][A
+[LID Router Debug] Step: 5170
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [9, 6, 5, 3, 1, 1, 2, 3, 1, 6]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  80%|████████  | 5169/6434 [12:07:15<3:01:09,  8.59s/it, gpt_loss=0.283, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5170/6434 [12:07:15<2:59:24,  8.52s/it, gpt_loss=0.283, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5170/6434 [12:07:22<2:59:24,  8.52s/it, gpt_loss=0.273, loss_mean=0.291][A
+Train step of epoch 0:  80%|████████  | 5171/6434 [12:07:22<2:52:33,  8.20s/it, gpt_loss=0.273, loss_mean=0.291][A
+Train step of epoch 0:  80%|████████  | 5171/6434 [12:07:30<2:52:33,  8.20s/it, gpt_loss=0.285, loss_mean=0.29] [A
+Train step of epoch 0:  80%|████████  | 5172/6434 [12:07:30<2:51:18,  8.14s/it, gpt_loss=0.285, loss_mean=0.29][A
+Train step of epoch 0:  80%|████████  | 5172/6434 [12:07:39<2:51:18,  8.14s/it, gpt_loss=0.329, loss_mean=0.294][A
+Train step of epoch 0:  80%|████████  | 5173/6434 [12:07:39<2:57:14,  8.43s/it, gpt_loss=0.329, loss_mean=0.294][A
+Train step of epoch 0:  80%|████████  | 5173/6434 [12:07:48<2:57:14,  8.43s/it, gpt_loss=0.248, loss_mean=0.29] [A
+Train step of epoch 0:  80%|████████  | 5174/6434 [12:07:48<3:00:47,  8.61s/it, gpt_loss=0.248, loss_mean=0.29][A
+Train step of epoch 0:  80%|████████  | 5174/6434 [12:07:57<3:00:47,  8.61s/it, gpt_loss=0.336, loss_mean=0.294][A
+Train step of epoch 0:  80%|████████  | 5175/6434 [12:07:57<3:03:39,  8.75s/it, gpt_loss=0.336, loss_mean=0.294][A
+Train step of epoch 0:  80%|████████  | 5175/6434 [12:08:06<3:03:39,  8.75s/it, gpt_loss=0.314, loss_mean=0.296][A
+Train step of epoch 0:  80%|████████  | 5176/6434 [12:08:06<3:06:07,  8.88s/it, gpt_loss=0.314, loss_mean=0.296][A
+Train step of epoch 0:  80%|████████  | 5176/6434 [12:08:14<3:06:07,  8.88s/it, gpt_loss=0.259, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5177/6434 [12:08:14<2:59:39,  8.58s/it, gpt_loss=0.259, loss_mean=0.293][A
+Train step of epoch 0:  80%|████████  | 5177/6434 [12:08:23<2:59:39,  8.58s/it, gpt_loss=0.283, loss_mean=0.292][A
+Train step of epoch 0:  80%|████████  | 5178/6434 [12:08:23<3:03:10,  8.75s/it, gpt_loss=0.283, loss_mean=0.292][A
+Train step of epoch 0:  80%|████████  | 5178/6434 [12:08:31<3:03:10,  8.75s/it, gpt_loss=0.353, loss_mean=0.298][A
+Train step of epoch 0:  80%|████████  | 5179/6434 [12:08:31<2:56:54,  8.46s/it, gpt_loss=0.353, loss_mean=0.298][A
+[LID Router Debug] Step: 5180
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [3, 5, 0, 5, 4, 1, 4, 4, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  80%|████████  | 5179/6434 [12:08:39<2:56:54,  8.46s/it, gpt_loss=0.347, loss_mean=0.303][A
+Train step of epoch 0:  81%|████████  | 5180/6434 [12:08:39<2:54:16,  8.34s/it, gpt_loss=0.347, loss_mean=0.303][A
+Train step of epoch 0:  81%|████████  | 5180/6434 [12:08:48<2:54:16,  8.34s/it, gpt_loss=0.362, loss_mean=0.309][A
+Train step of epoch 0:  81%|████████  | 5181/6434 [12:08:48<2:58:28,  8.55s/it, gpt_loss=0.362, loss_mean=0.309][A
+Train step of epoch 0:  81%|████████  | 5181/6434 [12:08:57<2:58:28,  8.55s/it, gpt_loss=0.304, loss_mean=0.308][A
+Train step of epoch 0:  81%|████████  | 5182/6434 [12:08:57<3:01:35,  8.70s/it, gpt_loss=0.304, loss_mean=0.308][A
+Train step of epoch 0:  81%|████████  | 5182/6434 [12:09:05<3:01:35,  8.70s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████  | 5183/6434 [12:09:05<2:56:44,  8.48s/it, gpt_loss=0.297, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████  | 5183/6434 [12:09:14<2:56:44,  8.48s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  81%|████████  | 5184/6434 [12:09:14<2:59:39,  8.62s/it, gpt_loss=0.293, loss_mean=0.306][A
+Train step of epoch 0:  81%|████████  | 5184/6434 [12:09:23<2:59:39,  8.62s/it, gpt_loss=0.264, loss_mean=0.301][A
+Train step of epoch 0:  81%|████████  | 5185/6434 [12:09:23<2:59:10,  8.61s/it, gpt_loss=0.264, loss_mean=0.301][A
+Train step of epoch 0:  81%|████████  | 5185/6434 [12:09:30<2:59:10,  8.61s/it, gpt_loss=0.365, loss_mean=0.308][A
+Train step of epoch 0:  81%|████████  | 5186/6434 [12:09:30<2:49:57,  8.17s/it, gpt_loss=0.365, loss_mean=0.308][A
+Train step of epoch 0:  81%|████████  | 5186/6434 [12:09:40<2:49:57,  8.17s/it, gpt_loss=0.262, loss_mean=0.303][A
+Train step of epoch 0:  81%|████████  | 5187/6434 [12:09:40<3:02:53,  8.80s/it, gpt_loss=0.262, loss_mean=0.303][A
+Train step of epoch 0:  81%|████████  | 5187/6434 [12:09:49<3:02:53,  8.80s/it, gpt_loss=0.36, loss_mean=0.309] [A
+Train step of epoch 0:  81%|████████  | 5188/6434 [12:09:49<3:01:55,  8.76s/it, gpt_loss=0.36, loss_mean=0.309][A
+Train step of epoch 0:  81%|████████  | 5188/6434 [12:09:58<3:01:55,  8.76s/it, gpt_loss=0.344, loss_mean=0.312][A
+Train step of epoch 0:  81%|████████  | 5189/6434 [12:09:58<3:03:37,  8.85s/it, gpt_loss=0.344, loss_mean=0.312][A
+[LID Router Debug] Step: 5190
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [5, 1, 0, 0, 2, 6, 2, 1, 6, 6]
+Active Experts in Batch: {0, 1, 2, 5, 6}
+
+Train step of epoch 0:  81%|████████  | 5189/6434 [12:10:06<3:03:37,  8.85s/it, gpt_loss=0.253, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████  | 5190/6434 [12:10:06<2:56:20,  8.50s/it, gpt_loss=0.253, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████  | 5190/6434 [12:10:15<2:56:20,  8.50s/it, gpt_loss=0.291, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5191/6434 [12:10:15<2:59:47,  8.68s/it, gpt_loss=0.291, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5191/6434 [12:10:22<2:59:47,  8.68s/it, gpt_loss=0.305, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5192/6434 [12:10:22<2:51:47,  8.30s/it, gpt_loss=0.305, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5192/6434 [12:10:31<2:51:47,  8.30s/it, gpt_loss=0.312, loss_mean=0.306][A
+Train step of epoch 0:  81%|████████  | 5193/6434 [12:10:31<2:56:12,  8.52s/it, gpt_loss=0.312, loss_mean=0.306][A
+Train step of epoch 0:  81%|████████  | 5193/6434 [12:10:40<2:56:12,  8.52s/it, gpt_loss=0.299, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5194/6434 [12:10:40<2:55:51,  8.51s/it, gpt_loss=0.299, loss_mean=0.305][A
+Train step of epoch 0:  81%|████████  | 5194/6434 [12:10:48<2:55:51,  8.51s/it, gpt_loss=0.275, loss_mean=0.302][A
+Train step of epoch 0:  81%|████████  | 5195/6434 [12:10:48<2:55:31,  8.50s/it, gpt_loss=0.275, loss_mean=0.302][A
+Train step of epoch 0:  81%|████████  | 5195/6434 [12:10:58<2:55:31,  8.50s/it, gpt_loss=0.263, loss_mean=0.298][A
+Train step of epoch 0:  81%|████████  | 5196/6434 [12:10:58<3:02:15,  8.83s/it, gpt_loss=0.263, loss_mean=0.298][A
+Train step of epoch 0:  81%|████████  | 5196/6434 [12:11:06<3:02:15,  8.83s/it, gpt_loss=0.225, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5197/6434 [12:11:06<2:58:02,  8.64s/it, gpt_loss=0.225, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5197/6434 [12:11:14<2:58:02,  8.64s/it, gpt_loss=0.279, loss_mean=0.29] [A
+Train step of epoch 0:  81%|████████  | 5198/6434 [12:11:14<2:52:30,  8.37s/it, gpt_loss=0.279, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5198/6434 [12:11:22<2:52:30,  8.37s/it, gpt_loss=0.241, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5199/6434 [12:11:22<2:53:30,  8.43s/it, gpt_loss=0.241, loss_mean=0.285][A
+[LID Router Debug] Step: 5200
+Batch Size: 10
+Audio Batch Size: 140
+LID Assignments: [3, 2, 3, 6, 9, 5, 9, 2, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+[2026-02-07 04:07:35,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=0, lr=[1.679989062579352e-05, 1.679989062579352e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 04:07:35,768] [INFO] [timer.py:260:stop] epoch=0/micro_step=5200/global_step=2600, RunningAvgSamplesPerSec=4.748991686948652, CurrSamplesPerSec=4.562462811661392, MemAllocated=13.04GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  81%|████████  | 5199/6434 [12:11:31<2:53:30,  8.43s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  81%|████████  | 5200/6434 [12:11:31<2:56:54,  8.60s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  81%|████████  | 5200/6434 [12:11:39<2:56:54,  8.60s/it, gpt_loss=0.289, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5201/6434 [12:11:39<2:52:18,  8.39s/it, gpt_loss=0.289, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5201/6434 [12:11:48<2:52:18,  8.39s/it, gpt_loss=0.342, loss_mean=0.29] [A
+Train step of epoch 0:  81%|████████  | 5202/6434 [12:11:48<2:51:48,  8.37s/it, gpt_loss=0.342, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5202/6434 [12:11:55<2:51:48,  8.37s/it, gpt_loss=0.201, loss_mean=0.281][A
+Train step of epoch 0:  81%|████████  | 5203/6434 [12:11:55<2:48:02,  8.19s/it, gpt_loss=0.201, loss_mean=0.281][A
+Train step of epoch 0:  81%|████████  | 5203/6434 [12:12:03<2:48:02,  8.19s/it, gpt_loss=0.31, loss_mean=0.284] [A
+Train step of epoch 0:  81%|████████  | 5204/6434 [12:12:03<2:44:29,  8.02s/it, gpt_loss=0.31, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5204/6434 [12:12:12<2:44:29,  8.02s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  81%|████████  | 5205/6434 [12:12:12<2:53:42,  8.48s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  81%|████████  | 5205/6434 [12:12:21<2:53:42,  8.48s/it, gpt_loss=0.266, loss_mean=0.281][A
+Train step of epoch 0:  81%|████████  | 5206/6434 [12:12:21<2:52:51,  8.45s/it, gpt_loss=0.266, loss_mean=0.281][A
+Train step of epoch 0:  81%|████████  | 5206/6434 [12:12:29<2:52:51,  8.45s/it, gpt_loss=0.334, loss_mean=0.286][A
+Train step of epoch 0:  81%|████████  | 5207/6434 [12:12:29<2:50:18,  8.33s/it, gpt_loss=0.334, loss_mean=0.286][A
+Train step of epoch 0:  81%|████████  | 5207/6434 [12:12:38<2:50:18,  8.33s/it, gpt_loss=0.334, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5208/6434 [12:12:38<2:54:57,  8.56s/it, gpt_loss=0.334, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5208/6434 [12:12:46<2:54:57,  8.56s/it, gpt_loss=0.263, loss_mean=0.288][A
+Train step of epoch 0:  81%|████████  | 5209/6434 [12:12:46<2:51:47,  8.41s/it, gpt_loss=0.263, loss_mean=0.288][A
+[LID Router Debug] Step: 5210
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [3, 1, 5, 5, 3, 3, 0, 5, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5}
+
+Train step of epoch 0:  81%|████████  | 5209/6434 [12:12:54<2:51:47,  8.41s/it, gpt_loss=0.251, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5210/6434 [12:12:54<2:50:07,  8.34s/it, gpt_loss=0.251, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5210/6434 [12:13:03<2:50:07,  8.34s/it, gpt_loss=0.286, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5211/6434 [12:13:03<2:52:14,  8.45s/it, gpt_loss=0.286, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5211/6434 [12:13:11<2:52:14,  8.45s/it, gpt_loss=0.318, loss_mean=0.288][A
+Train step of epoch 0:  81%|████████  | 5212/6434 [12:13:11<2:49:00,  8.30s/it, gpt_loss=0.318, loss_mean=0.288][A
+Train step of epoch 0:  81%|████████  | 5212/6434 [12:13:19<2:49:00,  8.30s/it, gpt_loss=0.262, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5213/6434 [12:13:19<2:47:08,  8.21s/it, gpt_loss=0.262, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5213/6434 [12:13:27<2:47:08,  8.21s/it, gpt_loss=0.359, loss_mean=0.293][A
+Train step of epoch 0:  81%|████████  | 5214/6434 [12:13:27<2:47:38,  8.25s/it, gpt_loss=0.359, loss_mean=0.293][A
+Train step of epoch 0:  81%|████████  | 5214/6434 [12:13:36<2:47:38,  8.25s/it, gpt_loss=0.305, loss_mean=0.294][A
+Train step of epoch 0:  81%|████████  | 5215/6434 [12:13:36<2:51:32,  8.44s/it, gpt_loss=0.305, loss_mean=0.294][A
+Train step of epoch 0:  81%|████████  | 5215/6434 [12:13:44<2:51:32,  8.44s/it, gpt_loss=0.267, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5216/6434 [12:13:44<2:50:05,  8.38s/it, gpt_loss=0.267, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5216/6434 [12:13:53<2:50:05,  8.38s/it, gpt_loss=0.275, loss_mean=0.29] [A
+Train step of epoch 0:  81%|████████  | 5217/6434 [12:13:53<2:53:41,  8.56s/it, gpt_loss=0.275, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5217/6434 [12:14:02<2:53:41,  8.56s/it, gpt_loss=0.23, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5218/6434 [12:14:02<2:56:24,  8.70s/it, gpt_loss=0.23, loss_mean=0.284][A
+Train step of epoch 0:  81%|████████  | 5218/6434 [12:14:12<2:56:24,  8.70s/it, gpt_loss=0.302, loss_mean=0.285][A
+Train step of epoch 0:  81%|████████  | 5219/6434 [12:14:12<2:59:36,  8.87s/it, gpt_loss=0.302, loss_mean=0.285][A
+[LID Router Debug] Step: 5220
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 1, 2, 6, 2, 2, 4, 9, 3, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  81%|████████  | 5219/6434 [12:14:20<2:59:36,  8.87s/it, gpt_loss=0.322, loss_mean=0.289][A
+Train step of epoch 0:  81%|████████  | 5220/6434 [12:14:20<2:55:10,  8.66s/it, gpt_loss=0.322, loss_mean=0.289][A
+Train step of epoch 0:  81%|████████  | 5220/6434 [12:14:30<2:55:10,  8.66s/it, gpt_loss=0.296, loss_mean=0.29] [A
+Train step of epoch 0:  81%|████████  | 5221/6434 [12:14:30<3:01:38,  8.98s/it, gpt_loss=0.296, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5221/6434 [12:14:39<3:01:38,  8.98s/it, gpt_loss=0.338, loss_mean=0.295][A
+Train step of epoch 0:  81%|████████  | 5222/6434 [12:14:39<3:03:35,  9.09s/it, gpt_loss=0.338, loss_mean=0.295][A
+Train step of epoch 0:  81%|████████  | 5222/6434 [12:14:47<3:03:35,  9.09s/it, gpt_loss=0.3, loss_mean=0.295]  [A
+Train step of epoch 0:  81%|████████  | 5223/6434 [12:14:47<2:58:42,  8.85s/it, gpt_loss=0.3, loss_mean=0.295][A
+Train step of epoch 0:  81%|████████  | 5223/6434 [12:14:55<2:58:42,  8.85s/it, gpt_loss=0.246, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5224/6434 [12:14:55<2:50:21,  8.45s/it, gpt_loss=0.246, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5224/6434 [12:15:03<2:50:21,  8.45s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5225/6434 [12:15:03<2:50:34,  8.47s/it, gpt_loss=0.292, loss_mean=0.291][A
+Train step of epoch 0:  81%|████████  | 5225/6434 [12:15:12<2:50:34,  8.47s/it, gpt_loss=0.307, loss_mean=0.292][A
+Train step of epoch 0:  81%|████████  | 5226/6434 [12:15:12<2:50:25,  8.47s/it, gpt_loss=0.307, loss_mean=0.292][A
+Train step of epoch 0:  81%|████████  | 5226/6434 [12:15:20<2:50:25,  8.47s/it, gpt_loss=0.275, loss_mean=0.29] [A
+Train step of epoch 0:  81%|████████  | 5227/6434 [12:15:20<2:48:40,  8.38s/it, gpt_loss=0.275, loss_mean=0.29][A
+Train step of epoch 0:  81%|████████  | 5227/6434 [12:15:29<2:48:40,  8.38s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  81%|████████▏ | 5228/6434 [12:15:29<2:52:29,  8.58s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  81%|████████▏ | 5228/6434 [12:15:37<2:52:29,  8.58s/it, gpt_loss=0.333, loss_mean=0.293][A
+Train step of epoch 0:  81%|████████▏ | 5229/6434 [12:15:37<2:52:28,  8.59s/it, gpt_loss=0.333, loss_mean=0.293][A
+[LID Router Debug] Step: 5230
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 3, 4, 4, 5, 2, 6, 0, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  81%|████████▏ | 5229/6434 [12:15:46<2:52:28,  8.59s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  81%|████████▏ | 5230/6434 [12:15:46<2:50:31,  8.50s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  81%|████████▏ | 5230/6434 [12:15:54<2:50:31,  8.50s/it, gpt_loss=0.38, loss_mean=0.302] [A
+Train step of epoch 0:  81%|████████▏ | 5231/6434 [12:15:54<2:46:17,  8.29s/it, gpt_loss=0.38, loss_mean=0.302][A
+Train step of epoch 0:  81%|████████▏ | 5231/6434 [12:16:02<2:46:17,  8.29s/it, gpt_loss=0.303, loss_mean=0.302][A
+Train step of epoch 0:  81%|████████▏ | 5232/6434 [12:16:02<2:45:31,  8.26s/it, gpt_loss=0.303, loss_mean=0.302][A
+Train step of epoch 0:  81%|████████▏ | 5232/6434 [12:16:10<2:45:31,  8.26s/it, gpt_loss=0.257, loss_mean=0.297][A
+Train step of epoch 0:  81%|████████▏ | 5233/6434 [12:16:10<2:45:37,  8.27s/it, gpt_loss=0.257, loss_mean=0.297][A
+Train step of epoch 0:  81%|████████▏ | 5233/6434 [12:16:19<2:45:37,  8.27s/it, gpt_loss=0.366, loss_mean=0.304][A
+Train step of epoch 0:  81%|████████▏ | 5234/6434 [12:16:19<2:46:31,  8.33s/it, gpt_loss=0.366, loss_mean=0.304][A
+Train step of epoch 0:  81%|████████▏ | 5234/6434 [12:16:26<2:46:31,  8.33s/it, gpt_loss=0.32, loss_mean=0.306] [A
+Train step of epoch 0:  81%|████████▏ | 5235/6434 [12:16:26<2:41:47,  8.10s/it, gpt_loss=0.32, loss_mean=0.306][A
+Train step of epoch 0:  81%|████████▏ | 5235/6434 [12:16:35<2:41:47,  8.10s/it, gpt_loss=0.352, loss_mean=0.31][A
+Train step of epoch 0:  81%|████████▏ | 5236/6434 [12:16:35<2:47:25,  8.39s/it, gpt_loss=0.352, loss_mean=0.31][A
+Train step of epoch 0:  81%|████████▏ | 5236/6434 [12:16:44<2:47:25,  8.39s/it, gpt_loss=0.301, loss_mean=0.31][A
+Train step of epoch 0:  81%|████████▏ | 5237/6434 [12:16:44<2:49:31,  8.50s/it, gpt_loss=0.301, loss_mean=0.31][A
+Train step of epoch 0:  81%|████████▏ | 5237/6434 [12:16:52<2:49:31,  8.50s/it, gpt_loss=0.285, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████▏ | 5238/6434 [12:16:52<2:48:55,  8.47s/it, gpt_loss=0.285, loss_mean=0.307][A
+Train step of epoch 0:  81%|████████▏ | 5238/6434 [12:17:00<2:48:55,  8.47s/it, gpt_loss=0.27, loss_mean=0.303] [A
+Train step of epoch 0:  81%|████████▏ | 5239/6434 [12:17:00<2:45:41,  8.32s/it, gpt_loss=0.27, loss_mean=0.303][A
+[LID Router Debug] Step: 5240
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 3, 1, 5, 3, 5, 4, 0, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  81%|████████▏ | 5239/6434 [12:17:08<2:45:41,  8.32s/it, gpt_loss=0.307, loss_mean=0.304][A
+Train step of epoch 0:  81%|████████▏ | 5240/6434 [12:17:08<2:43:10,  8.20s/it, gpt_loss=0.307, loss_mean=0.304][A
+Train step of epoch 0:  81%|████████▏ | 5240/6434 [12:17:16<2:43:10,  8.20s/it, gpt_loss=0.25, loss_mean=0.298] [A
+Train step of epoch 0:  81%|████████▏ | 5241/6434 [12:17:16<2:42:51,  8.19s/it, gpt_loss=0.25, loss_mean=0.298][A
+Train step of epoch 0:  81%|████████▏ | 5241/6434 [12:17:24<2:42:51,  8.19s/it, gpt_loss=0.266, loss_mean=0.295][A
+Train step of epoch 0:  81%|████████▏ | 5242/6434 [12:17:24<2:42:06,  8.16s/it, gpt_loss=0.266, loss_mean=0.295][A
+Train step of epoch 0:  81%|████████▏ | 5242/6434 [12:17:32<2:42:06,  8.16s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  81%|████████▏ | 5243/6434 [12:17:32<2:39:13,  8.02s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  81%|████████▏ | 5243/6434 [12:17:42<2:39:13,  8.02s/it, gpt_loss=0.346, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5244/6434 [12:17:42<2:47:10,  8.43s/it, gpt_loss=0.346, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5244/6434 [12:17:49<2:47:10,  8.43s/it, gpt_loss=0.383, loss_mean=0.306][A
+Train step of epoch 0:  82%|████████▏ | 5245/6434 [12:17:49<2:43:04,  8.23s/it, gpt_loss=0.383, loss_mean=0.306][A
+Train step of epoch 0:  82%|████████▏ | 5245/6434 [12:17:58<2:43:04,  8.23s/it, gpt_loss=0.338, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5246/6434 [12:17:58<2:44:17,  8.30s/it, gpt_loss=0.338, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5246/6434 [12:18:06<2:44:17,  8.30s/it, gpt_loss=0.32, loss_mean=0.31]  [A
+Train step of epoch 0:  82%|████████▏ | 5247/6434 [12:18:06<2:45:07,  8.35s/it, gpt_loss=0.32, loss_mean=0.31][A
+Train step of epoch 0:  82%|████████▏ | 5247/6434 [12:18:14<2:45:07,  8.35s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5248/6434 [12:18:14<2:44:23,  8.32s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5248/6434 [12:18:23<2:44:23,  8.32s/it, gpt_loss=0.254, loss_mean=0.304][A
+Train step of epoch 0:  82%|████████▏ | 5249/6434 [12:18:23<2:45:50,  8.40s/it, gpt_loss=0.254, loss_mean=0.304][A
+[LID Router Debug] Step: 5250
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [9, 5, 5, 2, 2, 0, 2, 9, 6, 1]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9}
+
+Train step of epoch 0:  82%|████████▏ | 5249/6434 [12:18:31<2:45:50,  8.40s/it, gpt_loss=0.291, loss_mean=0.303][A
+Train step of epoch 0:  82%|████████▏ | 5250/6434 [12:18:31<2:42:21,  8.23s/it, gpt_loss=0.291, loss_mean=0.303][A
+Train step of epoch 0:  82%|████████▏ | 5250/6434 [12:18:39<2:42:21,  8.23s/it, gpt_loss=0.28, loss_mean=0.3]   [A
+Train step of epoch 0:  82%|████████▏ | 5251/6434 [12:18:39<2:44:28,  8.34s/it, gpt_loss=0.28, loss_mean=0.3][A
+Train step of epoch 0:  82%|████████▏ | 5251/6434 [12:18:47<2:44:28,  8.34s/it, gpt_loss=0.238, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5252/6434 [12:18:47<2:39:05,  8.08s/it, gpt_loss=0.238, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5252/6434 [12:18:55<2:39:05,  8.08s/it, gpt_loss=0.241, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5253/6434 [12:18:55<2:36:11,  7.94s/it, gpt_loss=0.241, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5253/6434 [12:19:04<2:36:11,  7.94s/it, gpt_loss=0.278, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5254/6434 [12:19:04<2:42:20,  8.26s/it, gpt_loss=0.278, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5254/6434 [12:19:12<2:42:20,  8.26s/it, gpt_loss=0.332, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5255/6434 [12:19:12<2:41:55,  8.24s/it, gpt_loss=0.332, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5255/6434 [12:19:20<2:41:55,  8.24s/it, gpt_loss=0.252, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5256/6434 [12:19:20<2:40:29,  8.17s/it, gpt_loss=0.252, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5256/6434 [12:19:28<2:40:29,  8.17s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5257/6434 [12:19:28<2:40:07,  8.16s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5257/6434 [12:19:36<2:40:07,  8.16s/it, gpt_loss=0.324, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5258/6434 [12:19:36<2:39:32,  8.14s/it, gpt_loss=0.324, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5258/6434 [12:19:45<2:39:32,  8.14s/it, gpt_loss=0.303, loss_mean=0.293][A
+Train step of epoch 0:  82%|████████▏ | 5259/6434 [12:19:45<2:41:47,  8.26s/it, gpt_loss=0.303, loss_mean=0.293][A
+[LID Router Debug] Step: 5260
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [3, 0, 4, 0, 1, 0, 4, 6, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  82%|████████▏ | 5259/6434 [12:19:53<2:41:47,  8.26s/it, gpt_loss=0.227, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5260/6434 [12:19:53<2:41:41,  8.26s/it, gpt_loss=0.227, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5260/6434 [12:20:01<2:41:41,  8.26s/it, gpt_loss=0.346, loss_mean=0.293][A
+Train step of epoch 0:  82%|████████▏ | 5261/6434 [12:20:01<2:42:18,  8.30s/it, gpt_loss=0.346, loss_mean=0.293][A
+Train step of epoch 0:  82%|████████▏ | 5261/6434 [12:20:09<2:42:18,  8.30s/it, gpt_loss=0.361, loss_mean=0.3]  [A
+Train step of epoch 0:  82%|████████▏ | 5262/6434 [12:20:09<2:40:05,  8.20s/it, gpt_loss=0.361, loss_mean=0.3][A
+Train step of epoch 0:  82%|████████▏ | 5262/6434 [12:20:18<2:40:05,  8.20s/it, gpt_loss=0.392, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5263/6434 [12:20:18<2:44:52,  8.45s/it, gpt_loss=0.392, loss_mean=0.309][A
+Train step of epoch 0:  82%|████████▏ | 5263/6434 [12:20:27<2:44:52,  8.45s/it, gpt_loss=0.379, loss_mean=0.316][A
+Train step of epoch 0:  82%|████████▏ | 5264/6434 [12:20:27<2:49:08,  8.67s/it, gpt_loss=0.379, loss_mean=0.316][A
+Train step of epoch 0:  82%|████████▏ | 5264/6434 [12:20:36<2:49:08,  8.67s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  82%|████████▏ | 5265/6434 [12:20:36<2:48:30,  8.65s/it, gpt_loss=0.274, loss_mean=0.312][A
+Train step of epoch 0:  82%|████████▏ | 5265/6434 [12:20:45<2:48:30,  8.65s/it, gpt_loss=0.317, loss_mean=0.312][A
+Train step of epoch 0:  82%|████████▏ | 5266/6434 [12:20:45<2:48:52,  8.67s/it, gpt_loss=0.317, loss_mean=0.312][A
+Train step of epoch 0:  82%|████████▏ | 5266/6434 [12:20:54<2:48:52,  8.67s/it, gpt_loss=0.271, loss_mean=0.308][A
+Train step of epoch 0:  82%|████████▏ | 5267/6434 [12:20:54<2:55:00,  9.00s/it, gpt_loss=0.271, loss_mean=0.308][A
+Train step of epoch 0:  82%|████████▏ | 5267/6434 [12:21:03<2:55:00,  9.00s/it, gpt_loss=0.288, loss_mean=0.306][A
+Train step of epoch 0:  82%|████████▏ | 5268/6434 [12:21:03<2:52:26,  8.87s/it, gpt_loss=0.288, loss_mean=0.306][A
+Train step of epoch 0:  82%|████████▏ | 5268/6434 [12:21:12<2:52:26,  8.87s/it, gpt_loss=0.269, loss_mean=0.302][A
+Train step of epoch 0:  82%|████████▏ | 5269/6434 [12:21:12<2:52:29,  8.88s/it, gpt_loss=0.269, loss_mean=0.302][A
+[LID Router Debug] Step: 5270
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [1, 9, 5, 9, 0, 4, 4, 3, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  82%|████████▏ | 5269/6434 [12:21:20<2:52:29,  8.88s/it, gpt_loss=0.286, loss_mean=0.301][A
+Train step of epoch 0:  82%|████████▏ | 5270/6434 [12:21:20<2:48:54,  8.71s/it, gpt_loss=0.286, loss_mean=0.301][A
+Train step of epoch 0:  82%|████████▏ | 5270/6434 [12:21:29<2:48:54,  8.71s/it, gpt_loss=0.363, loss_mean=0.307][A
+Train step of epoch 0:  82%|████████▏ | 5271/6434 [12:21:29<2:46:58,  8.61s/it, gpt_loss=0.363, loss_mean=0.307][A
+Train step of epoch 0:  82%|████████▏ | 5271/6434 [12:21:37<2:46:58,  8.61s/it, gpt_loss=0.312, loss_mean=0.307][A
+Train step of epoch 0:  82%|████████▏ | 5272/6434 [12:21:37<2:44:00,  8.47s/it, gpt_loss=0.312, loss_mean=0.307][A
+Train step of epoch 0:  82%|████████▏ | 5272/6434 [12:21:45<2:44:00,  8.47s/it, gpt_loss=0.374, loss_mean=0.314][A
+Train step of epoch 0:  82%|████████▏ | 5273/6434 [12:21:45<2:41:17,  8.34s/it, gpt_loss=0.374, loss_mean=0.314][A
+Train step of epoch 0:  82%|████████▏ | 5273/6434 [12:21:54<2:41:17,  8.34s/it, gpt_loss=0.275, loss_mean=0.31] [A
+Train step of epoch 0:  82%|████████▏ | 5274/6434 [12:21:54<2:48:55,  8.74s/it, gpt_loss=0.275, loss_mean=0.31][A
+Train step of epoch 0:  82%|████████▏ | 5274/6434 [12:22:03<2:48:55,  8.74s/it, gpt_loss=0.29, loss_mean=0.308][A
+Train step of epoch 0:  82%|████████▏ | 5275/6434 [12:22:03<2:45:30,  8.57s/it, gpt_loss=0.29, loss_mean=0.308][A
+Train step of epoch 0:  82%|████████▏ | 5275/6434 [12:22:11<2:45:30,  8.57s/it, gpt_loss=0.273, loss_mean=0.305][A
+Train step of epoch 0:  82%|████████▏ | 5276/6434 [12:22:11<2:44:55,  8.55s/it, gpt_loss=0.273, loss_mean=0.305][A
+Train step of epoch 0:  82%|████████▏ | 5276/6434 [12:22:19<2:44:55,  8.55s/it, gpt_loss=0.231, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5277/6434 [12:22:19<2:39:38,  8.28s/it, gpt_loss=0.231, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5277/6434 [12:22:28<2:39:38,  8.28s/it, gpt_loss=0.231, loss_mean=0.291][A
+Train step of epoch 0:  82%|████████▏ | 5278/6434 [12:22:28<2:44:50,  8.56s/it, gpt_loss=0.231, loss_mean=0.291][A
+Train step of epoch 0:  82%|████████▏ | 5278/6434 [12:22:37<2:44:50,  8.56s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5279/6434 [12:22:37<2:44:35,  8.55s/it, gpt_loss=0.272, loss_mean=0.289][A
+[LID Router Debug] Step: 5280
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [3, 1, 1, 1, 3, 4, 0, 3, 2, 10]
+Active Experts in Batch: {0, 1, 2, 3, 4, 10}
+
+Train step of epoch 0:  82%|████████▏ | 5279/6434 [12:22:46<2:44:35,  8.55s/it, gpt_loss=0.273, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5280/6434 [12:22:46<2:47:00,  8.68s/it, gpt_loss=0.273, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5280/6434 [12:22:54<2:47:00,  8.68s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5281/6434 [12:22:54<2:47:25,  8.71s/it, gpt_loss=0.281, loss_mean=0.287][A
+Train step of epoch 0:  82%|████████▏ | 5281/6434 [12:23:03<2:47:25,  8.71s/it, gpt_loss=0.251, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5282/6434 [12:23:03<2:44:17,  8.56s/it, gpt_loss=0.251, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5282/6434 [12:23:11<2:44:17,  8.56s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  82%|████████▏ | 5283/6434 [12:23:11<2:44:23,  8.57s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  82%|████████▏ | 5283/6434 [12:23:19<2:44:23,  8.57s/it, gpt_loss=0.291, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5284/6434 [12:23:19<2:39:08,  8.30s/it, gpt_loss=0.291, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5284/6434 [12:23:27<2:39:08,  8.30s/it, gpt_loss=0.294, loss_mean=0.284][A
+Train step of epoch 0:  82%|████████▏ | 5285/6434 [12:23:27<2:36:39,  8.18s/it, gpt_loss=0.294, loss_mean=0.284][A
+Train step of epoch 0:  82%|████████▏ | 5285/6434 [12:23:36<2:36:39,  8.18s/it, gpt_loss=0.351, loss_mean=0.29] [A
+Train step of epoch 0:  82%|████████▏ | 5286/6434 [12:23:36<2:43:18,  8.54s/it, gpt_loss=0.351, loss_mean=0.29][A
+Train step of epoch 0:  82%|████████▏ | 5286/6434 [12:23:44<2:43:18,  8.54s/it, gpt_loss=0.265, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5287/6434 [12:23:44<2:40:03,  8.37s/it, gpt_loss=0.265, loss_mean=0.288][A
+Train step of epoch 0:  82%|████████▏ | 5287/6434 [12:23:53<2:40:03,  8.37s/it, gpt_loss=0.302, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5288/6434 [12:23:53<2:40:43,  8.42s/it, gpt_loss=0.302, loss_mean=0.289][A
+Train step of epoch 0:  82%|████████▏ | 5288/6434 [12:24:00<2:40:43,  8.42s/it, gpt_loss=0.342, loss_mean=0.295][A
+Train step of epoch 0:  82%|████████▏ | 5289/6434 [12:24:00<2:32:18,  7.98s/it, gpt_loss=0.342, loss_mean=0.295][A
+[LID Router Debug] Step: 5290
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [1, 4, 3, 3, 5, 2, 5, 3, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  82%|████████▏ | 5289/6434 [12:24:08<2:32:18,  7.98s/it, gpt_loss=0.318, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5290/6434 [12:24:08<2:34:17,  8.09s/it, gpt_loss=0.318, loss_mean=0.297][A
+Train step of epoch 0:  82%|████████▏ | 5290/6434 [12:24:17<2:34:17,  8.09s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5291/6434 [12:24:17<2:41:56,  8.50s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5291/6434 [12:24:25<2:41:56,  8.50s/it, gpt_loss=0.363, loss_mean=0.301][A
+Train step of epoch 0:  82%|████████▏ | 5292/6434 [12:24:25<2:36:42,  8.23s/it, gpt_loss=0.363, loss_mean=0.301][A
+Train step of epoch 0:  82%|████████▏ | 5292/6434 [12:24:34<2:36:42,  8.23s/it, gpt_loss=0.272, loss_mean=0.298][A
+Train step of epoch 0:  82%|████████▏ | 5293/6434 [12:24:34<2:40:58,  8.46s/it, gpt_loss=0.272, loss_mean=0.298][A
+Train step of epoch 0:  82%|████████▏ | 5293/6434 [12:24:43<2:40:58,  8.46s/it, gpt_loss=0.337, loss_mean=0.302][A
+Train step of epoch 0:  82%|████████▏ | 5294/6434 [12:24:43<2:41:52,  8.52s/it, gpt_loss=0.337, loss_mean=0.302][A
+Train step of epoch 0:  82%|████████▏ | 5294/6434 [12:24:51<2:41:52,  8.52s/it, gpt_loss=0.231, loss_mean=0.295][A
+Train step of epoch 0:  82%|████████▏ | 5295/6434 [12:24:51<2:39:49,  8.42s/it, gpt_loss=0.231, loss_mean=0.295][A
+Train step of epoch 0:  82%|████████▏ | 5295/6434 [12:24:59<2:39:49,  8.42s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5296/6434 [12:24:59<2:36:57,  8.28s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  82%|████████▏ | 5296/6434 [12:25:06<2:36:57,  8.28s/it, gpt_loss=0.318, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5297/6434 [12:25:06<2:33:09,  8.08s/it, gpt_loss=0.318, loss_mean=0.294][A
+Train step of epoch 0:  82%|████████▏ | 5297/6434 [12:25:15<2:33:09,  8.08s/it, gpt_loss=0.342, loss_mean=0.299][A
+Train step of epoch 0:  82%|████████▏ | 5298/6434 [12:25:15<2:36:12,  8.25s/it, gpt_loss=0.342, loss_mean=0.299][A
+Train step of epoch 0:  82%|████████▏ | 5298/6434 [12:25:23<2:36:12,  8.25s/it, gpt_loss=0.292, loss_mean=0.298][A
+Train step of epoch 0:  82%|████████▏ | 5299/6434 [12:25:23<2:35:42,  8.23s/it, gpt_loss=0.292, loss_mean=0.298][A
+[LID Router Debug] Step: 5300
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [6, 1, 3, 0, 6, 4, 3, 4, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  82%|████████▏ | 5299/6434 [12:25:32<2:35:42,  8.23s/it, gpt_loss=0.214, loss_mean=0.29] [A
+Train step of epoch 0:  82%|████████▏ | 5300/6434 [12:25:32<2:36:53,  8.30s/it, gpt_loss=0.214, loss_mean=0.29][A
+Train step of epoch 0:  82%|████████▏ | 5300/6434 [12:25:40<2:36:53,  8.30s/it, gpt_loss=0.176, loss_mean=0.279][A
+Train step of epoch 0:  82%|████████▏ | 5301/6434 [12:25:40<2:39:30,  8.45s/it, gpt_loss=0.176, loss_mean=0.279][A
+Train step of epoch 0:  82%|████████▏ | 5301/6434 [12:25:48<2:39:30,  8.45s/it, gpt_loss=0.303, loss_mean=0.281][A
+Train step of epoch 0:  82%|████████▏ | 5302/6434 [12:25:48<2:33:29,  8.14s/it, gpt_loss=0.303, loss_mean=0.281][A
+Train step of epoch 0:  82%|████████▏ | 5302/6434 [12:25:57<2:33:29,  8.14s/it, gpt_loss=0.205, loss_mean=0.273][A
+Train step of epoch 0:  82%|████████▏ | 5303/6434 [12:25:57<2:39:23,  8.46s/it, gpt_loss=0.205, loss_mean=0.273][A
+Train step of epoch 0:  82%|████████▏ | 5303/6434 [12:26:08<2:39:23,  8.46s/it, gpt_loss=0.325, loss_mean=0.279][A
+Train step of epoch 0:  82%|████████▏ | 5304/6434 [12:26:08<2:52:51,  9.18s/it, gpt_loss=0.325, loss_mean=0.279][A
+Train step of epoch 0:  82%|████████▏ | 5304/6434 [12:26:16<2:52:51,  9.18s/it, gpt_loss=0.292, loss_mean=0.28] [A
+Train step of epoch 0:  82%|████████▏ | 5305/6434 [12:26:16<2:48:48,  8.97s/it, gpt_loss=0.292, loss_mean=0.28][A
+Train step of epoch 0:  82%|████████▏ | 5305/6434 [12:26:26<2:48:48,  8.97s/it, gpt_loss=0.24, loss_mean=0.276][A
+Train step of epoch 0:  82%|████████▏ | 5306/6434 [12:26:26<2:51:28,  9.12s/it, gpt_loss=0.24, loss_mean=0.276][A
+Train step of epoch 0:  82%|████████▏ | 5306/6434 [12:26:34<2:51:28,  9.12s/it, gpt_loss=0.349, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5307/6434 [12:26:34<2:45:26,  8.81s/it, gpt_loss=0.349, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5307/6434 [12:26:42<2:45:26,  8.81s/it, gpt_loss=0.285, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5308/6434 [12:26:42<2:42:24,  8.65s/it, gpt_loss=0.285, loss_mean=0.283][A
+Train step of epoch 0:  82%|████████▏ | 5308/6434 [12:26:50<2:42:24,  8.65s/it, gpt_loss=0.258, loss_mean=0.281][A
+Train step of epoch 0:  83%|████████▎ | 5309/6434 [12:26:50<2:35:21,  8.29s/it, gpt_loss=0.258, loss_mean=0.281][A
+[LID Router Debug] Step: 5310
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [2, 5, 3, 6, 3, 5, 1, 9, 4, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5309/6434 [12:26:59<2:35:21,  8.29s/it, gpt_loss=0.293, loss_mean=0.282][A
+Train step of epoch 0:  83%|████████▎ | 5310/6434 [12:26:59<2:38:21,  8.45s/it, gpt_loss=0.293, loss_mean=0.282][A
+Train step of epoch 0:  83%|████████▎ | 5310/6434 [12:27:07<2:38:21,  8.45s/it, gpt_loss=0.232, loss_mean=0.277][A
+Train step of epoch 0:  83%|████████▎ | 5311/6434 [12:27:07<2:39:57,  8.55s/it, gpt_loss=0.232, loss_mean=0.277][A
+Train step of epoch 0:  83%|████████▎ | 5311/6434 [12:27:17<2:39:57,  8.55s/it, gpt_loss=0.284, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5312/6434 [12:27:17<2:44:28,  8.80s/it, gpt_loss=0.284, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5312/6434 [12:27:26<2:44:28,  8.80s/it, gpt_loss=0.18, loss_mean=0.268] [A
+Train step of epoch 0:  83%|████████▎ | 5313/6434 [12:27:26<2:46:31,  8.91s/it, gpt_loss=0.18, loss_mean=0.268][A
+Train step of epoch 0:  83%|████████▎ | 5313/6434 [12:27:34<2:46:31,  8.91s/it, gpt_loss=0.247, loss_mean=0.266][A
+Train step of epoch 0:  83%|████████▎ | 5314/6434 [12:27:34<2:41:07,  8.63s/it, gpt_loss=0.247, loss_mean=0.266][A
+Train step of epoch 0:  83%|████████▎ | 5314/6434 [12:27:42<2:41:07,  8.63s/it, gpt_loss=0.26, loss_mean=0.265] [A
+Train step of epoch 0:  83%|████████▎ | 5315/6434 [12:27:42<2:37:38,  8.45s/it, gpt_loss=0.26, loss_mean=0.265][A
+Train step of epoch 0:  83%|████████▎ | 5315/6434 [12:27:51<2:37:38,  8.45s/it, gpt_loss=0.262, loss_mean=0.265][A
+Train step of epoch 0:  83%|████████▎ | 5316/6434 [12:27:51<2:40:10,  8.60s/it, gpt_loss=0.262, loss_mean=0.265][A
+Train step of epoch 0:  83%|████████▎ | 5316/6434 [12:27:59<2:40:10,  8.60s/it, gpt_loss=0.348, loss_mean=0.273][A
+Train step of epoch 0:  83%|████████▎ | 5317/6434 [12:27:59<2:35:11,  8.34s/it, gpt_loss=0.348, loss_mean=0.273][A
+Train step of epoch 0:  83%|████████▎ | 5317/6434 [12:28:08<2:35:11,  8.34s/it, gpt_loss=0.317, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5318/6434 [12:28:08<2:43:02,  8.77s/it, gpt_loss=0.317, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5318/6434 [12:28:17<2:43:02,  8.77s/it, gpt_loss=0.269, loss_mean=0.277][A
+Train step of epoch 0:  83%|████████▎ | 5319/6434 [12:28:17<2:42:06,  8.72s/it, gpt_loss=0.269, loss_mean=0.277][A
+[LID Router Debug] Step: 5320
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [4, 9, 2, 0, 1, 5, 0, 9, 1, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5319/6434 [12:28:25<2:42:06,  8.72s/it, gpt_loss=0.288, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5320/6434 [12:28:25<2:39:22,  8.58s/it, gpt_loss=0.288, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5320/6434 [12:28:35<2:39:22,  8.58s/it, gpt_loss=0.282, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5321/6434 [12:28:35<2:46:45,  8.99s/it, gpt_loss=0.282, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5321/6434 [12:28:42<2:46:45,  8.99s/it, gpt_loss=0.39, loss_mean=0.289] [A
+Train step of epoch 0:  83%|████████▎ | 5322/6434 [12:28:42<2:35:58,  8.42s/it, gpt_loss=0.39, loss_mean=0.289][A
+Train step of epoch 0:  83%|████████▎ | 5322/6434 [12:28:50<2:35:58,  8.42s/it, gpt_loss=0.344, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5323/6434 [12:28:50<2:33:10,  8.27s/it, gpt_loss=0.344, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5323/6434 [12:28:59<2:33:10,  8.27s/it, gpt_loss=0.31, loss_mean=0.296] [A
+Train step of epoch 0:  83%|████████▎ | 5324/6434 [12:28:59<2:37:22,  8.51s/it, gpt_loss=0.31, loss_mean=0.296][A
+Train step of epoch 0:  83%|████████▎ | 5324/6434 [12:29:07<2:37:22,  8.51s/it, gpt_loss=0.257, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5325/6434 [12:29:07<2:34:03,  8.33s/it, gpt_loss=0.257, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5325/6434 [12:29:16<2:34:03,  8.33s/it, gpt_loss=0.366, loss_mean=0.3]  [A
+Train step of epoch 0:  83%|████████▎ | 5326/6434 [12:29:16<2:38:29,  8.58s/it, gpt_loss=0.366, loss_mean=0.3][A
+Train step of epoch 0:  83%|████████▎ | 5326/6434 [12:29:24<2:38:29,  8.58s/it, gpt_loss=0.336, loss_mean=0.303][A
+Train step of epoch 0:  83%|████████▎ | 5327/6434 [12:29:24<2:32:49,  8.28s/it, gpt_loss=0.336, loss_mean=0.303][A
+Train step of epoch 0:  83%|████████▎ | 5327/6434 [12:29:33<2:32:49,  8.28s/it, gpt_loss=0.333, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5328/6434 [12:29:33<2:36:46,  8.50s/it, gpt_loss=0.333, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5328/6434 [12:29:41<2:36:46,  8.50s/it, gpt_loss=0.299, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5329/6434 [12:29:41<2:34:28,  8.39s/it, gpt_loss=0.299, loss_mean=0.306][A
+[LID Router Debug] Step: 5330
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [3, 5, 2, 4, 2, 2, 1, 0, 4, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  83%|████████▎ | 5329/6434 [12:29:49<2:34:28,  8.39s/it, gpt_loss=0.27, loss_mean=0.302] [A
+Train step of epoch 0:  83%|████████▎ | 5330/6434 [12:29:49<2:31:44,  8.25s/it, gpt_loss=0.27, loss_mean=0.302][A
+Train step of epoch 0:  83%|████████▎ | 5330/6434 [12:29:57<2:31:44,  8.25s/it, gpt_loss=0.353, loss_mean=0.307][A
+Train step of epoch 0:  83%|████████▎ | 5331/6434 [12:29:57<2:30:39,  8.20s/it, gpt_loss=0.353, loss_mean=0.307][A
+Train step of epoch 0:  83%|████████▎ | 5331/6434 [12:30:05<2:30:39,  8.20s/it, gpt_loss=0.33, loss_mean=0.309] [A
+Train step of epoch 0:  83%|████████▎ | 5332/6434 [12:30:05<2:30:25,  8.19s/it, gpt_loss=0.33, loss_mean=0.309][A
+Train step of epoch 0:  83%|████████▎ | 5332/6434 [12:30:13<2:30:25,  8.19s/it, gpt_loss=0.218, loss_mean=0.3] [A
+Train step of epoch 0:  83%|████████▎ | 5333/6434 [12:30:13<2:26:23,  7.98s/it, gpt_loss=0.218, loss_mean=0.3][A
+Train step of epoch 0:  83%|████████▎ | 5333/6434 [12:30:21<2:26:23,  7.98s/it, gpt_loss=0.361, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5334/6434 [12:30:21<2:29:49,  8.17s/it, gpt_loss=0.361, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5334/6434 [12:30:29<2:29:49,  8.17s/it, gpt_loss=0.347, loss_mean=0.31] [A
+Train step of epoch 0:  83%|████████▎ | 5335/6434 [12:30:29<2:26:00,  7.97s/it, gpt_loss=0.347, loss_mean=0.31][A
+Train step of epoch 0:  83%|████████▎ | 5335/6434 [12:30:38<2:26:00,  7.97s/it, gpt_loss=0.249, loss_mean=0.304][A
+Train step of epoch 0:  83%|████████▎ | 5336/6434 [12:30:38<2:30:51,  8.24s/it, gpt_loss=0.249, loss_mean=0.304][A
+Train step of epoch 0:  83%|████████▎ | 5336/6434 [12:30:46<2:30:51,  8.24s/it, gpt_loss=0.312, loss_mean=0.305][A
+Train step of epoch 0:  83%|████████▎ | 5337/6434 [12:30:46<2:31:25,  8.28s/it, gpt_loss=0.312, loss_mean=0.305][A
+Train step of epoch 0:  83%|████████▎ | 5337/6434 [12:30:55<2:31:25,  8.28s/it, gpt_loss=0.241, loss_mean=0.299][A
+Train step of epoch 0:  83%|████████▎ | 5338/6434 [12:30:55<2:33:04,  8.38s/it, gpt_loss=0.241, loss_mean=0.299][A
+Train step of epoch 0:  83%|████████▎ | 5338/6434 [12:31:04<2:33:04,  8.38s/it, gpt_loss=0.372, loss_mean=0.306][A
+Train step of epoch 0:  83%|████████▎ | 5339/6434 [12:31:04<2:39:04,  8.72s/it, gpt_loss=0.372, loss_mean=0.306][A
+[LID Router Debug] Step: 5340
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [3, 4, 2, 0, 2, 5, 1, 9, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5339/6434 [12:31:12<2:39:04,  8.72s/it, gpt_loss=0.377, loss_mean=0.313][A
+Train step of epoch 0:  83%|████████▎ | 5340/6434 [12:31:12<2:34:19,  8.46s/it, gpt_loss=0.377, loss_mean=0.313][A
+Train step of epoch 0:  83%|████████▎ | 5340/6434 [12:31:20<2:34:19,  8.46s/it, gpt_loss=0.271, loss_mean=0.309][A
+Train step of epoch 0:  83%|████████▎ | 5341/6434 [12:31:20<2:34:10,  8.46s/it, gpt_loss=0.271, loss_mean=0.309][A
+Train step of epoch 0:  83%|████████▎ | 5341/6434 [12:31:28<2:34:10,  8.46s/it, gpt_loss=0.296, loss_mean=0.308][A
+Train step of epoch 0:  83%|████████▎ | 5342/6434 [12:31:28<2:31:13,  8.31s/it, gpt_loss=0.296, loss_mean=0.308][A
+Train step of epoch 0:  83%|████████▎ | 5342/6434 [12:31:36<2:31:13,  8.31s/it, gpt_loss=0.256, loss_mean=0.302][A
+Train step of epoch 0:  83%|████████▎ | 5343/6434 [12:31:36<2:28:45,  8.18s/it, gpt_loss=0.256, loss_mean=0.302][A
+Train step of epoch 0:  83%|████████▎ | 5343/6434 [12:31:45<2:28:45,  8.18s/it, gpt_loss=0.242, loss_mean=0.296][A
+Train step of epoch 0:  83%|████████▎ | 5344/6434 [12:31:45<2:31:07,  8.32s/it, gpt_loss=0.242, loss_mean=0.296][A
+Train step of epoch 0:  83%|████████▎ | 5344/6434 [12:31:53<2:31:07,  8.32s/it, gpt_loss=0.3, loss_mean=0.297]  [A
+Train step of epoch 0:  83%|████████▎ | 5345/6434 [12:31:53<2:31:06,  8.33s/it, gpt_loss=0.3, loss_mean=0.297][A
+Train step of epoch 0:  83%|████████▎ | 5345/6434 [12:32:02<2:31:06,  8.33s/it, gpt_loss=0.285, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5346/6434 [12:32:02<2:31:57,  8.38s/it, gpt_loss=0.285, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5346/6434 [12:32:10<2:31:57,  8.38s/it, gpt_loss=0.267, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5347/6434 [12:32:10<2:28:15,  8.18s/it, gpt_loss=0.267, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5347/6434 [12:32:18<2:28:15,  8.18s/it, gpt_loss=0.289, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5348/6434 [12:32:18<2:30:20,  8.31s/it, gpt_loss=0.289, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5348/6434 [12:32:26<2:30:20,  8.31s/it, gpt_loss=0.274, loss_mean=0.29] [A
+Train step of epoch 0:  83%|████████▎ | 5349/6434 [12:32:26<2:29:35,  8.27s/it, gpt_loss=0.274, loss_mean=0.29][A
+[LID Router Debug] Step: 5350
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [5, 4, 5, 6, 5, 5, 1, 9, 4, 0]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5349/6434 [12:32:34<2:29:35,  8.27s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5350/6434 [12:32:34<2:25:35,  8.06s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5350/6434 [12:32:43<2:25:35,  8.06s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  83%|████████▎ | 5351/6434 [12:32:43<2:31:01,  8.37s/it, gpt_loss=0.262, loss_mean=0.289][A
+Train step of epoch 0:  83%|████████▎ | 5351/6434 [12:32:51<2:31:01,  8.37s/it, gpt_loss=0.246, loss_mean=0.285][A
+Train step of epoch 0:  83%|████████▎ | 5352/6434 [12:32:51<2:30:54,  8.37s/it, gpt_loss=0.246, loss_mean=0.285][A
+Train step of epoch 0:  83%|████████▎ | 5352/6434 [12:33:01<2:30:54,  8.37s/it, gpt_loss=0.211, loss_mean=0.277][A
+Train step of epoch 0:  83%|████████▎ | 5353/6434 [12:33:01<2:36:28,  8.69s/it, gpt_loss=0.211, loss_mean=0.277][A
+Train step of epoch 0:  83%|████████▎ | 5353/6434 [12:33:11<2:36:28,  8.69s/it, gpt_loss=0.266, loss_mean=0.276][A
+Train step of epoch 0:  83%|████████▎ | 5354/6434 [12:33:11<2:44:34,  9.14s/it, gpt_loss=0.266, loss_mean=0.276][A
+Train step of epoch 0:  83%|████████▎ | 5354/6434 [12:33:19<2:44:34,  9.14s/it, gpt_loss=0.289, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5355/6434 [12:33:19<2:40:28,  8.92s/it, gpt_loss=0.289, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5355/6434 [12:33:29<2:40:28,  8.92s/it, gpt_loss=0.286, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5356/6434 [12:33:29<2:42:30,  9.05s/it, gpt_loss=0.286, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5356/6434 [12:33:37<2:42:30,  9.05s/it, gpt_loss=0.273, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5357/6434 [12:33:37<2:40:45,  8.96s/it, gpt_loss=0.273, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5357/6434 [12:33:46<2:40:45,  8.96s/it, gpt_loss=0.282, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5358/6434 [12:33:46<2:37:33,  8.79s/it, gpt_loss=0.282, loss_mean=0.278][A
+Train step of epoch 0:  83%|████████▎ | 5358/6434 [12:33:53<2:37:33,  8.79s/it, gpt_loss=0.421, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5359/6434 [12:33:53<2:30:49,  8.42s/it, gpt_loss=0.421, loss_mean=0.293][A
+[LID Router Debug] Step: 5360
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [5, 6, 0, 3, 3, 9, 9, 1, 9, 9]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5359/6434 [12:34:02<2:30:49,  8.42s/it, gpt_loss=0.287, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5360/6434 [12:34:02<2:34:21,  8.62s/it, gpt_loss=0.287, loss_mean=0.292][A
+Train step of epoch 0:  83%|████████▎ | 5360/6434 [12:34:12<2:34:21,  8.62s/it, gpt_loss=0.303, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5361/6434 [12:34:12<2:37:30,  8.81s/it, gpt_loss=0.303, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5361/6434 [12:34:20<2:37:30,  8.81s/it, gpt_loss=0.294, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5362/6434 [12:34:20<2:36:56,  8.78s/it, gpt_loss=0.294, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5362/6434 [12:34:30<2:36:56,  8.78s/it, gpt_loss=0.314, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5363/6434 [12:34:30<2:38:50,  8.90s/it, gpt_loss=0.314, loss_mean=0.295][A
+Train step of epoch 0:  83%|████████▎ | 5363/6434 [12:34:39<2:38:50,  8.90s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5364/6434 [12:34:39<2:38:44,  8.90s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5364/6434 [12:34:46<2:38:44,  8.90s/it, gpt_loss=0.274, loss_mean=0.291][A
+Train step of epoch 0:  83%|████████▎ | 5365/6434 [12:34:46<2:33:25,  8.61s/it, gpt_loss=0.274, loss_mean=0.291][A
+Train step of epoch 0:  83%|████████▎ | 5365/6434 [12:34:55<2:33:25,  8.61s/it, gpt_loss=0.276, loss_mean=0.29] [A
+Train step of epoch 0:  83%|████████▎ | 5366/6434 [12:34:55<2:31:39,  8.52s/it, gpt_loss=0.276, loss_mean=0.29][A
+Train step of epoch 0:  83%|████████▎ | 5366/6434 [12:35:03<2:31:39,  8.52s/it, gpt_loss=0.274, loss_mean=0.288][A
+Train step of epoch 0:  83%|████████▎ | 5367/6434 [12:35:03<2:29:47,  8.42s/it, gpt_loss=0.274, loss_mean=0.288][A
+Train step of epoch 0:  83%|████████▎ | 5367/6434 [12:35:12<2:29:47,  8.42s/it, gpt_loss=0.269, loss_mean=0.286][A
+Train step of epoch 0:  83%|████████▎ | 5368/6434 [12:35:12<2:30:48,  8.49s/it, gpt_loss=0.269, loss_mean=0.286][A
+Train step of epoch 0:  83%|████████▎ | 5368/6434 [12:35:20<2:30:48,  8.49s/it, gpt_loss=0.359, loss_mean=0.293][A
+Train step of epoch 0:  83%|████████▎ | 5369/6434 [12:35:20<2:29:46,  8.44s/it, gpt_loss=0.359, loss_mean=0.293][A
+[LID Router Debug] Step: 5370
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 5, 3, 4, 6, 6, 6, 9, 3, 3]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  83%|████████▎ | 5369/6434 [12:35:29<2:29:46,  8.44s/it, gpt_loss=0.241, loss_mean=0.288][A
+Train step of epoch 0:  83%|████████▎ | 5370/6434 [12:35:29<2:30:54,  8.51s/it, gpt_loss=0.241, loss_mean=0.288][A
+Train step of epoch 0:  83%|████████▎ | 5370/6434 [12:35:37<2:30:54,  8.51s/it, gpt_loss=0.26, loss_mean=0.285] [A
+Train step of epoch 0:  83%|████████▎ | 5371/6434 [12:35:37<2:27:35,  8.33s/it, gpt_loss=0.26, loss_mean=0.285][A
+Train step of epoch 0:  83%|████████▎ | 5371/6434 [12:35:45<2:27:35,  8.33s/it, gpt_loss=0.326, loss_mean=0.289][A
+Train step of epoch 0:  83%|████████▎ | 5372/6434 [12:35:45<2:29:48,  8.46s/it, gpt_loss=0.326, loss_mean=0.289][A
+Train step of epoch 0:  83%|████████▎ | 5372/6434 [12:35:54<2:29:48,  8.46s/it, gpt_loss=0.343, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▎ | 5373/6434 [12:35:54<2:31:00,  8.54s/it, gpt_loss=0.343, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▎ | 5373/6434 [12:36:04<2:31:00,  8.54s/it, gpt_loss=0.355, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▎ | 5374/6434 [12:36:04<2:37:40,  8.93s/it, gpt_loss=0.355, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▎ | 5374/6434 [12:36:12<2:37:40,  8.93s/it, gpt_loss=0.266, loss_mean=0.297][A
+Train step of epoch 0:  84%|████████▎ | 5375/6434 [12:36:12<2:32:50,  8.66s/it, gpt_loss=0.266, loss_mean=0.297][A
+Train step of epoch 0:  84%|████████▎ | 5375/6434 [12:36:20<2:32:50,  8.66s/it, gpt_loss=0.346, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5376/6434 [12:36:20<2:32:29,  8.65s/it, gpt_loss=0.346, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5376/6434 [12:36:28<2:32:29,  8.65s/it, gpt_loss=0.278, loss_mean=0.3]  [A
+Train step of epoch 0:  84%|████████▎ | 5377/6434 [12:36:28<2:28:48,  8.45s/it, gpt_loss=0.278, loss_mean=0.3][A
+Train step of epoch 0:  84%|████████▎ | 5377/6434 [12:36:37<2:28:48,  8.45s/it, gpt_loss=0.326, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5378/6434 [12:36:37<2:30:55,  8.58s/it, gpt_loss=0.326, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5378/6434 [12:36:46<2:30:55,  8.58s/it, gpt_loss=0.348, loss_mean=0.307][A
+Train step of epoch 0:  84%|████████▎ | 5379/6434 [12:36:46<2:29:32,  8.50s/it, gpt_loss=0.348, loss_mean=0.307][A
+[LID Router Debug] Step: 5380
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [4, 2, 5, 6, 1, 1, 6, 5, 5, 9]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  84%|████████▎ | 5379/6434 [12:36:56<2:29:32,  8.50s/it, gpt_loss=0.264, loss_mean=0.303][A
+Train step of epoch 0:  84%|████████▎ | 5380/6434 [12:36:56<2:39:17,  9.07s/it, gpt_loss=0.264, loss_mean=0.303][A
+Train step of epoch 0:  84%|████████▎ | 5380/6434 [12:37:03<2:39:17,  9.07s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  84%|████████▎ | 5381/6434 [12:37:03<2:28:37,  8.47s/it, gpt_loss=0.268, loss_mean=0.299][A
+Train step of epoch 0:  84%|████████▎ | 5381/6434 [12:37:13<2:28:37,  8.47s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▎ | 5382/6434 [12:37:13<2:36:59,  8.95s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▎ | 5382/6434 [12:37:21<2:36:59,  8.95s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5383/6434 [12:37:21<2:30:03,  8.57s/it, gpt_loss=0.343, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▎ | 5383/6434 [12:37:30<2:30:03,  8.57s/it, gpt_loss=0.349, loss_mean=0.307][A
+Train step of epoch 0:  84%|████████▎ | 5384/6434 [12:37:30<2:34:27,  8.83s/it, gpt_loss=0.349, loss_mean=0.307][A
+Train step of epoch 0:  84%|████████▎ | 5384/6434 [12:37:38<2:34:27,  8.83s/it, gpt_loss=0.278, loss_mean=0.304][A
+Train step of epoch 0:  84%|████████▎ | 5385/6434 [12:37:38<2:29:49,  8.57s/it, gpt_loss=0.278, loss_mean=0.304][A
+Train step of epoch 0:  84%|████████▎ | 5385/6434 [12:37:46<2:29:49,  8.57s/it, gpt_loss=0.297, loss_mean=0.303][A
+Train step of epoch 0:  84%|████████▎ | 5386/6434 [12:37:46<2:26:33,  8.39s/it, gpt_loss=0.297, loss_mean=0.303][A
+Train step of epoch 0:  84%|████████▎ | 5386/6434 [12:37:54<2:26:33,  8.39s/it, gpt_loss=0.27, loss_mean=0.3]   [A
+Train step of epoch 0:  84%|████████▎ | 5387/6434 [12:37:54<2:22:47,  8.18s/it, gpt_loss=0.27, loss_mean=0.3][A
+Train step of epoch 0:  84%|████████▎ | 5387/6434 [12:38:03<2:22:47,  8.18s/it, gpt_loss=0.213, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▎ | 5388/6434 [12:38:03<2:29:28,  8.57s/it, gpt_loss=0.213, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▎ | 5388/6434 [12:38:12<2:29:28,  8.57s/it, gpt_loss=0.235, loss_mean=0.286][A
+Train step of epoch 0:  84%|████████▍ | 5389/6434 [12:38:12<2:30:07,  8.62s/it, gpt_loss=0.235, loss_mean=0.286][A
+[LID Router Debug] Step: 5390
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [3, 9, 9, 3, 4, 5, 2, 2, 4, 2]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  84%|████████▍ | 5389/6434 [12:38:22<2:30:07,  8.62s/it, gpt_loss=0.327, loss_mean=0.29] [A
+Train step of epoch 0:  84%|████████▍ | 5390/6434 [12:38:22<2:34:41,  8.89s/it, gpt_loss=0.327, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5390/6434 [12:38:30<2:34:41,  8.89s/it, gpt_loss=0.251, loss_mean=0.286][A
+Train step of epoch 0:  84%|████████▍ | 5391/6434 [12:38:30<2:31:31,  8.72s/it, gpt_loss=0.251, loss_mean=0.286][A
+Train step of epoch 0:  84%|████████▍ | 5391/6434 [12:38:38<2:31:31,  8.72s/it, gpt_loss=0.367, loss_mean=0.294][A
+Train step of epoch 0:  84%|████████▍ | 5392/6434 [12:38:38<2:27:44,  8.51s/it, gpt_loss=0.367, loss_mean=0.294][A
+Train step of epoch 0:  84%|████████▍ | 5392/6434 [12:38:47<2:27:44,  8.51s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  84%|████████▍ | 5393/6434 [12:38:47<2:29:08,  8.60s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  84%|████████▍ | 5393/6434 [12:38:55<2:29:08,  8.60s/it, gpt_loss=0.3, loss_mean=0.294]  [A
+Train step of epoch 0:  84%|████████▍ | 5394/6434 [12:38:55<2:25:14,  8.38s/it, gpt_loss=0.3, loss_mean=0.294][A
+Train step of epoch 0:  84%|████████▍ | 5394/6434 [12:39:03<2:25:14,  8.38s/it, gpt_loss=0.313, loss_mean=0.296][A
+Train step of epoch 0:  84%|████████▍ | 5395/6434 [12:39:03<2:26:52,  8.48s/it, gpt_loss=0.313, loss_mean=0.296][A
+Train step of epoch 0:  84%|████████▍ | 5395/6434 [12:39:11<2:26:52,  8.48s/it, gpt_loss=0.259, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5396/6434 [12:39:11<2:21:31,  8.18s/it, gpt_loss=0.259, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5396/6434 [12:39:19<2:21:31,  8.18s/it, gpt_loss=0.352, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5397/6434 [12:39:19<2:20:54,  8.15s/it, gpt_loss=0.352, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5397/6434 [12:39:29<2:20:54,  8.15s/it, gpt_loss=0.313, loss_mean=0.3]  [A
+Train step of epoch 0:  84%|████████▍ | 5398/6434 [12:39:29<2:28:12,  8.58s/it, gpt_loss=0.313, loss_mean=0.3][A
+Train step of epoch 0:  84%|████████▍ | 5398/6434 [12:39:37<2:28:12,  8.58s/it, gpt_loss=0.241, loss_mean=0.294][A
+Train step of epoch 0:  84%|████████▍ | 5399/6434 [12:39:37<2:26:54,  8.52s/it, gpt_loss=0.241, loss_mean=0.294][A
+[LID Router Debug] Step: 5400
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [4, 9, 6, 9, 2, 4, 0, 3, 9, 2]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+[2026-02-07 04:35:50,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=0, lr=[1.6555156485255435e-05, 1.6555156485255435e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 04:35:50,385] [INFO] [timer.py:260:stop] epoch=0/micro_step=5400/global_step=2700, RunningAvgSamplesPerSec=4.748330549973422, CurrSamplesPerSec=4.623270131588996, MemAllocated=12.91GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  84%|████████▍ | 5399/6434 [12:39:46<2:26:54,  8.52s/it, gpt_loss=0.305, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5400/6434 [12:39:46<2:29:08,  8.65s/it, gpt_loss=0.305, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5400/6434 [12:39:55<2:29:08,  8.65s/it, gpt_loss=0.245, loss_mean=0.29] [A
+Train step of epoch 0:  84%|████████▍ | 5401/6434 [12:39:55<2:32:17,  8.85s/it, gpt_loss=0.245, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5401/6434 [12:40:04<2:32:17,  8.85s/it, gpt_loss=0.288, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5402/6434 [12:40:04<2:31:45,  8.82s/it, gpt_loss=0.288, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5402/6434 [12:40:13<2:31:45,  8.82s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  84%|████████▍ | 5403/6434 [12:40:13<2:31:43,  8.83s/it, gpt_loss=0.26, loss_mean=0.287][A
+Train step of epoch 0:  84%|████████▍ | 5403/6434 [12:40:21<2:31:43,  8.83s/it, gpt_loss=0.297, loss_mean=0.288][A
+Train step of epoch 0:  84%|████████▍ | 5404/6434 [12:40:21<2:28:22,  8.64s/it, gpt_loss=0.297, loss_mean=0.288][A
+Train step of epoch 0:  84%|████████▍ | 5404/6434 [12:40:30<2:28:22,  8.64s/it, gpt_loss=0.318, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▍ | 5405/6434 [12:40:30<2:29:08,  8.70s/it, gpt_loss=0.318, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▍ | 5405/6434 [12:40:38<2:29:08,  8.70s/it, gpt_loss=0.294, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▍ | 5406/6434 [12:40:38<2:26:03,  8.52s/it, gpt_loss=0.294, loss_mean=0.291][A
+Train step of epoch 0:  84%|████████▍ | 5406/6434 [12:40:47<2:26:03,  8.52s/it, gpt_loss=0.296, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5407/6434 [12:40:47<2:30:10,  8.77s/it, gpt_loss=0.296, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5407/6434 [12:40:56<2:30:10,  8.77s/it, gpt_loss=0.26, loss_mean=0.288] [A
+Train step of epoch 0:  84%|████████▍ | 5408/6434 [12:40:56<2:27:08,  8.60s/it, gpt_loss=0.26, loss_mean=0.288][A
+Train step of epoch 0:  84%|████████▍ | 5408/6434 [12:41:05<2:27:08,  8.60s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  84%|████████▍ | 5409/6434 [12:41:05<2:28:58,  8.72s/it, gpt_loss=0.296, loss_mean=0.289][A
+[LID Router Debug] Step: 5410
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [7, 1, 5, 2, 2, 9, 1, 2, 4, 2]
+Active Experts in Batch: {1, 2, 4, 5, 7, 9}
+
+Train step of epoch 0:  84%|████████▍ | 5409/6434 [12:41:13<2:28:58,  8.72s/it, gpt_loss=0.327, loss_mean=0.293][A
+Train step of epoch 0:  84%|████████▍ | 5410/6434 [12:41:13<2:26:34,  8.59s/it, gpt_loss=0.327, loss_mean=0.293][A
+Train step of epoch 0:  84%|████████▍ | 5410/6434 [12:41:21<2:26:34,  8.59s/it, gpt_loss=0.268, loss_mean=0.29] [A
+Train step of epoch 0:  84%|████████▍ | 5411/6434 [12:41:21<2:23:19,  8.41s/it, gpt_loss=0.268, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5411/6434 [12:41:30<2:23:19,  8.41s/it, gpt_loss=0.268, loss_mean=0.288][A
+Train step of epoch 0:  84%|████████▍ | 5412/6434 [12:41:30<2:25:52,  8.56s/it, gpt_loss=0.268, loss_mean=0.288][A
+Train step of epoch 0:  84%|████████▍ | 5412/6434 [12:41:38<2:25:52,  8.56s/it, gpt_loss=0.325, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5413/6434 [12:41:38<2:22:01,  8.35s/it, gpt_loss=0.325, loss_mean=0.292][A
+Train step of epoch 0:  84%|████████▍ | 5413/6434 [12:41:45<2:22:01,  8.35s/it, gpt_loss=0.274, loss_mean=0.29] [A
+Train step of epoch 0:  84%|████████▍ | 5414/6434 [12:41:45<2:19:08,  8.19s/it, gpt_loss=0.274, loss_mean=0.29][A
+Train step of epoch 0:  84%|████████▍ | 5414/6434 [12:41:55<2:19:08,  8.19s/it, gpt_loss=0.336, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5415/6434 [12:41:55<2:25:26,  8.56s/it, gpt_loss=0.336, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5415/6434 [12:42:04<2:25:26,  8.56s/it, gpt_loss=0.362, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5416/6434 [12:42:04<2:26:23,  8.63s/it, gpt_loss=0.362, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5416/6434 [12:42:12<2:26:23,  8.63s/it, gpt_loss=0.294, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5417/6434 [12:42:12<2:24:24,  8.52s/it, gpt_loss=0.294, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5417/6434 [12:42:20<2:24:24,  8.52s/it, gpt_loss=0.255, loss_mean=0.296][A
+Train step of epoch 0:  84%|████████▍ | 5418/6434 [12:42:20<2:21:38,  8.36s/it, gpt_loss=0.255, loss_mean=0.296][A
+Train step of epoch 0:  84%|████████▍ | 5418/6434 [12:42:28<2:21:38,  8.36s/it, gpt_loss=0.305, loss_mean=0.297][A
+Train step of epoch 0:  84%|████████▍ | 5419/6434 [12:42:28<2:18:39,  8.20s/it, gpt_loss=0.305, loss_mean=0.297][A
+[LID Router Debug] Step: 5420
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [2, 6, 3, 2, 6, 4, 5, 9, 1, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  84%|████████▍ | 5419/6434 [12:42:36<2:18:39,  8.20s/it, gpt_loss=0.308, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5420/6434 [12:42:36<2:20:05,  8.29s/it, gpt_loss=0.308, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5420/6434 [12:42:44<2:20:05,  8.29s/it, gpt_loss=0.271, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5421/6434 [12:42:44<2:16:25,  8.08s/it, gpt_loss=0.271, loss_mean=0.295][A
+Train step of epoch 0:  84%|████████▍ | 5421/6434 [12:42:52<2:16:25,  8.08s/it, gpt_loss=0.421, loss_mean=0.308][A
+Train step of epoch 0:  84%|████████▍ | 5422/6434 [12:42:52<2:15:53,  8.06s/it, gpt_loss=0.421, loss_mean=0.308][A
+Train step of epoch 0:  84%|████████▍ | 5422/6434 [12:43:01<2:15:53,  8.06s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  84%|████████▍ | 5423/6434 [12:43:01<2:19:23,  8.27s/it, gpt_loss=0.299, loss_mean=0.307][A
+Train step of epoch 0:  84%|████████▍ | 5423/6434 [12:43:08<2:19:23,  8.27s/it, gpt_loss=0.26, loss_mean=0.302] [A
+Train step of epoch 0:  84%|████████▍ | 5424/6434 [12:43:08<2:15:20,  8.04s/it, gpt_loss=0.26, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▍ | 5424/6434 [12:43:16<2:15:20,  8.04s/it, gpt_loss=0.262, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5425/6434 [12:43:16<2:14:12,  7.98s/it, gpt_loss=0.262, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5425/6434 [12:43:24<2:14:12,  7.98s/it, gpt_loss=0.323, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5426/6434 [12:43:24<2:12:13,  7.87s/it, gpt_loss=0.323, loss_mean=0.301][A
+Train step of epoch 0:  84%|████████▍ | 5426/6434 [12:43:32<2:12:13,  7.87s/it, gpt_loss=0.289, loss_mean=0.3]  [A
+Train step of epoch 0:  84%|████████▍ | 5427/6434 [12:43:32<2:13:24,  7.95s/it, gpt_loss=0.289, loss_mean=0.3][A
+Train step of epoch 0:  84%|████████▍ | 5427/6434 [12:43:40<2:13:24,  7.95s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5428/6434 [12:43:40<2:16:43,  8.15s/it, gpt_loss=0.287, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5428/6434 [12:43:49<2:16:43,  8.15s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0:  84%|████████▍ | 5429/6434 [12:43:49<2:18:26,  8.27s/it, gpt_loss=0.257, loss_mean=0.294][A
+[LID Router Debug] Step: 5430
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 6, 9, 2, 4, 4, 0, 5, 0, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  84%|████████▍ | 5429/6434 [12:43:58<2:18:26,  8.27s/it, gpt_loss=0.327, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5430/6434 [12:43:58<2:22:05,  8.49s/it, gpt_loss=0.327, loss_mean=0.298][A
+Train step of epoch 0:  84%|████████▍ | 5430/6434 [12:44:07<2:22:05,  8.49s/it, gpt_loss=0.367, loss_mean=0.304][A
+Train step of epoch 0:  84%|████████▍ | 5431/6434 [12:44:07<2:27:14,  8.81s/it, gpt_loss=0.367, loss_mean=0.304][A
+Train step of epoch 0:  84%|████████▍ | 5431/6434 [12:44:15<2:27:14,  8.81s/it, gpt_loss=0.306, loss_mean=0.305][A
+Train step of epoch 0:  84%|████████▍ | 5432/6434 [12:44:15<2:22:15,  8.52s/it, gpt_loss=0.306, loss_mean=0.305][A
+Train step of epoch 0:  84%|████████▍ | 5432/6434 [12:44:24<2:22:15,  8.52s/it, gpt_loss=0.337, loss_mean=0.308][A
+Train step of epoch 0:  84%|████████▍ | 5433/6434 [12:44:24<2:22:53,  8.57s/it, gpt_loss=0.337, loss_mean=0.308][A
+Train step of epoch 0:  84%|████████▍ | 5433/6434 [12:44:33<2:22:53,  8.57s/it, gpt_loss=0.319, loss_mean=0.309][A
+Train step of epoch 0:  84%|████████▍ | 5434/6434 [12:44:33<2:24:23,  8.66s/it, gpt_loss=0.319, loss_mean=0.309][A
+Train step of epoch 0:  84%|████████▍ | 5434/6434 [12:44:42<2:24:23,  8.66s/it, gpt_loss=0.236, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▍ | 5435/6434 [12:44:42<2:24:38,  8.69s/it, gpt_loss=0.236, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▍ | 5435/6434 [12:44:50<2:24:38,  8.69s/it, gpt_loss=0.304, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▍ | 5436/6434 [12:44:50<2:23:31,  8.63s/it, gpt_loss=0.304, loss_mean=0.302][A
+Train step of epoch 0:  84%|████████▍ | 5436/6434 [12:44:59<2:23:31,  8.63s/it, gpt_loss=0.294, loss_mean=0.301][A
+Train step of epoch 0:  85%|████████▍ | 5437/6434 [12:44:59<2:25:25,  8.75s/it, gpt_loss=0.294, loss_mean=0.301][A
+Train step of epoch 0:  85%|████████▍ | 5437/6434 [12:45:07<2:25:25,  8.75s/it, gpt_loss=0.333, loss_mean=0.304][A
+Train step of epoch 0:  85%|████████▍ | 5438/6434 [12:45:07<2:23:13,  8.63s/it, gpt_loss=0.333, loss_mean=0.304][A
+Train step of epoch 0:  85%|████████▍ | 5438/6434 [12:45:16<2:23:13,  8.63s/it, gpt_loss=0.28, loss_mean=0.302] [A
+Train step of epoch 0:  85%|████████▍ | 5439/6434 [12:45:16<2:21:03,  8.51s/it, gpt_loss=0.28, loss_mean=0.302][A
+[LID Router Debug] Step: 5440
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [2, 5, 9, 2, 2, 0, 6, 4, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  85%|████████▍ | 5439/6434 [12:45:24<2:21:03,  8.51s/it, gpt_loss=0.257, loss_mean=0.297][A
+Train step of epoch 0:  85%|████████▍ | 5440/6434 [12:45:24<2:21:51,  8.56s/it, gpt_loss=0.257, loss_mean=0.297][A
+Train step of epoch 0:  85%|████████▍ | 5440/6434 [12:45:32<2:21:51,  8.56s/it, gpt_loss=0.354, loss_mean=0.303][A
+Train step of epoch 0:  85%|████████▍ | 5441/6434 [12:45:32<2:18:17,  8.36s/it, gpt_loss=0.354, loss_mean=0.303][A
+Train step of epoch 0:  85%|████████▍ | 5441/6434 [12:45:41<2:18:17,  8.36s/it, gpt_loss=0.36, loss_mean=0.309] [A
+Train step of epoch 0:  85%|████████▍ | 5442/6434 [12:45:41<2:20:05,  8.47s/it, gpt_loss=0.36, loss_mean=0.309][A
+Train step of epoch 0:  85%|████████▍ | 5442/6434 [12:45:50<2:20:05,  8.47s/it, gpt_loss=0.363, loss_mean=0.314][A
+Train step of epoch 0:  85%|████████▍ | 5443/6434 [12:45:50<2:23:16,  8.67s/it, gpt_loss=0.363, loss_mean=0.314][A
+Train step of epoch 0:  85%|████████▍ | 5443/6434 [12:45:58<2:23:16,  8.67s/it, gpt_loss=0.208, loss_mean=0.303][A
+Train step of epoch 0:  85%|████████▍ | 5444/6434 [12:45:58<2:21:20,  8.57s/it, gpt_loss=0.208, loss_mean=0.303][A
+Train step of epoch 0:  85%|████████▍ | 5444/6434 [12:46:07<2:21:20,  8.57s/it, gpt_loss=0.278, loss_mean=0.301][A
+Train step of epoch 0:  85%|████████▍ | 5445/6434 [12:46:07<2:23:09,  8.69s/it, gpt_loss=0.278, loss_mean=0.301][A
+Train step of epoch 0:  85%|████████▍ | 5445/6434 [12:46:15<2:23:09,  8.69s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  85%|████████▍ | 5446/6434 [12:46:15<2:18:40,  8.42s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  85%|████████▍ | 5446/6434 [12:46:23<2:18:40,  8.42s/it, gpt_loss=0.276, loss_mean=0.299][A
+Train step of epoch 0:  85%|████████▍ | 5447/6434 [12:46:23<2:16:08,  8.28s/it, gpt_loss=0.276, loss_mean=0.299][A
+Train step of epoch 0:  85%|████████▍ | 5447/6434 [12:46:31<2:16:08,  8.28s/it, gpt_loss=0.255, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5448/6434 [12:46:31<2:15:29,  8.24s/it, gpt_loss=0.255, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5448/6434 [12:46:40<2:15:29,  8.24s/it, gpt_loss=0.208, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▍ | 5449/6434 [12:46:40<2:15:35,  8.26s/it, gpt_loss=0.208, loss_mean=0.286][A
+[LID Router Debug] Step: 5450
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [2, 0, 9, 2, 4, 0, 4, 5, 5, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  85%|████████▍ | 5449/6434 [12:46:48<2:15:35,  8.26s/it, gpt_loss=0.294, loss_mean=0.287][A
+Train step of epoch 0:  85%|████████▍ | 5450/6434 [12:46:48<2:13:56,  8.17s/it, gpt_loss=0.294, loss_mean=0.287][A
+Train step of epoch 0:  85%|████████▍ | 5450/6434 [12:46:56<2:13:56,  8.17s/it, gpt_loss=0.271, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5451/6434 [12:46:56<2:14:25,  8.20s/it, gpt_loss=0.271, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5451/6434 [12:47:04<2:14:25,  8.20s/it, gpt_loss=0.243, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▍ | 5452/6434 [12:47:04<2:16:32,  8.34s/it, gpt_loss=0.243, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▍ | 5452/6434 [12:47:13<2:16:32,  8.34s/it, gpt_loss=0.296, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▍ | 5453/6434 [12:47:13<2:16:03,  8.32s/it, gpt_loss=0.296, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▍ | 5453/6434 [12:47:22<2:16:03,  8.32s/it, gpt_loss=0.303, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5454/6434 [12:47:22<2:22:10,  8.70s/it, gpt_loss=0.303, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5454/6434 [12:47:30<2:22:10,  8.70s/it, gpt_loss=0.423, loss_mean=0.299][A
+Train step of epoch 0:  85%|████████▍ | 5455/6434 [12:47:30<2:18:13,  8.47s/it, gpt_loss=0.423, loss_mean=0.299][A
+Train step of epoch 0:  85%|████████▍ | 5455/6434 [12:47:39<2:18:13,  8.47s/it, gpt_loss=0.281, loss_mean=0.297][A
+Train step of epoch 0:  85%|████████▍ | 5456/6434 [12:47:39<2:20:57,  8.65s/it, gpt_loss=0.281, loss_mean=0.297][A
+Train step of epoch 0:  85%|████████▍ | 5456/6434 [12:47:48<2:20:57,  8.65s/it, gpt_loss=0.227, loss_mean=0.29] [A
+Train step of epoch 0:  85%|████████▍ | 5457/6434 [12:47:48<2:21:32,  8.69s/it, gpt_loss=0.227, loss_mean=0.29][A
+Train step of epoch 0:  85%|████████▍ | 5457/6434 [12:47:57<2:21:32,  8.69s/it, gpt_loss=0.307, loss_mean=0.292][A
+Train step of epoch 0:  85%|████████▍ | 5458/6434 [12:47:57<2:21:29,  8.70s/it, gpt_loss=0.307, loss_mean=0.292][A
+Train step of epoch 0:  85%|████████▍ | 5458/6434 [12:48:06<2:21:29,  8.70s/it, gpt_loss=0.351, loss_mean=0.297][A
+Train step of epoch 0:  85%|████████▍ | 5459/6434 [12:48:06<2:21:42,  8.72s/it, gpt_loss=0.351, loss_mean=0.297][A
+[LID Router Debug] Step: 5460
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 3, 6, 1, 2, 3, 0, 5, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+Train step of epoch 0:  85%|████████▍ | 5459/6434 [12:48:14<2:21:42,  8.72s/it, gpt_loss=0.228, loss_mean=0.29] [A
+Train step of epoch 0:  85%|████████▍ | 5460/6434 [12:48:14<2:20:26,  8.65s/it, gpt_loss=0.228, loss_mean=0.29][A
+Train step of epoch 0:  85%|████████▍ | 5460/6434 [12:48:23<2:20:26,  8.65s/it, gpt_loss=0.246, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▍ | 5461/6434 [12:48:23<2:23:22,  8.84s/it, gpt_loss=0.246, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▍ | 5461/6434 [12:48:32<2:23:22,  8.84s/it, gpt_loss=0.268, loss_mean=0.284][A
+Train step of epoch 0:  85%|████████▍ | 5462/6434 [12:48:32<2:22:16,  8.78s/it, gpt_loss=0.268, loss_mean=0.284][A
+Train step of epoch 0:  85%|████████▍ | 5462/6434 [12:48:41<2:22:16,  8.78s/it, gpt_loss=0.291, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5463/6434 [12:48:41<2:21:04,  8.72s/it, gpt_loss=0.291, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▍ | 5463/6434 [12:48:48<2:21:04,  8.72s/it, gpt_loss=0.391, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5464/6434 [12:48:48<2:12:55,  8.22s/it, gpt_loss=0.391, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5464/6434 [12:48:56<2:12:55,  8.22s/it, gpt_loss=0.305, loss_mean=0.296][A
+Train step of epoch 0:  85%|████████▍ | 5465/6434 [12:48:56<2:12:06,  8.18s/it, gpt_loss=0.305, loss_mean=0.296][A
+Train step of epoch 0:  85%|████████▍ | 5465/6434 [12:49:05<2:12:06,  8.18s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  85%|████████▍ | 5466/6434 [12:49:05<2:17:14,  8.51s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  85%|████████▍ | 5466/6434 [12:49:13<2:17:14,  8.51s/it, gpt_loss=0.301, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5467/6434 [12:49:13<2:15:13,  8.39s/it, gpt_loss=0.301, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5467/6434 [12:49:21<2:15:13,  8.39s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5468/6434 [12:49:21<2:11:59,  8.20s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  85%|████████▍ | 5468/6434 [12:49:29<2:11:59,  8.20s/it, gpt_loss=0.339, loss_mean=0.299][A
+Train step of epoch 0:  85%|████████▌ | 5469/6434 [12:49:29<2:13:00,  8.27s/it, gpt_loss=0.339, loss_mean=0.299][A
+[LID Router Debug] Step: 5470
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [0, 4, 2, 0, 3, 9, 1, 2, 4, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  85%|████████▌ | 5469/6434 [12:49:37<2:13:00,  8.27s/it, gpt_loss=0.227, loss_mean=0.292][A
+Train step of epoch 0:  85%|████████▌ | 5470/6434 [12:49:37<2:10:16,  8.11s/it, gpt_loss=0.227, loss_mean=0.292][A
+Train step of epoch 0:  85%|████████▌ | 5470/6434 [12:49:46<2:10:16,  8.11s/it, gpt_loss=0.307, loss_mean=0.294][A
+Train step of epoch 0:  85%|████████▌ | 5471/6434 [12:49:46<2:16:09,  8.48s/it, gpt_loss=0.307, loss_mean=0.294][A
+Train step of epoch 0:  85%|████████▌ | 5471/6434 [12:49:55<2:16:09,  8.48s/it, gpt_loss=0.269, loss_mean=0.291][A
+Train step of epoch 0:  85%|████████▌ | 5472/6434 [12:49:55<2:18:33,  8.64s/it, gpt_loss=0.269, loss_mean=0.291][A
+Train step of epoch 0:  85%|████████▌ | 5472/6434 [12:50:04<2:18:33,  8.64s/it, gpt_loss=0.227, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5473/6434 [12:50:04<2:16:50,  8.54s/it, gpt_loss=0.227, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5473/6434 [12:50:13<2:16:50,  8.54s/it, gpt_loss=0.335, loss_mean=0.29] [A
+Train step of epoch 0:  85%|████████▌ | 5474/6434 [12:50:13<2:18:50,  8.68s/it, gpt_loss=0.335, loss_mean=0.29][A
+Train step of epoch 0:  85%|████████▌ | 5474/6434 [12:50:22<2:18:50,  8.68s/it, gpt_loss=0.215, loss_mean=0.282][A
+Train step of epoch 0:  85%|████████▌ | 5475/6434 [12:50:22<2:20:42,  8.80s/it, gpt_loss=0.215, loss_mean=0.282][A
+Train step of epoch 0:  85%|████████▌ | 5475/6434 [12:50:30<2:20:42,  8.80s/it, gpt_loss=0.248, loss_mean=0.279][A
+Train step of epoch 0:  85%|████████▌ | 5476/6434 [12:50:30<2:16:56,  8.58s/it, gpt_loss=0.248, loss_mean=0.279][A
+Train step of epoch 0:  85%|████████▌ | 5476/6434 [12:50:38<2:16:56,  8.58s/it, gpt_loss=0.284, loss_mean=0.279][A
+Train step of epoch 0:  85%|████████▌ | 5477/6434 [12:50:38<2:15:02,  8.47s/it, gpt_loss=0.284, loss_mean=0.279][A
+Train step of epoch 0:  85%|████████▌ | 5477/6434 [12:50:47<2:15:02,  8.47s/it, gpt_loss=0.293, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5478/6434 [12:50:47<2:15:01,  8.47s/it, gpt_loss=0.293, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5478/6434 [12:50:55<2:15:01,  8.47s/it, gpt_loss=0.297, loss_mean=0.282][A
+Train step of epoch 0:  85%|████████▌ | 5479/6434 [12:50:55<2:14:21,  8.44s/it, gpt_loss=0.297, loss_mean=0.282][A
+[LID Router Debug] Step: 5480
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [3, 0, 0, 9, 0, 8, 9, 4, 3, 2]
+Active Experts in Batch: {0, 2, 3, 4, 8, 9}
+
+Train step of epoch 0:  85%|████████▌ | 5479/6434 [12:51:03<2:14:21,  8.44s/it, gpt_loss=0.26, loss_mean=0.28]  [A
+Train step of epoch 0:  85%|████████▌ | 5480/6434 [12:51:03<2:13:09,  8.38s/it, gpt_loss=0.26, loss_mean=0.28][A
+Train step of epoch 0:  85%|████████▌ | 5480/6434 [12:51:14<2:13:09,  8.38s/it, gpt_loss=0.291, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5481/6434 [12:51:14<2:24:22,  9.09s/it, gpt_loss=0.291, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5481/6434 [12:51:22<2:24:22,  9.09s/it, gpt_loss=0.245, loss_mean=0.278][A
+Train step of epoch 0:  85%|████████▌ | 5482/6434 [12:51:22<2:17:33,  8.67s/it, gpt_loss=0.245, loss_mean=0.278][A
+Train step of epoch 0:  85%|████████▌ | 5482/6434 [12:51:31<2:17:33,  8.67s/it, gpt_loss=0.334, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5483/6434 [12:51:31<2:18:40,  8.75s/it, gpt_loss=0.334, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5483/6434 [12:51:38<2:18:40,  8.75s/it, gpt_loss=0.277, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5484/6434 [12:51:38<2:11:39,  8.31s/it, gpt_loss=0.277, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5484/6434 [12:51:47<2:11:39,  8.31s/it, gpt_loss=0.311, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▌ | 5485/6434 [12:51:47<2:14:07,  8.48s/it, gpt_loss=0.311, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▌ | 5485/6434 [12:51:56<2:14:07,  8.48s/it, gpt_loss=0.29, loss_mean=0.286] [A
+Train step of epoch 0:  85%|████████▌ | 5486/6434 [12:51:56<2:20:16,  8.88s/it, gpt_loss=0.29, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▌ | 5486/6434 [12:52:05<2:20:16,  8.88s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▌ | 5487/6434 [12:52:05<2:16:23,  8.64s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  85%|████████▌ | 5487/6434 [12:52:13<2:16:23,  8.64s/it, gpt_loss=0.264, loss_mean=0.284][A
+Train step of epoch 0:  85%|████████▌ | 5488/6434 [12:52:13<2:15:32,  8.60s/it, gpt_loss=0.264, loss_mean=0.284][A
+Train step of epoch 0:  85%|████████▌ | 5488/6434 [12:52:20<2:15:32,  8.60s/it, gpt_loss=0.282, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5489/6434 [12:52:20<2:08:47,  8.18s/it, gpt_loss=0.282, loss_mean=0.283][A
+[LID Router Debug] Step: 5490
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [9, 5, 3, 0, 0, 5, 5, 4, 3, 5]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+Train step of epoch 0:  85%|████████▌ | 5489/6434 [12:52:29<2:08:47,  8.18s/it, gpt_loss=0.36, loss_mean=0.291] [A
+Train step of epoch 0:  85%|████████▌ | 5490/6434 [12:52:29<2:12:28,  8.42s/it, gpt_loss=0.36, loss_mean=0.291][A
+Train step of epoch 0:  85%|████████▌ | 5490/6434 [12:52:37<2:12:28,  8.42s/it, gpt_loss=0.231, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5491/6434 [12:52:37<2:09:57,  8.27s/it, gpt_loss=0.231, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5491/6434 [12:52:45<2:09:57,  8.27s/it, gpt_loss=0.214, loss_mean=0.278][A
+Train step of epoch 0:  85%|████████▌ | 5492/6434 [12:52:45<2:10:01,  8.28s/it, gpt_loss=0.214, loss_mean=0.278][A
+Train step of epoch 0:  85%|████████▌ | 5492/6434 [12:52:53<2:10:01,  8.28s/it, gpt_loss=0.31, loss_mean=0.281] [A
+Train step of epoch 0:  85%|████████▌ | 5493/6434 [12:52:53<2:06:00,  8.03s/it, gpt_loss=0.31, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5493/6434 [12:53:01<2:06:00,  8.03s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5494/6434 [12:53:01<2:07:42,  8.15s/it, gpt_loss=0.294, loss_mean=0.283][A
+Train step of epoch 0:  85%|████████▌ | 5494/6434 [12:53:10<2:07:42,  8.15s/it, gpt_loss=0.232, loss_mean=0.277][A
+Train step of epoch 0:  85%|████████▌ | 5495/6434 [12:53:10<2:08:42,  8.22s/it, gpt_loss=0.232, loss_mean=0.277][A
+Train step of epoch 0:  85%|████████▌ | 5495/6434 [12:53:18<2:08:42,  8.22s/it, gpt_loss=0.316, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5496/6434 [12:53:18<2:09:17,  8.27s/it, gpt_loss=0.316, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5496/6434 [12:53:28<2:09:17,  8.27s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  85%|████████▌ | 5497/6434 [12:53:28<2:14:20,  8.60s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  85%|████████▌ | 5497/6434 [12:53:37<2:14:20,  8.60s/it, gpt_loss=0.277, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5498/6434 [12:53:37<2:16:04,  8.72s/it, gpt_loss=0.277, loss_mean=0.281][A
+Train step of epoch 0:  85%|████████▌ | 5498/6434 [12:53:46<2:16:04,  8.72s/it, gpt_loss=0.29, loss_mean=0.282] [A
+Train step of epoch 0:  85%|████████▌ | 5499/6434 [12:53:46<2:17:18,  8.81s/it, gpt_loss=0.29, loss_mean=0.282][A
+[LID Router Debug] Step: 5500
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [2, 3, 6, 5, 5, 9, 5, 2, 3, 9]
+Active Experts in Batch: {2, 3, 5, 6, 9}
+
+Train step of epoch 0:  85%|████████▌ | 5499/6434 [12:53:54<2:17:18,  8.81s/it, gpt_loss=0.336, loss_mean=0.288][A
+Train step of epoch 0:  85%|████████▌ | 5500/6434 [12:53:54<2:17:29,  8.83s/it, gpt_loss=0.336, loss_mean=0.288][A
+Train step of epoch 0:  85%|████████▌ | 5500/6434 [12:54:04<2:17:29,  8.83s/it, gpt_loss=0.264, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5501/6434 [12:54:04<2:19:52,  8.99s/it, gpt_loss=0.264, loss_mean=0.285][A
+Train step of epoch 0:  85%|████████▌ | 5501/6434 [12:54:13<2:19:52,  8.99s/it, gpt_loss=0.244, loss_mean=0.281][A
+Train step of epoch 0:  86%|████████▌ | 5502/6434 [12:54:13<2:18:55,  8.94s/it, gpt_loss=0.244, loss_mean=0.281][A
+Train step of epoch 0:  86%|████████▌ | 5502/6434 [12:54:21<2:18:55,  8.94s/it, gpt_loss=0.276, loss_mean=0.281][A
+Train step of epoch 0:  86%|████████▌ | 5503/6434 [12:54:21<2:15:39,  8.74s/it, gpt_loss=0.276, loss_mean=0.281][A
+Train step of epoch 0:  86%|████████▌ | 5503/6434 [12:54:30<2:15:39,  8.74s/it, gpt_loss=0.345, loss_mean=0.287][A
+Train step of epoch 0:  86%|████████▌ | 5504/6434 [12:54:30<2:15:41,  8.75s/it, gpt_loss=0.345, loss_mean=0.287][A
+Train step of epoch 0:  86%|████████▌ | 5504/6434 [12:54:39<2:15:41,  8.75s/it, gpt_loss=0.259, loss_mean=0.284][A
+Train step of epoch 0:  86%|████████▌ | 5505/6434 [12:54:39<2:17:09,  8.86s/it, gpt_loss=0.259, loss_mean=0.284][A
+Train step of epoch 0:  86%|████████▌ | 5505/6434 [12:54:48<2:17:09,  8.86s/it, gpt_loss=0.226, loss_mean=0.279][A
+Train step of epoch 0:  86%|████████▌ | 5506/6434 [12:54:48<2:17:14,  8.87s/it, gpt_loss=0.226, loss_mean=0.279][A
+Train step of epoch 0:  86%|████████▌ | 5506/6434 [12:54:55<2:17:14,  8.87s/it, gpt_loss=0.237, loss_mean=0.274][A
+Train step of epoch 0:  86%|████████▌ | 5507/6434 [12:54:55<2:11:24,  8.50s/it, gpt_loss=0.237, loss_mean=0.274][A
+Train step of epoch 0:  86%|████████▌ | 5507/6434 [12:55:05<2:11:24,  8.50s/it, gpt_loss=0.32, loss_mean=0.279] [A
+Train step of epoch 0:  86%|████████▌ | 5508/6434 [12:55:05<2:14:25,  8.71s/it, gpt_loss=0.32, loss_mean=0.279][A
+Train step of epoch 0:  86%|████████▌ | 5508/6434 [12:55:13<2:14:25,  8.71s/it, gpt_loss=0.347, loss_mean=0.286][A
+Train step of epoch 0:  86%|████████▌ | 5509/6434 [12:55:13<2:11:17,  8.52s/it, gpt_loss=0.347, loss_mean=0.286][A
+[LID Router Debug] Step: 5510
+Batch Size: 10
+Audio Batch Size: 74
+LID Assignments: [1, 2, 5, 0, 2, 1, 2, 2, 2, 9]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+Train step of epoch 0:  86%|████████▌ | 5509/6434 [12:55:20<2:11:17,  8.52s/it, gpt_loss=0.314, loss_mean=0.289][A
+Train step of epoch 0:  86%|████████▌ | 5510/6434 [12:55:20<2:06:01,  8.18s/it, gpt_loss=0.314, loss_mean=0.289][A
+Train step of epoch 0:  86%|████████▌ | 5510/6434 [12:55:29<2:06:01,  8.18s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  86%|████████▌ | 5511/6434 [12:55:29<2:09:29,  8.42s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  86%|████████▌ | 5511/6434 [12:55:36<2:09:29,  8.42s/it, gpt_loss=0.264, loss_mean=0.283][A
+Train step of epoch 0:  86%|████████▌ | 5512/6434 [12:55:36<2:00:56,  7.87s/it, gpt_loss=0.264, loss_mean=0.283][A
+Train step of epoch 0:  86%|████████▌ | 5512/6434 [12:55:43<2:00:56,  7.87s/it, gpt_loss=0.239, loss_mean=0.279][A
+Train step of epoch 0:  86%|████████▌ | 5513/6434 [12:55:43<1:59:32,  7.79s/it, gpt_loss=0.239, loss_mean=0.279][A
+Train step of epoch 0:  86%|████████▌ | 5513/6434 [12:55:52<1:59:32,  7.79s/it, gpt_loss=0.306, loss_mean=0.282][A
+Train step of epoch 0:  86%|████████▌ | 5514/6434 [12:55:52<2:02:16,  7.97s/it, gpt_loss=0.306, loss_mean=0.282][A
+Train step of epoch 0:  86%|████████▌ | 5514/6434 [12:56:00<2:02:16,  7.97s/it, gpt_loss=0.25, loss_mean=0.278] [A
+Train step of epoch 0:  86%|████████▌ | 5515/6434 [12:56:00<2:04:32,  8.13s/it, gpt_loss=0.25, loss_mean=0.278][A
+Train step of epoch 0:  86%|████████▌ | 5515/6434 [12:56:08<2:04:32,  8.13s/it, gpt_loss=0.255, loss_mean=0.276][A
+Train step of epoch 0:  86%|████████▌ | 5516/6434 [12:56:08<2:03:25,  8.07s/it, gpt_loss=0.255, loss_mean=0.276][A
+Train step of epoch 0:  86%|████████▌ | 5516/6434 [12:56:16<2:03:25,  8.07s/it, gpt_loss=0.242, loss_mean=0.273][A
+Train step of epoch 0:  86%|████████▌ | 5517/6434 [12:56:16<2:01:50,  7.97s/it, gpt_loss=0.242, loss_mean=0.273][A
+Train step of epoch 0:  86%|████████▌ | 5517/6434 [12:56:23<2:01:50,  7.97s/it, gpt_loss=0.327, loss_mean=0.278][A
+Train step of epoch 0:  86%|████████▌ | 5518/6434 [12:56:23<2:00:48,  7.91s/it, gpt_loss=0.327, loss_mean=0.278][A
+Train step of epoch 0:  86%|████████▌ | 5518/6434 [12:56:32<2:00:48,  7.91s/it, gpt_loss=0.36, loss_mean=0.286] [A
+Train step of epoch 0:  86%|████████▌ | 5519/6434 [12:56:32<2:04:22,  8.16s/it, gpt_loss=0.36, loss_mean=0.286][A
+[LID Router Debug] Step: 5520
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 6, 3, 4, 1, 5, 1, 9, 4, 6]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  86%|████████▌ | 5519/6434 [12:56:41<2:04:22,  8.16s/it, gpt_loss=0.327, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5520/6434 [12:56:41<2:07:28,  8.37s/it, gpt_loss=0.327, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5520/6434 [12:56:51<2:07:28,  8.37s/it, gpt_loss=0.318, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5521/6434 [12:56:51<2:13:53,  8.80s/it, gpt_loss=0.318, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5521/6434 [12:57:00<2:13:53,  8.80s/it, gpt_loss=0.364, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▌ | 5522/6434 [12:57:00<2:14:02,  8.82s/it, gpt_loss=0.364, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▌ | 5522/6434 [12:57:08<2:14:02,  8.82s/it, gpt_loss=0.292, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▌ | 5523/6434 [12:57:08<2:09:55,  8.56s/it, gpt_loss=0.292, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▌ | 5523/6434 [12:57:16<2:09:55,  8.56s/it, gpt_loss=0.321, loss_mean=0.301][A
+Train step of epoch 0:  86%|████████▌ | 5524/6434 [12:57:16<2:10:48,  8.62s/it, gpt_loss=0.321, loss_mean=0.301][A
+Train step of epoch 0:  86%|████████▌ | 5524/6434 [12:57:25<2:10:48,  8.62s/it, gpt_loss=0.287, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▌ | 5525/6434 [12:57:25<2:08:45,  8.50s/it, gpt_loss=0.287, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▌ | 5525/6434 [12:57:33<2:08:45,  8.50s/it, gpt_loss=0.222, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5526/6434 [12:57:33<2:09:09,  8.53s/it, gpt_loss=0.222, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5526/6434 [12:57:41<2:09:09,  8.53s/it, gpt_loss=0.25, loss_mean=0.288] [A
+Train step of epoch 0:  86%|████████▌ | 5527/6434 [12:57:41<2:07:17,  8.42s/it, gpt_loss=0.25, loss_mean=0.288][A
+Train step of epoch 0:  86%|████████▌ | 5527/6434 [12:57:51<2:07:17,  8.42s/it, gpt_loss=0.344, loss_mean=0.294][A
+Train step of epoch 0:  86%|████████▌ | 5528/6434 [12:57:51<2:10:25,  8.64s/it, gpt_loss=0.344, loss_mean=0.294][A
+Train step of epoch 0:  86%|████████▌ | 5528/6434 [12:57:59<2:10:25,  8.64s/it, gpt_loss=0.256, loss_mean=0.29] [A
+Train step of epoch 0:  86%|████████▌ | 5529/6434 [12:57:59<2:10:02,  8.62s/it, gpt_loss=0.256, loss_mean=0.29][A
+[LID Router Debug] Step: 5530
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [0, 1, 3, 0, 5, 0, 5, 3, 2, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5}
+
+Train step of epoch 0:  86%|████████▌ | 5529/6434 [12:58:08<2:10:02,  8.62s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  86%|████████▌ | 5530/6434 [12:58:08<2:12:18,  8.78s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  86%|████████▌ | 5530/6434 [12:58:17<2:12:18,  8.78s/it, gpt_loss=0.454, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▌ | 5531/6434 [12:58:17<2:09:54,  8.63s/it, gpt_loss=0.454, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▌ | 5531/6434 [12:58:26<2:09:54,  8.63s/it, gpt_loss=0.269, loss_mean=0.297][A
+Train step of epoch 0:  86%|████████▌ | 5532/6434 [12:58:26<2:11:02,  8.72s/it, gpt_loss=0.269, loss_mean=0.297][A
+Train step of epoch 0:  86%|████████▌ | 5532/6434 [12:58:35<2:11:02,  8.72s/it, gpt_loss=0.255, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5533/6434 [12:58:35<2:12:02,  8.79s/it, gpt_loss=0.255, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5533/6434 [12:58:43<2:12:02,  8.79s/it, gpt_loss=0.283, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5534/6434 [12:58:43<2:10:06,  8.67s/it, gpt_loss=0.283, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5534/6434 [12:58:52<2:10:06,  8.67s/it, gpt_loss=0.336, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5535/6434 [12:58:52<2:10:52,  8.74s/it, gpt_loss=0.336, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5535/6434 [12:59:01<2:10:52,  8.74s/it, gpt_loss=0.293, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5536/6434 [12:59:01<2:12:00,  8.82s/it, gpt_loss=0.293, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5536/6434 [12:59:10<2:12:00,  8.82s/it, gpt_loss=0.315, loss_mean=0.298][A
+Train step of epoch 0:  86%|████████▌ | 5537/6434 [12:59:10<2:14:24,  8.99s/it, gpt_loss=0.315, loss_mean=0.298][A
+Train step of epoch 0:  86%|████████▌ | 5537/6434 [12:59:20<2:14:24,  8.99s/it, gpt_loss=0.304, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▌ | 5538/6434 [12:59:20<2:16:45,  9.16s/it, gpt_loss=0.304, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▌ | 5538/6434 [12:59:28<2:16:45,  9.16s/it, gpt_loss=0.287, loss_mean=0.297][A
+Train step of epoch 0:  86%|████████▌ | 5539/6434 [12:59:28<2:12:26,  8.88s/it, gpt_loss=0.287, loss_mean=0.297][A
+[LID Router Debug] Step: 5540
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [0, 0, 9, 2, 5, 5, 5, 0, 3, 0]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+Train step of epoch 0:  86%|████████▌ | 5539/6434 [12:59:36<2:12:26,  8.88s/it, gpt_loss=0.243, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5540/6434 [12:59:36<2:06:54,  8.52s/it, gpt_loss=0.243, loss_mean=0.292][A
+Train step of epoch 0:  86%|████████▌ | 5540/6434 [12:59:45<2:06:54,  8.52s/it, gpt_loss=0.271, loss_mean=0.29] [A
+Train step of epoch 0:  86%|████████▌ | 5541/6434 [12:59:45<2:08:35,  8.64s/it, gpt_loss=0.271, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5541/6434 [12:59:53<2:08:35,  8.64s/it, gpt_loss=0.292, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5542/6434 [12:59:53<2:05:45,  8.46s/it, gpt_loss=0.292, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5542/6434 [13:00:02<2:05:45,  8.46s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  86%|████████▌ | 5543/6434 [13:00:02<2:10:53,  8.81s/it, gpt_loss=0.269, loss_mean=0.288][A
+Train step of epoch 0:  86%|████████▌ | 5543/6434 [13:00:10<2:10:53,  8.81s/it, gpt_loss=0.376, loss_mean=0.297][A
+Train step of epoch 0:  86%|████████▌ | 5544/6434 [13:00:10<2:06:27,  8.53s/it, gpt_loss=0.376, loss_mean=0.297][A
+Train step of epoch 0:  86%|████████▌ | 5544/6434 [13:00:19<2:06:27,  8.53s/it, gpt_loss=0.327, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▌ | 5545/6434 [13:00:19<2:06:07,  8.51s/it, gpt_loss=0.327, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▌ | 5545/6434 [13:00:27<2:06:07,  8.51s/it, gpt_loss=0.258, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5546/6434 [13:00:27<2:07:43,  8.63s/it, gpt_loss=0.258, loss_mean=0.296][A
+Train step of epoch 0:  86%|████████▌ | 5546/6434 [13:00:37<2:07:43,  8.63s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5547/6434 [13:00:37<2:10:14,  8.81s/it, gpt_loss=0.274, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▌ | 5547/6434 [13:00:44<2:10:14,  8.81s/it, gpt_loss=0.262, loss_mean=0.29] [A
+Train step of epoch 0:  86%|████████▌ | 5548/6434 [13:00:44<2:05:22,  8.49s/it, gpt_loss=0.262, loss_mean=0.29][A
+Train step of epoch 0:  86%|████████▌ | 5548/6434 [13:00:53<2:05:22,  8.49s/it, gpt_loss=0.417, loss_mean=0.303][A
+Train step of epoch 0:  86%|████████▌ | 5549/6434 [13:00:53<2:06:11,  8.56s/it, gpt_loss=0.417, loss_mean=0.303][A
+[LID Router Debug] Step: 5550
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [9, 2, 3, 5, 0, 9, 5, 9, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  86%|████████▌ | 5549/6434 [13:01:01<2:06:11,  8.56s/it, gpt_loss=0.363, loss_mean=0.309][A
+Train step of epoch 0:  86%|████████▋ | 5550/6434 [13:01:01<2:02:58,  8.35s/it, gpt_loss=0.363, loss_mean=0.309][A
+Train step of epoch 0:  86%|████████▋ | 5550/6434 [13:01:08<2:02:58,  8.35s/it, gpt_loss=0.258, loss_mean=0.304][A
+Train step of epoch 0:  86%|████████▋ | 5551/6434 [13:01:08<1:58:02,  8.02s/it, gpt_loss=0.258, loss_mean=0.304][A
+Train step of epoch 0:  86%|████████▋ | 5551/6434 [13:01:16<1:58:02,  8.02s/it, gpt_loss=0.269, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▋ | 5552/6434 [13:01:16<1:58:15,  8.05s/it, gpt_loss=0.269, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▋ | 5552/6434 [13:01:25<1:58:15,  8.05s/it, gpt_loss=0.282, loss_mean=0.298][A
+Train step of epoch 0:  86%|████████▋ | 5553/6434 [13:01:25<1:59:26,  8.13s/it, gpt_loss=0.282, loss_mean=0.298][A
+Train step of epoch 0:  86%|████████▋ | 5553/6434 [13:01:33<1:59:26,  8.13s/it, gpt_loss=0.3, loss_mean=0.299]  [A
+Train step of epoch 0:  86%|████████▋ | 5554/6434 [13:01:33<2:02:01,  8.32s/it, gpt_loss=0.3, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▋ | 5554/6434 [13:01:44<2:02:01,  8.32s/it, gpt_loss=0.238, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▋ | 5555/6434 [13:01:44<2:10:48,  8.93s/it, gpt_loss=0.238, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▋ | 5555/6434 [13:01:52<2:10:48,  8.93s/it, gpt_loss=0.318, loss_mean=0.295][A
+Train step of epoch 0:  86%|████████▋ | 5556/6434 [13:01:52<2:05:08,  8.55s/it, gpt_loss=0.318, loss_mean=0.295][A
+Train step of epoch 0:  86%|████████▋ | 5556/6434 [13:02:01<2:05:08,  8.55s/it, gpt_loss=0.335, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▋ | 5557/6434 [13:02:01<2:10:43,  8.94s/it, gpt_loss=0.335, loss_mean=0.299][A
+Train step of epoch 0:  86%|████████▋ | 5557/6434 [13:02:09<2:10:43,  8.94s/it, gpt_loss=0.341, loss_mean=0.303][A
+Train step of epoch 0:  86%|████████▋ | 5558/6434 [13:02:09<2:06:34,  8.67s/it, gpt_loss=0.341, loss_mean=0.303][A
+Train step of epoch 0:  86%|████████▋ | 5558/6434 [13:02:19<2:06:34,  8.67s/it, gpt_loss=0.314, loss_mean=0.304][A
+Train step of epoch 0:  86%|████████▋ | 5559/6434 [13:02:19<2:11:09,  8.99s/it, gpt_loss=0.314, loss_mean=0.304][A
+[LID Router Debug] Step: 5560
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [5, 9, 5, 9, 9, 4, 2, 2, 3, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  86%|████████▋ | 5559/6434 [13:02:27<2:11:09,  8.99s/it, gpt_loss=0.343, loss_mean=0.308][A
+Train step of epoch 0:  86%|████████▋ | 5560/6434 [13:02:27<2:07:19,  8.74s/it, gpt_loss=0.343, loss_mean=0.308][A
+Train step of epoch 0:  86%|████████▋ | 5560/6434 [13:02:37<2:07:19,  8.74s/it, gpt_loss=0.263, loss_mean=0.304][A
+Train step of epoch 0:  86%|████████▋ | 5561/6434 [13:02:37<2:09:59,  8.93s/it, gpt_loss=0.263, loss_mean=0.304][A
+Train step of epoch 0:  86%|████████▋ | 5561/6434 [13:02:46<2:09:59,  8.93s/it, gpt_loss=0.267, loss_mean=0.3]  [A
+Train step of epoch 0:  86%|████████▋ | 5562/6434 [13:02:46<2:12:01,  9.08s/it, gpt_loss=0.267, loss_mean=0.3][A
+Train step of epoch 0:  86%|████████▋ | 5562/6434 [13:02:56<2:12:01,  9.08s/it, gpt_loss=0.243, loss_mean=0.294][A
+Train step of epoch 0:  86%|████████▋ | 5563/6434 [13:02:56<2:15:29,  9.33s/it, gpt_loss=0.243, loss_mean=0.294][A
+Train step of epoch 0:  86%|████████▋ | 5563/6434 [13:03:06<2:15:29,  9.33s/it, gpt_loss=0.28, loss_mean=0.293] [A
+Train step of epoch 0:  86%|████████▋ | 5564/6434 [13:03:06<2:17:09,  9.46s/it, gpt_loss=0.28, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▋ | 5564/6434 [13:03:14<2:17:09,  9.46s/it, gpt_loss=0.299, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▋ | 5565/6434 [13:03:14<2:12:59,  9.18s/it, gpt_loss=0.299, loss_mean=0.293][A
+Train step of epoch 0:  86%|████████▋ | 5565/6434 [13:03:22<2:12:59,  9.18s/it, gpt_loss=0.245, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5566/6434 [13:03:22<2:08:22,  8.87s/it, gpt_loss=0.245, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5566/6434 [13:03:30<2:08:22,  8.87s/it, gpt_loss=0.25, loss_mean=0.285] [A
+Train step of epoch 0:  87%|████████▋ | 5567/6434 [13:03:30<2:02:21,  8.47s/it, gpt_loss=0.25, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5567/6434 [13:03:39<2:02:21,  8.47s/it, gpt_loss=0.354, loss_mean=0.292][A
+Train step of epoch 0:  87%|████████▋ | 5568/6434 [13:03:39<2:03:37,  8.57s/it, gpt_loss=0.354, loss_mean=0.292][A
+Train step of epoch 0:  87%|████████▋ | 5568/6434 [13:03:48<2:03:37,  8.57s/it, gpt_loss=0.341, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5569/6434 [13:03:48<2:05:49,  8.73s/it, gpt_loss=0.341, loss_mean=0.297][A
+[LID Router Debug] Step: 5570
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [2, 2, 5, 3, 6, 2, 0, 5, 2, 0]
+Active Experts in Batch: {0, 2, 3, 5, 6}
+
+Train step of epoch 0:  87%|████████▋ | 5569/6434 [13:03:56<2:05:49,  8.73s/it, gpt_loss=0.242, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5570/6434 [13:03:56<2:05:05,  8.69s/it, gpt_loss=0.242, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5570/6434 [13:04:05<2:05:05,  8.69s/it, gpt_loss=0.377, loss_mean=0.3]  [A
+Train step of epoch 0:  87%|████████▋ | 5571/6434 [13:04:05<2:04:57,  8.69s/it, gpt_loss=0.377, loss_mean=0.3][A
+Train step of epoch 0:  87%|████████▋ | 5571/6434 [13:04:14<2:04:57,  8.69s/it, gpt_loss=0.27, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5572/6434 [13:04:14<2:04:35,  8.67s/it, gpt_loss=0.27, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5572/6434 [13:04:22<2:04:35,  8.67s/it, gpt_loss=0.219, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5573/6434 [13:04:22<2:02:49,  8.56s/it, gpt_loss=0.219, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5573/6434 [13:04:30<2:02:49,  8.56s/it, gpt_loss=0.267, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5574/6434 [13:04:30<2:01:26,  8.47s/it, gpt_loss=0.267, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5574/6434 [13:04:39<2:01:26,  8.47s/it, gpt_loss=0.322, loss_mean=0.29] [A
+Train step of epoch 0:  87%|████████▋ | 5575/6434 [13:04:39<2:02:05,  8.53s/it, gpt_loss=0.322, loss_mean=0.29][A
+Train step of epoch 0:  87%|████████▋ | 5575/6434 [13:04:48<2:02:05,  8.53s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  87%|████████▋ | 5576/6434 [13:04:48<2:01:50,  8.52s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  87%|████████▋ | 5576/6434 [13:04:56<2:01:50,  8.52s/it, gpt_loss=0.287, loss_mean=0.288][A
+Train step of epoch 0:  87%|████████▋ | 5577/6434 [13:04:56<2:03:27,  8.64s/it, gpt_loss=0.287, loss_mean=0.288][A
+Train step of epoch 0:  87%|████████▋ | 5577/6434 [13:05:05<2:03:27,  8.64s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5578/6434 [13:05:05<2:03:33,  8.66s/it, gpt_loss=0.255, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5578/6434 [13:05:13<2:03:33,  8.66s/it, gpt_loss=0.237, loss_mean=0.28] [A
+Train step of epoch 0:  87%|████████▋ | 5579/6434 [13:05:13<1:58:45,  8.33s/it, gpt_loss=0.237, loss_mean=0.28][A
+[LID Router Debug] Step: 5580
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [6, 4, 2, 9, 3, 5, 2, 5, 5, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  87%|████████▋ | 5579/6434 [13:05:20<1:58:45,  8.33s/it, gpt_loss=0.333, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5580/6434 [13:05:20<1:56:05,  8.16s/it, gpt_loss=0.333, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5580/6434 [13:05:29<1:56:05,  8.16s/it, gpt_loss=0.286, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5581/6434 [13:05:29<1:58:56,  8.37s/it, gpt_loss=0.286, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5581/6434 [13:05:39<1:58:56,  8.37s/it, gpt_loss=0.304, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5582/6434 [13:05:39<2:03:06,  8.67s/it, gpt_loss=0.304, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5582/6434 [13:05:47<2:03:06,  8.67s/it, gpt_loss=0.284, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5583/6434 [13:05:47<1:59:35,  8.43s/it, gpt_loss=0.284, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5583/6434 [13:05:55<1:59:35,  8.43s/it, gpt_loss=0.355, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5584/6434 [13:05:55<1:57:26,  8.29s/it, gpt_loss=0.355, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5584/6434 [13:06:04<1:57:26,  8.29s/it, gpt_loss=0.366, loss_mean=0.301][A
+Train step of epoch 0:  87%|████████▋ | 5585/6434 [13:06:04<2:01:04,  8.56s/it, gpt_loss=0.366, loss_mean=0.301][A
+Train step of epoch 0:  87%|████████▋ | 5585/6434 [13:06:13<2:01:04,  8.56s/it, gpt_loss=0.274, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5586/6434 [13:06:13<2:03:02,  8.71s/it, gpt_loss=0.274, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5586/6434 [13:06:22<2:03:02,  8.71s/it, gpt_loss=0.316, loss_mean=0.3]  [A
+Train step of epoch 0:  87%|████████▋ | 5587/6434 [13:06:22<2:05:23,  8.88s/it, gpt_loss=0.316, loss_mean=0.3][A
+Train step of epoch 0:  87%|████████▋ | 5587/6434 [13:06:30<2:05:23,  8.88s/it, gpt_loss=0.281, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5588/6434 [13:06:30<2:01:15,  8.60s/it, gpt_loss=0.281, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5588/6434 [13:06:38<2:01:15,  8.60s/it, gpt_loss=0.255, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5589/6434 [13:06:38<1:57:57,  8.38s/it, gpt_loss=0.255, loss_mean=0.294][A
+[LID Router Debug] Step: 5590
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [9, 0, 0, 0, 3, 2, 0, 4, 5, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  87%|████████▋ | 5589/6434 [13:06:45<1:57:57,  8.38s/it, gpt_loss=0.284, loss_mean=0.293][A
+Train step of epoch 0:  87%|████████▋ | 5590/6434 [13:06:45<1:54:16,  8.12s/it, gpt_loss=0.284, loss_mean=0.293][A
+Train step of epoch 0:  87%|████████▋ | 5590/6434 [13:06:53<1:54:16,  8.12s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  87%|████████▋ | 5591/6434 [13:06:53<1:51:18,  7.92s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  87%|████████▋ | 5591/6434 [13:07:02<1:51:18,  7.92s/it, gpt_loss=0.267, loss_mean=0.29] [A
+Train step of epoch 0:  87%|████████▋ | 5592/6434 [13:07:02<1:56:19,  8.29s/it, gpt_loss=0.267, loss_mean=0.29][A
+Train step of epoch 0:  87%|████████▋ | 5592/6434 [13:07:10<1:56:19,  8.29s/it, gpt_loss=0.284, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5593/6434 [13:07:10<1:55:57,  8.27s/it, gpt_loss=0.284, loss_mean=0.289][A
+Train step of epoch 0:  87%|████████▋ | 5593/6434 [13:07:18<1:55:57,  8.27s/it, gpt_loss=0.257, loss_mean=0.286][A
+Train step of epoch 0:  87%|████████▋ | 5594/6434 [13:07:18<1:55:18,  8.24s/it, gpt_loss=0.257, loss_mean=0.286][A
+Train step of epoch 0:  87%|████████▋ | 5594/6434 [13:07:26<1:55:18,  8.24s/it, gpt_loss=0.373, loss_mean=0.295][A
+Train step of epoch 0:  87%|████████▋ | 5595/6434 [13:07:26<1:54:34,  8.19s/it, gpt_loss=0.373, loss_mean=0.295][A
+Train step of epoch 0:  87%|████████▋ | 5595/6434 [13:07:35<1:54:34,  8.19s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  87%|████████▋ | 5596/6434 [13:07:35<1:54:32,  8.20s/it, gpt_loss=0.309, loss_mean=0.296][A
+Train step of epoch 0:  87%|████████▋ | 5596/6434 [13:07:44<1:54:32,  8.20s/it, gpt_loss=0.312, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5597/6434 [13:07:44<1:57:26,  8.42s/it, gpt_loss=0.312, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5597/6434 [13:07:52<1:57:26,  8.42s/it, gpt_loss=0.236, loss_mean=0.292][A
+Train step of epoch 0:  87%|████████▋ | 5598/6434 [13:07:52<1:56:20,  8.35s/it, gpt_loss=0.236, loss_mean=0.292][A
+Train step of epoch 0:  87%|████████▋ | 5598/6434 [13:08:00<1:56:20,  8.35s/it, gpt_loss=0.257, loss_mean=0.288][A
+Train step of epoch 0:  87%|████████▋ | 5599/6434 [13:08:00<1:57:19,  8.43s/it, gpt_loss=0.257, loss_mean=0.288][A
+[LID Router Debug] Step: 5600
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [6, 9, 6, 0, 3, 3, 0, 3, 4, 9]
+Active Experts in Batch: {0, 3, 4, 6, 9}
+[2026-02-07 05:04:13,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=0, lr=[1.6303335168965484e-05, 1.6303335168965484e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 05:04:13,442] [INFO] [timer.py:260:stop] epoch=0/micro_step=5600/global_step=2800, RunningAvgSamplesPerSec=4.746820647112967, CurrSamplesPerSec=4.6713356590310235, MemAllocated=12.63GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  87%|████████▋ | 5599/6434 [13:08:09<1:57:19,  8.43s/it, gpt_loss=0.349, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5600/6434 [13:08:09<1:57:38,  8.46s/it, gpt_loss=0.349, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5600/6434 [13:08:17<1:57:38,  8.46s/it, gpt_loss=0.29, loss_mean=0.294] [A
+Train step of epoch 0:  87%|████████▋ | 5601/6434 [13:08:17<1:55:45,  8.34s/it, gpt_loss=0.29, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5601/6434 [13:08:25<1:55:45,  8.34s/it, gpt_loss=0.337, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5602/6434 [13:08:25<1:54:47,  8.28s/it, gpt_loss=0.337, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5602/6434 [13:08:33<1:54:47,  8.28s/it, gpt_loss=0.256, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5603/6434 [13:08:33<1:54:26,  8.26s/it, gpt_loss=0.256, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5603/6434 [13:08:42<1:54:26,  8.26s/it, gpt_loss=0.367, loss_mean=0.301][A
+Train step of epoch 0:  87%|████████▋ | 5604/6434 [13:08:42<1:54:57,  8.31s/it, gpt_loss=0.367, loss_mean=0.301][A
+Train step of epoch 0:  87%|████████▋ | 5604/6434 [13:08:50<1:54:57,  8.31s/it, gpt_loss=0.279, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5605/6434 [13:08:50<1:52:47,  8.16s/it, gpt_loss=0.279, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5605/6434 [13:08:58<1:52:47,  8.16s/it, gpt_loss=0.284, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5606/6434 [13:08:58<1:53:53,  8.25s/it, gpt_loss=0.284, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5606/6434 [13:09:06<1:53:53,  8.25s/it, gpt_loss=0.294, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5607/6434 [13:09:06<1:52:09,  8.14s/it, gpt_loss=0.294, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5607/6434 [13:09:15<1:52:09,  8.14s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5608/6434 [13:09:15<1:57:01,  8.50s/it, gpt_loss=0.267, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5608/6434 [13:09:24<1:57:01,  8.50s/it, gpt_loss=0.2, loss_mean=0.285]  [A
+Train step of epoch 0:  87%|████████▋ | 5609/6434 [13:09:24<1:58:43,  8.63s/it, gpt_loss=0.2, loss_mean=0.285][A
+[LID Router Debug] Step: 5610
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [1, 2, 3, 5, 5, 9, 5, 9, 2, 5]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+Train step of epoch 0:  87%|████████▋ | 5609/6434 [13:09:32<1:58:43,  8.63s/it, gpt_loss=0.351, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5610/6434 [13:09:32<1:56:43,  8.50s/it, gpt_loss=0.351, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5610/6434 [13:09:40<1:56:43,  8.50s/it, gpt_loss=0.314, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5611/6434 [13:09:40<1:53:24,  8.27s/it, gpt_loss=0.314, loss_mean=0.294][A
+Train step of epoch 0:  87%|████████▋ | 5611/6434 [13:09:49<1:53:24,  8.27s/it, gpt_loss=0.349, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5612/6434 [13:09:49<1:56:11,  8.48s/it, gpt_loss=0.349, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5612/6434 [13:09:58<1:56:11,  8.48s/it, gpt_loss=0.277, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5613/6434 [13:09:58<1:56:26,  8.51s/it, gpt_loss=0.277, loss_mean=0.297][A
+Train step of epoch 0:  87%|████████▋ | 5613/6434 [13:10:06<1:56:26,  8.51s/it, gpt_loss=0.311, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5614/6434 [13:10:06<1:54:11,  8.36s/it, gpt_loss=0.311, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5614/6434 [13:10:14<1:54:11,  8.36s/it, gpt_loss=0.278, loss_mean=0.296][A
+Train step of epoch 0:  87%|████████▋ | 5615/6434 [13:10:14<1:54:05,  8.36s/it, gpt_loss=0.278, loss_mean=0.296][A
+Train step of epoch 0:  87%|████████▋ | 5615/6434 [13:10:23<1:54:05,  8.36s/it, gpt_loss=0.232, loss_mean=0.29] [A
+Train step of epoch 0:  87%|████████▋ | 5616/6434 [13:10:23<1:54:21,  8.39s/it, gpt_loss=0.232, loss_mean=0.29][A
+Train step of epoch 0:  87%|████████▋ | 5616/6434 [13:10:30<1:54:21,  8.39s/it, gpt_loss=0.372, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5617/6434 [13:10:30<1:52:20,  8.25s/it, gpt_loss=0.372, loss_mean=0.298][A
+Train step of epoch 0:  87%|████████▋ | 5617/6434 [13:10:38<1:52:20,  8.25s/it, gpt_loss=0.302, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5618/6434 [13:10:38<1:49:46,  8.07s/it, gpt_loss=0.302, loss_mean=0.299][A
+Train step of epoch 0:  87%|████████▋ | 5618/6434 [13:10:48<1:49:46,  8.07s/it, gpt_loss=0.212, loss_mean=0.29] [A
+Train step of epoch 0:  87%|████████▋ | 5619/6434 [13:10:48<1:55:50,  8.53s/it, gpt_loss=0.212, loss_mean=0.29][A
+[LID Router Debug] Step: 5620
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [2, 2, 4, 4, 3, 4, 3, 9, 5, 4]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  87%|████████▋ | 5619/6434 [13:10:56<1:55:50,  8.53s/it, gpt_loss=0.257, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5620/6434 [13:10:56<1:52:43,  8.31s/it, gpt_loss=0.257, loss_mean=0.287][A
+Train step of epoch 0:  87%|████████▋ | 5620/6434 [13:11:04<1:52:43,  8.31s/it, gpt_loss=0.192, loss_mean=0.277][A
+Train step of epoch 0:  87%|████████▋ | 5621/6434 [13:11:04<1:51:54,  8.26s/it, gpt_loss=0.192, loss_mean=0.277][A
+Train step of epoch 0:  87%|████████▋ | 5621/6434 [13:11:13<1:51:54,  8.26s/it, gpt_loss=0.284, loss_mean=0.278][A
+Train step of epoch 0:  87%|████████▋ | 5622/6434 [13:11:13<1:55:37,  8.54s/it, gpt_loss=0.284, loss_mean=0.278][A
+Train step of epoch 0:  87%|████████▋ | 5622/6434 [13:11:23<1:55:37,  8.54s/it, gpt_loss=0.246, loss_mean=0.275][A
+Train step of epoch 0:  87%|████████▋ | 5623/6434 [13:11:23<2:00:11,  8.89s/it, gpt_loss=0.246, loss_mean=0.275][A
+Train step of epoch 0:  87%|████████▋ | 5623/6434 [13:11:31<2:00:11,  8.89s/it, gpt_loss=0.353, loss_mean=0.282][A
+Train step of epoch 0:  87%|████████▋ | 5624/6434 [13:11:31<1:59:00,  8.82s/it, gpt_loss=0.353, loss_mean=0.282][A
+Train step of epoch 0:  87%|████████▋ | 5624/6434 [13:11:40<1:59:00,  8.82s/it, gpt_loss=0.266, loss_mean=0.281][A
+Train step of epoch 0:  87%|████████▋ | 5625/6434 [13:11:40<1:56:58,  8.68s/it, gpt_loss=0.266, loss_mean=0.281][A
+Train step of epoch 0:  87%|████████▋ | 5625/6434 [13:11:47<1:56:58,  8.68s/it, gpt_loss=0.326, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5626/6434 [13:11:47<1:52:45,  8.37s/it, gpt_loss=0.326, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5626/6434 [13:11:55<1:52:45,  8.37s/it, gpt_loss=0.285, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5627/6434 [13:11:55<1:51:56,  8.32s/it, gpt_loss=0.285, loss_mean=0.285][A
+Train step of epoch 0:  87%|████████▋ | 5627/6434 [13:12:04<1:51:56,  8.32s/it, gpt_loss=0.344, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5628/6434 [13:12:04<1:54:05,  8.49s/it, gpt_loss=0.344, loss_mean=0.291][A
+Train step of epoch 0:  87%|████████▋ | 5628/6434 [13:12:12<1:54:05,  8.49s/it, gpt_loss=0.302, loss_mean=0.292][A
+Train step of epoch 0:  87%|████████▋ | 5629/6434 [13:12:12<1:49:16,  8.14s/it, gpt_loss=0.302, loss_mean=0.292][A
+[LID Router Debug] Step: 5630
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 4, 5, 0, 4, 1, 9, 0, 3, 4]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  87%|████████▋ | 5629/6434 [13:12:19<1:49:16,  8.14s/it, gpt_loss=0.25, loss_mean=0.288] [A
+Train step of epoch 0:  88%|████████▊ | 5630/6434 [13:12:19<1:47:30,  8.02s/it, gpt_loss=0.25, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5630/6434 [13:12:28<1:47:30,  8.02s/it, gpt_loss=0.243, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5631/6434 [13:12:28<1:48:30,  8.11s/it, gpt_loss=0.243, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5631/6434 [13:12:36<1:48:30,  8.11s/it, gpt_loss=0.241, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5632/6434 [13:12:36<1:49:32,  8.20s/it, gpt_loss=0.241, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5632/6434 [13:12:45<1:49:32,  8.20s/it, gpt_loss=0.284, loss_mean=0.28] [A
+Train step of epoch 0:  88%|████████▊ | 5633/6434 [13:12:45<1:52:44,  8.45s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5633/6434 [13:12:53<1:52:44,  8.45s/it, gpt_loss=0.44, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5634/6434 [13:12:53<1:50:14,  8.27s/it, gpt_loss=0.44, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5634/6434 [13:13:01<1:50:14,  8.27s/it, gpt_loss=0.254, loss_mean=0.292][A
+Train step of epoch 0:  88%|████████▊ | 5635/6434 [13:13:01<1:49:00,  8.19s/it, gpt_loss=0.254, loss_mean=0.292][A
+Train step of epoch 0:  88%|████████▊ | 5635/6434 [13:13:11<1:49:00,  8.19s/it, gpt_loss=0.315, loss_mean=0.294][A
+Train step of epoch 0:  88%|████████▊ | 5636/6434 [13:13:11<1:55:52,  8.71s/it, gpt_loss=0.315, loss_mean=0.294][A
+Train step of epoch 0:  88%|████████▊ | 5636/6434 [13:13:20<1:55:52,  8.71s/it, gpt_loss=0.316, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5637/6434 [13:13:20<1:57:20,  8.83s/it, gpt_loss=0.316, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5637/6434 [13:13:28<1:57:20,  8.83s/it, gpt_loss=0.319, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5638/6434 [13:13:28<1:54:11,  8.61s/it, gpt_loss=0.319, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5638/6434 [13:13:36<1:54:11,  8.61s/it, gpt_loss=0.31, loss_mean=0.3]   [A
+Train step of epoch 0:  88%|████████▊ | 5639/6434 [13:13:36<1:49:59,  8.30s/it, gpt_loss=0.31, loss_mean=0.3][A
+[LID Router Debug] Step: 5640
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [0, 2, 5, 4, 3, 2, 5, 9, 5, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5639/6434 [13:13:44<1:49:59,  8.30s/it, gpt_loss=0.295, loss_mean=0.299][A
+Train step of epoch 0:  88%|████████▊ | 5640/6434 [13:13:44<1:51:49,  8.45s/it, gpt_loss=0.295, loss_mean=0.299][A
+Train step of epoch 0:  88%|████████▊ | 5640/6434 [13:13:53<1:51:49,  8.45s/it, gpt_loss=0.324, loss_mean=0.302][A
+Train step of epoch 0:  88%|████████▊ | 5641/6434 [13:13:53<1:51:06,  8.41s/it, gpt_loss=0.324, loss_mean=0.302][A
+Train step of epoch 0:  88%|████████▊ | 5641/6434 [13:14:01<1:51:06,  8.41s/it, gpt_loss=0.312, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5642/6434 [13:14:01<1:48:49,  8.24s/it, gpt_loss=0.312, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5642/6434 [13:14:08<1:48:49,  8.24s/it, gpt_loss=0.254, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5643/6434 [13:14:08<1:46:28,  8.08s/it, gpt_loss=0.254, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5643/6434 [13:14:16<1:46:28,  8.08s/it, gpt_loss=0.247, loss_mean=0.293][A
+Train step of epoch 0:  88%|████████▊ | 5644/6434 [13:14:16<1:46:36,  8.10s/it, gpt_loss=0.247, loss_mean=0.293][A
+Train step of epoch 0:  88%|████████▊ | 5644/6434 [13:14:24<1:46:36,  8.10s/it, gpt_loss=0.26, loss_mean=0.289] [A
+Train step of epoch 0:  88%|████████▊ | 5645/6434 [13:14:24<1:46:06,  8.07s/it, gpt_loss=0.26, loss_mean=0.289][A
+Train step of epoch 0:  88%|████████▊ | 5645/6434 [13:14:32<1:46:06,  8.07s/it, gpt_loss=0.42, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5646/6434 [13:14:32<1:42:48,  7.83s/it, gpt_loss=0.42, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5646/6434 [13:14:39<1:42:48,  7.83s/it, gpt_loss=0.252, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5647/6434 [13:14:39<1:41:12,  7.72s/it, gpt_loss=0.252, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5647/6434 [13:14:47<1:41:12,  7.72s/it, gpt_loss=0.213, loss_mean=0.289][A
+Train step of epoch 0:  88%|████████▊ | 5648/6434 [13:14:47<1:40:18,  7.66s/it, gpt_loss=0.213, loss_mean=0.289][A
+Train step of epoch 0:  88%|████████▊ | 5648/6434 [13:14:55<1:40:18,  7.66s/it, gpt_loss=0.275, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5649/6434 [13:14:55<1:40:36,  7.69s/it, gpt_loss=0.275, loss_mean=0.288][A
+[LID Router Debug] Step: 5650
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [1, 4, 9, 4, 0, 0, 0, 3, 9, 4]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5649/6434 [13:15:04<1:40:36,  7.69s/it, gpt_loss=0.26, loss_mean=0.285] [A
+Train step of epoch 0:  88%|████████▊ | 5650/6434 [13:15:04<1:45:40,  8.09s/it, gpt_loss=0.26, loss_mean=0.285][A
+Train step of epoch 0:  88%|████████▊ | 5650/6434 [13:15:13<1:45:40,  8.09s/it, gpt_loss=0.248, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5651/6434 [13:15:13<1:52:22,  8.61s/it, gpt_loss=0.248, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5651/6434 [13:15:21<1:52:22,  8.61s/it, gpt_loss=0.345, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5652/6434 [13:15:21<1:50:19,  8.47s/it, gpt_loss=0.345, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5652/6434 [13:15:30<1:50:19,  8.47s/it, gpt_loss=0.271, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5653/6434 [13:15:30<1:49:10,  8.39s/it, gpt_loss=0.271, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5653/6434 [13:15:37<1:49:10,  8.39s/it, gpt_loss=0.298, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5654/6434 [13:15:37<1:46:31,  8.19s/it, gpt_loss=0.298, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5654/6434 [13:15:46<1:46:31,  8.19s/it, gpt_loss=0.275, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5655/6434 [13:15:46<1:47:24,  8.27s/it, gpt_loss=0.275, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5655/6434 [13:15:55<1:47:24,  8.27s/it, gpt_loss=0.325, loss_mean=0.29] [A
+Train step of epoch 0:  88%|████████▊ | 5656/6434 [13:15:55<1:50:17,  8.51s/it, gpt_loss=0.325, loss_mean=0.29][A
+Train step of epoch 0:  88%|████████▊ | 5656/6434 [13:16:04<1:50:17,  8.51s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  88%|████████▊ | 5657/6434 [13:16:04<1:53:28,  8.76s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  88%|████████▊ | 5657/6434 [13:16:13<1:53:28,  8.76s/it, gpt_loss=0.243, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5658/6434 [13:16:13<1:51:52,  8.65s/it, gpt_loss=0.243, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5658/6434 [13:16:21<1:51:52,  8.65s/it, gpt_loss=0.257, loss_mean=0.277][A
+Train step of epoch 0:  88%|████████▊ | 5659/6434 [13:16:21<1:49:37,  8.49s/it, gpt_loss=0.257, loss_mean=0.277][A
+[LID Router Debug] Step: 5660
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [9, 5, 3, 1, 3, 1, 3, 9, 0, 1]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5659/6434 [13:16:30<1:49:37,  8.49s/it, gpt_loss=0.303, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5660/6434 [13:16:30<1:52:36,  8.73s/it, gpt_loss=0.303, loss_mean=0.279][A
+Train step of epoch 0:  88%|████████▊ | 5660/6434 [13:16:39<1:52:36,  8.73s/it, gpt_loss=0.284, loss_mean=0.28] [A
+Train step of epoch 0:  88%|████████▊ | 5661/6434 [13:16:39<1:52:52,  8.76s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5661/6434 [13:16:47<1:52:52,  8.76s/it, gpt_loss=0.321, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5662/6434 [13:16:47<1:48:48,  8.46s/it, gpt_loss=0.321, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5662/6434 [13:16:55<1:48:48,  8.46s/it, gpt_loss=0.228, loss_mean=0.278][A
+Train step of epoch 0:  88%|████████▊ | 5663/6434 [13:16:55<1:48:42,  8.46s/it, gpt_loss=0.228, loss_mean=0.278][A
+Train step of epoch 0:  88%|████████▊ | 5663/6434 [13:17:06<1:48:42,  8.46s/it, gpt_loss=0.294, loss_mean=0.28] [A
+Train step of epoch 0:  88%|████████▊ | 5664/6434 [13:17:06<1:57:23,  9.15s/it, gpt_loss=0.294, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5664/6434 [13:17:14<1:57:23,  9.15s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5665/6434 [13:17:14<1:52:14,  8.76s/it, gpt_loss=0.284, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5665/6434 [13:17:21<1:52:14,  8.76s/it, gpt_loss=0.274, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5666/6434 [13:17:21<1:45:29,  8.24s/it, gpt_loss=0.274, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5666/6434 [13:17:29<1:45:29,  8.24s/it, gpt_loss=0.36, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5667/6434 [13:17:29<1:47:09,  8.38s/it, gpt_loss=0.36, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5667/6434 [13:17:38<1:47:09,  8.38s/it, gpt_loss=0.324, loss_mean=0.291][A
+Train step of epoch 0:  88%|████████▊ | 5668/6434 [13:17:38<1:45:43,  8.28s/it, gpt_loss=0.324, loss_mean=0.291][A
+Train step of epoch 0:  88%|████████▊ | 5668/6434 [13:17:46<1:45:43,  8.28s/it, gpt_loss=0.234, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5669/6434 [13:17:46<1:44:56,  8.23s/it, gpt_loss=0.234, loss_mean=0.286][A
+[LID Router Debug] Step: 5670
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [1, 5, 9, 0, 1, 9, 3, 2, 4, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5669/6434 [13:17:54<1:44:56,  8.23s/it, gpt_loss=0.294, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5670/6434 [13:17:54<1:45:54,  8.32s/it, gpt_loss=0.294, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5670/6434 [13:18:03<1:45:54,  8.32s/it, gpt_loss=0.328, loss_mean=0.291][A
+Train step of epoch 0:  88%|████████▊ | 5671/6434 [13:18:03<1:46:08,  8.35s/it, gpt_loss=0.328, loss_mean=0.291][A
+Train step of epoch 0:  88%|████████▊ | 5671/6434 [13:18:10<1:46:08,  8.35s/it, gpt_loss=0.288, loss_mean=0.29] [A
+Train step of epoch 0:  88%|████████▊ | 5672/6434 [13:18:10<1:44:14,  8.21s/it, gpt_loss=0.288, loss_mean=0.29][A
+Train step of epoch 0:  88%|████████▊ | 5672/6434 [13:18:21<1:44:14,  8.21s/it, gpt_loss=0.367, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5673/6434 [13:18:21<1:54:28,  9.03s/it, gpt_loss=0.367, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5673/6434 [13:18:30<1:54:28,  9.03s/it, gpt_loss=0.291, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5674/6434 [13:18:30<1:52:19,  8.87s/it, gpt_loss=0.291, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5674/6434 [13:18:39<1:52:19,  8.87s/it, gpt_loss=0.335, loss_mean=0.301][A
+Train step of epoch 0:  88%|████████▊ | 5675/6434 [13:18:39<1:54:21,  9.04s/it, gpt_loss=0.335, loss_mean=0.301][A
+Train step of epoch 0:  88%|████████▊ | 5675/6434 [13:18:48<1:54:21,  9.04s/it, gpt_loss=0.322, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5676/6434 [13:18:48<1:54:15,  9.04s/it, gpt_loss=0.322, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5676/6434 [13:18:57<1:54:15,  9.04s/it, gpt_loss=0.288, loss_mean=0.302][A
+Train step of epoch 0:  88%|████████▊ | 5677/6434 [13:18:57<1:53:20,  8.98s/it, gpt_loss=0.288, loss_mean=0.302][A
+Train step of epoch 0:  88%|████████▊ | 5677/6434 [13:19:06<1:53:20,  8.98s/it, gpt_loss=0.311, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5678/6434 [13:19:06<1:51:28,  8.85s/it, gpt_loss=0.311, loss_mean=0.303][A
+Train step of epoch 0:  88%|████████▊ | 5678/6434 [13:19:15<1:51:28,  8.85s/it, gpt_loss=0.261, loss_mean=0.298][A
+Train step of epoch 0:  88%|████████▊ | 5679/6434 [13:19:15<1:51:32,  8.86s/it, gpt_loss=0.261, loss_mean=0.298][A
+[LID Router Debug] Step: 5680
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [1, 9, 2, 9, 3, 2, 5, 4, 1, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5679/6434 [13:19:23<1:51:32,  8.86s/it, gpt_loss=0.286, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5680/6434 [13:19:23<1:49:53,  8.75s/it, gpt_loss=0.286, loss_mean=0.297][A
+Train step of epoch 0:  88%|████████▊ | 5680/6434 [13:19:31<1:49:53,  8.75s/it, gpt_loss=0.245, loss_mean=0.292][A
+Train step of epoch 0:  88%|████████▊ | 5681/6434 [13:19:31<1:45:36,  8.41s/it, gpt_loss=0.245, loss_mean=0.292][A
+Train step of epoch 0:  88%|████████▊ | 5681/6434 [13:19:40<1:45:36,  8.41s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5682/6434 [13:19:40<1:48:05,  8.62s/it, gpt_loss=0.253, loss_mean=0.288][A
+Train step of epoch 0:  88%|████████▊ | 5682/6434 [13:19:48<1:48:05,  8.62s/it, gpt_loss=0.266, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5683/6434 [13:19:48<1:45:02,  8.39s/it, gpt_loss=0.266, loss_mean=0.286][A
+Train step of epoch 0:  88%|████████▊ | 5683/6434 [13:19:57<1:45:02,  8.39s/it, gpt_loss=0.382, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5684/6434 [13:19:57<1:47:13,  8.58s/it, gpt_loss=0.382, loss_mean=0.296][A
+Train step of epoch 0:  88%|████████▊ | 5684/6434 [13:20:06<1:47:13,  8.58s/it, gpt_loss=0.215, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5685/6434 [13:20:06<1:51:01,  8.89s/it, gpt_loss=0.215, loss_mean=0.287][A
+Train step of epoch 0:  88%|████████▊ | 5685/6434 [13:20:15<1:51:01,  8.89s/it, gpt_loss=0.254, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5686/6434 [13:20:15<1:48:40,  8.72s/it, gpt_loss=0.254, loss_mean=0.284][A
+Train step of epoch 0:  88%|████████▊ | 5686/6434 [13:20:24<1:48:40,  8.72s/it, gpt_loss=0.255, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5687/6434 [13:20:24<1:51:50,  8.98s/it, gpt_loss=0.255, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5687/6434 [13:20:33<1:51:50,  8.98s/it, gpt_loss=0.271, loss_mean=0.28] [A
+Train step of epoch 0:  88%|████████▊ | 5688/6434 [13:20:33<1:49:19,  8.79s/it, gpt_loss=0.271, loss_mean=0.28][A
+Train step of epoch 0:  88%|████████▊ | 5688/6434 [13:20:42<1:49:19,  8.79s/it, gpt_loss=0.231, loss_mean=0.275][A
+Train step of epoch 0:  88%|████████▊ | 5689/6434 [13:20:42<1:51:21,  8.97s/it, gpt_loss=0.231, loss_mean=0.275][A
+[LID Router Debug] Step: 5690
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [0, 2, 0, 5, 6, 4, 4, 5, 9, 2]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  88%|████████▊ | 5689/6434 [13:20:50<1:51:21,  8.97s/it, gpt_loss=0.298, loss_mean=0.278][A
+Train step of epoch 0:  88%|████████▊ | 5690/6434 [13:20:50<1:47:53,  8.70s/it, gpt_loss=0.298, loss_mean=0.278][A
+Train step of epoch 0:  88%|████████▊ | 5690/6434 [13:21:00<1:47:53,  8.70s/it, gpt_loss=0.246, loss_mean=0.274][A
+Train step of epoch 0:  88%|████████▊ | 5691/6434 [13:21:00<1:50:52,  8.95s/it, gpt_loss=0.246, loss_mean=0.274][A
+Train step of epoch 0:  88%|████████▊ | 5691/6434 [13:21:09<1:50:52,  8.95s/it, gpt_loss=0.275, loss_mean=0.274][A
+Train step of epoch 0:  88%|████████▊ | 5692/6434 [13:21:09<1:50:55,  8.97s/it, gpt_loss=0.275, loss_mean=0.274][A
+Train step of epoch 0:  88%|████████▊ | 5692/6434 [13:21:18<1:50:55,  8.97s/it, gpt_loss=0.259, loss_mean=0.273][A
+Train step of epoch 0:  88%|████████▊ | 5693/6434 [13:21:18<1:51:03,  8.99s/it, gpt_loss=0.259, loss_mean=0.273][A
+Train step of epoch 0:  88%|████████▊ | 5693/6434 [13:21:27<1:51:03,  8.99s/it, gpt_loss=0.353, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5694/6434 [13:21:27<1:50:39,  8.97s/it, gpt_loss=0.353, loss_mean=0.281][A
+Train step of epoch 0:  88%|████████▊ | 5694/6434 [13:21:35<1:50:39,  8.97s/it, gpt_loss=0.278, loss_mean=0.281][A
+Train step of epoch 0:  89%|████████▊ | 5695/6434 [13:21:35<1:48:01,  8.77s/it, gpt_loss=0.278, loss_mean=0.281][A
+Train step of epoch 0:  89%|████████▊ | 5695/6434 [13:21:43<1:48:01,  8.77s/it, gpt_loss=0.268, loss_mean=0.279][A
+Train step of epoch 0:  89%|████████▊ | 5696/6434 [13:21:43<1:45:24,  8.57s/it, gpt_loss=0.268, loss_mean=0.279][A
+Train step of epoch 0:  89%|████████▊ | 5696/6434 [13:21:52<1:45:24,  8.57s/it, gpt_loss=0.349, loss_mean=0.286][A
+Train step of epoch 0:  89%|████████▊ | 5697/6434 [13:21:52<1:45:11,  8.56s/it, gpt_loss=0.349, loss_mean=0.286][A
+Train step of epoch 0:  89%|████████▊ | 5697/6434 [13:22:00<1:45:11,  8.56s/it, gpt_loss=0.374, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▊ | 5698/6434 [13:22:00<1:44:44,  8.54s/it, gpt_loss=0.374, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▊ | 5698/6434 [13:22:09<1:44:44,  8.54s/it, gpt_loss=0.389, loss_mean=0.304][A
+Train step of epoch 0:  89%|████████▊ | 5699/6434 [13:22:09<1:45:18,  8.60s/it, gpt_loss=0.389, loss_mean=0.304][A
+[LID Router Debug] Step: 5700
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 6, 0, 5, 1, 4, 3, 3, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  89%|████████▊ | 5699/6434 [13:22:17<1:45:18,  8.60s/it, gpt_loss=0.283, loss_mean=0.302][A
+Train step of epoch 0:  89%|████████▊ | 5700/6434 [13:22:17<1:44:22,  8.53s/it, gpt_loss=0.283, loss_mean=0.302][A
+Train step of epoch 0:  89%|████████▊ | 5700/6434 [13:22:25<1:44:22,  8.53s/it, gpt_loss=0.224, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▊ | 5701/6434 [13:22:25<1:41:45,  8.33s/it, gpt_loss=0.224, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▊ | 5701/6434 [13:22:34<1:41:45,  8.33s/it, gpt_loss=0.31, loss_mean=0.296] [A
+Train step of epoch 0:  89%|████████▊ | 5702/6434 [13:22:34<1:43:02,  8.45s/it, gpt_loss=0.31, loss_mean=0.296][A
+Train step of epoch 0:  89%|████████▊ | 5702/6434 [13:22:41<1:43:02,  8.45s/it, gpt_loss=0.283, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▊ | 5703/6434 [13:22:41<1:38:39,  8.10s/it, gpt_loss=0.283, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▊ | 5703/6434 [13:22:49<1:38:39,  8.10s/it, gpt_loss=0.19, loss_mean=0.284] [A
+Train step of epoch 0:  89%|████████▊ | 5704/6434 [13:22:49<1:37:38,  8.03s/it, gpt_loss=0.19, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▊ | 5704/6434 [13:22:57<1:37:38,  8.03s/it, gpt_loss=0.312, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▊ | 5705/6434 [13:22:57<1:36:17,  7.92s/it, gpt_loss=0.312, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▊ | 5705/6434 [13:23:06<1:36:17,  7.92s/it, gpt_loss=0.293, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▊ | 5706/6434 [13:23:06<1:40:09,  8.26s/it, gpt_loss=0.293, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▊ | 5706/6434 [13:23:13<1:40:09,  8.26s/it, gpt_loss=0.323, loss_mean=0.291][A
+Train step of epoch 0:  89%|████████▊ | 5707/6434 [13:23:13<1:38:40,  8.14s/it, gpt_loss=0.323, loss_mean=0.291][A
+Train step of epoch 0:  89%|████████▊ | 5707/6434 [13:23:21<1:38:40,  8.14s/it, gpt_loss=0.265, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▊ | 5708/6434 [13:23:21<1:37:11,  8.03s/it, gpt_loss=0.265, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▊ | 5708/6434 [13:23:29<1:37:11,  8.03s/it, gpt_loss=0.298, loss_mean=0.29] [A
+Train step of epoch 0:  89%|████████▊ | 5709/6434 [13:23:29<1:37:35,  8.08s/it, gpt_loss=0.298, loss_mean=0.29][A
+[LID Router Debug] Step: 5710
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [4, 4, 0, 1, 3, 1, 2, 1, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  89%|████████▊ | 5709/6434 [13:23:39<1:37:35,  8.08s/it, gpt_loss=0.231, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▊ | 5710/6434 [13:23:39<1:41:38,  8.42s/it, gpt_loss=0.231, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▊ | 5710/6434 [13:23:47<1:41:38,  8.42s/it, gpt_loss=0.283, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5711/6434 [13:23:47<1:42:58,  8.55s/it, gpt_loss=0.283, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5711/6434 [13:23:57<1:42:58,  8.55s/it, gpt_loss=0.302, loss_mean=0.286][A
+Train step of epoch 0:  89%|████████▉ | 5712/6434 [13:23:57<1:45:21,  8.76s/it, gpt_loss=0.302, loss_mean=0.286][A
+Train step of epoch 0:  89%|████████▉ | 5712/6434 [13:24:05<1:45:21,  8.76s/it, gpt_loss=0.267, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5713/6434 [13:24:05<1:42:22,  8.52s/it, gpt_loss=0.267, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5713/6434 [13:24:13<1:42:22,  8.52s/it, gpt_loss=0.235, loss_mean=0.279][A
+Train step of epoch 0:  89%|████████▉ | 5714/6434 [13:24:13<1:41:30,  8.46s/it, gpt_loss=0.235, loss_mean=0.279][A
+Train step of epoch 0:  89%|████████▉ | 5714/6434 [13:24:22<1:41:30,  8.46s/it, gpt_loss=0.295, loss_mean=0.28] [A
+Train step of epoch 0:  89%|████████▉ | 5715/6434 [13:24:22<1:43:54,  8.67s/it, gpt_loss=0.295, loss_mean=0.28][A
+Train step of epoch 0:  89%|████████▉ | 5715/6434 [13:24:31<1:43:54,  8.67s/it, gpt_loss=0.31, loss_mean=0.283][A
+Train step of epoch 0:  89%|████████▉ | 5716/6434 [13:24:31<1:42:29,  8.56s/it, gpt_loss=0.31, loss_mean=0.283][A
+Train step of epoch 0:  89%|████████▉ | 5716/6434 [13:24:38<1:42:29,  8.56s/it, gpt_loss=0.323, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▉ | 5717/6434 [13:24:38<1:38:29,  8.24s/it, gpt_loss=0.323, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▉ | 5717/6434 [13:24:46<1:38:29,  8.24s/it, gpt_loss=0.336, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5718/6434 [13:24:46<1:36:39,  8.10s/it, gpt_loss=0.336, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5718/6434 [13:24:55<1:36:39,  8.10s/it, gpt_loss=0.34, loss_mean=0.297] [A
+Train step of epoch 0:  89%|████████▉ | 5719/6434 [13:24:55<1:39:11,  8.32s/it, gpt_loss=0.34, loss_mean=0.297][A
+[LID Router Debug] Step: 5720
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [1, 2, 9, 3, 9, 2, 2, 0, 0, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  89%|████████▉ | 5719/6434 [13:25:03<1:39:11,  8.32s/it, gpt_loss=0.275, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5720/6434 [13:25:03<1:38:26,  8.27s/it, gpt_loss=0.275, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5720/6434 [13:25:12<1:38:26,  8.27s/it, gpt_loss=0.291, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▉ | 5721/6434 [13:25:12<1:42:37,  8.64s/it, gpt_loss=0.291, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▉ | 5721/6434 [13:25:22<1:42:37,  8.64s/it, gpt_loss=0.361, loss_mean=0.301][A
+Train step of epoch 0:  89%|████████▉ | 5722/6434 [13:25:22<1:45:00,  8.85s/it, gpt_loss=0.361, loss_mean=0.301][A
+Train step of epoch 0:  89%|████████▉ | 5722/6434 [13:25:31<1:45:00,  8.85s/it, gpt_loss=0.269, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5723/6434 [13:25:31<1:45:30,  8.90s/it, gpt_loss=0.269, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5723/6434 [13:25:39<1:45:30,  8.90s/it, gpt_loss=0.297, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5724/6434 [13:25:39<1:42:59,  8.70s/it, gpt_loss=0.297, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5724/6434 [13:25:47<1:42:59,  8.70s/it, gpt_loss=0.226, loss_mean=0.291][A
+Train step of epoch 0:  89%|████████▉ | 5725/6434 [13:25:47<1:40:41,  8.52s/it, gpt_loss=0.226, loss_mean=0.291][A
+Train step of epoch 0:  89%|████████▉ | 5725/6434 [13:25:55<1:40:41,  8.52s/it, gpt_loss=0.355, loss_mean=0.297][A
+Train step of epoch 0:  89%|████████▉ | 5726/6434 [13:25:55<1:37:22,  8.25s/it, gpt_loss=0.355, loss_mean=0.297][A
+Train step of epoch 0:  89%|████████▉ | 5726/6434 [13:26:03<1:37:22,  8.25s/it, gpt_loss=0.181, loss_mean=0.285][A
+Train step of epoch 0:  89%|████████▉ | 5727/6434 [13:26:03<1:36:57,  8.23s/it, gpt_loss=0.181, loss_mean=0.285][A
+Train step of epoch 0:  89%|████████▉ | 5727/6434 [13:26:13<1:36:57,  8.23s/it, gpt_loss=0.317, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5728/6434 [13:26:13<1:42:17,  8.69s/it, gpt_loss=0.317, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5728/6434 [13:26:19<1:42:17,  8.69s/it, gpt_loss=0.238, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5729/6434 [13:26:19<1:35:47,  8.15s/it, gpt_loss=0.238, loss_mean=0.284][A
+[LID Router Debug] Step: 5730
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [9, 5, 6, 4, 0, 3, 8, 4, 9, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 8, 9}
+
+Train step of epoch 0:  89%|████████▉ | 5729/6434 [13:26:28<1:35:47,  8.15s/it, gpt_loss=0.372, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5730/6434 [13:26:28<1:36:28,  8.22s/it, gpt_loss=0.372, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5730/6434 [13:26:36<1:36:28,  8.22s/it, gpt_loss=0.312, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▉ | 5731/6434 [13:26:36<1:35:36,  8.16s/it, gpt_loss=0.312, loss_mean=0.294][A
+Train step of epoch 0:  89%|████████▉ | 5731/6434 [13:26:45<1:35:36,  8.16s/it, gpt_loss=0.297, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5732/6434 [13:26:45<1:38:55,  8.46s/it, gpt_loss=0.297, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5732/6434 [13:26:53<1:38:55,  8.46s/it, gpt_loss=0.278, loss_mean=0.293][A
+Train step of epoch 0:  89%|████████▉ | 5733/6434 [13:26:53<1:36:37,  8.27s/it, gpt_loss=0.278, loss_mean=0.293][A
+Train step of epoch 0:  89%|████████▉ | 5733/6434 [13:27:01<1:36:37,  8.27s/it, gpt_loss=0.245, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▉ | 5734/6434 [13:27:01<1:37:29,  8.36s/it, gpt_loss=0.245, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▉ | 5734/6434 [13:27:11<1:37:29,  8.36s/it, gpt_loss=0.244, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5735/6434 [13:27:11<1:40:49,  8.65s/it, gpt_loss=0.244, loss_mean=0.284][A
+Train step of epoch 0:  89%|████████▉ | 5735/6434 [13:27:19<1:40:49,  8.65s/it, gpt_loss=0.209, loss_mean=0.276][A
+Train step of epoch 0:  89%|████████▉ | 5736/6434 [13:27:19<1:40:23,  8.63s/it, gpt_loss=0.209, loss_mean=0.276][A
+Train step of epoch 0:  89%|████████▉ | 5736/6434 [13:27:29<1:40:23,  8.63s/it, gpt_loss=0.266, loss_mean=0.275][A
+Train step of epoch 0:  89%|████████▉ | 5737/6434 [13:27:29<1:42:29,  8.82s/it, gpt_loss=0.266, loss_mean=0.275][A
+Train step of epoch 0:  89%|████████▉ | 5737/6434 [13:27:37<1:42:29,  8.82s/it, gpt_loss=0.288, loss_mean=0.276][A
+Train step of epoch 0:  89%|████████▉ | 5738/6434 [13:27:37<1:41:18,  8.73s/it, gpt_loss=0.288, loss_mean=0.276][A
+Train step of epoch 0:  89%|████████▉ | 5738/6434 [13:27:45<1:41:18,  8.73s/it, gpt_loss=0.331, loss_mean=0.282][A
+Train step of epoch 0:  89%|████████▉ | 5739/6434 [13:27:45<1:38:44,  8.52s/it, gpt_loss=0.331, loss_mean=0.282][A
+[LID Router Debug] Step: 5740
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [0, 2, 4, 1, 3, 5, 1, 1, 6, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  89%|████████▉ | 5739/6434 [13:27:54<1:38:44,  8.52s/it, gpt_loss=0.237, loss_mean=0.277][A
+Train step of epoch 0:  89%|████████▉ | 5740/6434 [13:27:54<1:38:53,  8.55s/it, gpt_loss=0.237, loss_mean=0.277][A
+Train step of epoch 0:  89%|████████▉ | 5740/6434 [13:28:02<1:38:53,  8.55s/it, gpt_loss=0.202, loss_mean=0.27] [A
+Train step of epoch 0:  89%|████████▉ | 5741/6434 [13:28:02<1:38:26,  8.52s/it, gpt_loss=0.202, loss_mean=0.27][A
+Train step of epoch 0:  89%|████████▉ | 5741/6434 [13:28:10<1:38:26,  8.52s/it, gpt_loss=0.375, loss_mean=0.28][A
+Train step of epoch 0:  89%|████████▉ | 5742/6434 [13:28:10<1:36:53,  8.40s/it, gpt_loss=0.375, loss_mean=0.28][A
+Train step of epoch 0:  89%|████████▉ | 5742/6434 [13:28:18<1:36:53,  8.40s/it, gpt_loss=0.359, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▉ | 5743/6434 [13:28:18<1:35:12,  8.27s/it, gpt_loss=0.359, loss_mean=0.288][A
+Train step of epoch 0:  89%|████████▉ | 5743/6434 [13:28:27<1:35:12,  8.27s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5744/6434 [13:28:27<1:35:53,  8.34s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5744/6434 [13:28:35<1:35:53,  8.34s/it, gpt_loss=0.302, loss_mean=0.29] [A
+Train step of epoch 0:  89%|████████▉ | 5745/6434 [13:28:35<1:36:43,  8.42s/it, gpt_loss=0.302, loss_mean=0.29][A
+Train step of epoch 0:  89%|████████▉ | 5745/6434 [13:28:43<1:36:43,  8.42s/it, gpt_loss=0.286, loss_mean=0.29][A
+Train step of epoch 0:  89%|████████▉ | 5746/6434 [13:28:43<1:34:49,  8.27s/it, gpt_loss=0.286, loss_mean=0.29][A
+Train step of epoch 0:  89%|████████▉ | 5746/6434 [13:28:51<1:34:49,  8.27s/it, gpt_loss=0.266, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▉ | 5747/6434 [13:28:51<1:32:30,  8.08s/it, gpt_loss=0.266, loss_mean=0.287][A
+Train step of epoch 0:  89%|████████▉ | 5747/6434 [13:29:00<1:32:30,  8.08s/it, gpt_loss=0.302, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5748/6434 [13:29:00<1:35:59,  8.40s/it, gpt_loss=0.302, loss_mean=0.289][A
+Train step of epoch 0:  89%|████████▉ | 5748/6434 [13:29:08<1:35:59,  8.40s/it, gpt_loss=0.326, loss_mean=0.293][A
+Train step of epoch 0:  89%|████████▉ | 5749/6434 [13:29:08<1:32:55,  8.14s/it, gpt_loss=0.326, loss_mean=0.293][A
+[LID Router Debug] Step: 5750
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [5, 9, 6, 0, 3, 1, 9, 4, 5, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  89%|████████▉ | 5749/6434 [13:29:16<1:32:55,  8.14s/it, gpt_loss=0.351, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5750/6434 [13:29:16<1:32:07,  8.08s/it, gpt_loss=0.351, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5750/6434 [13:29:24<1:32:07,  8.08s/it, gpt_loss=0.378, loss_mean=0.306][A
+Train step of epoch 0:  89%|████████▉ | 5751/6434 [13:29:24<1:31:54,  8.07s/it, gpt_loss=0.378, loss_mean=0.306][A
+Train step of epoch 0:  89%|████████▉ | 5751/6434 [13:29:32<1:31:54,  8.07s/it, gpt_loss=0.255, loss_mean=0.301][A
+Train step of epoch 0:  89%|████████▉ | 5752/6434 [13:29:32<1:34:19,  8.30s/it, gpt_loss=0.255, loss_mean=0.301][A
+Train step of epoch 0:  89%|████████▉ | 5752/6434 [13:29:41<1:34:19,  8.30s/it, gpt_loss=0.239, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5753/6434 [13:29:41<1:34:34,  8.33s/it, gpt_loss=0.239, loss_mean=0.295][A
+Train step of epoch 0:  89%|████████▉ | 5753/6434 [13:29:51<1:34:34,  8.33s/it, gpt_loss=0.303, loss_mean=0.296][A
+Train step of epoch 0:  89%|████████▉ | 5754/6434 [13:29:51<1:41:18,  8.94s/it, gpt_loss=0.303, loss_mean=0.296][A
+Train step of epoch 0:  89%|████████▉ | 5754/6434 [13:29:59<1:41:18,  8.94s/it, gpt_loss=0.388, loss_mean=0.305][A
+Train step of epoch 0:  89%|████████▉ | 5755/6434 [13:29:59<1:38:37,  8.71s/it, gpt_loss=0.388, loss_mean=0.305][A
+Train step of epoch 0:  89%|████████▉ | 5755/6434 [13:30:08<1:38:37,  8.71s/it, gpt_loss=0.232, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5756/6434 [13:30:08<1:39:18,  8.79s/it, gpt_loss=0.232, loss_mean=0.298][A
+Train step of epoch 0:  89%|████████▉ | 5756/6434 [13:30:17<1:39:18,  8.79s/it, gpt_loss=0.243, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5757/6434 [13:30:17<1:37:04,  8.60s/it, gpt_loss=0.243, loss_mean=0.292][A
+Train step of epoch 0:  89%|████████▉ | 5757/6434 [13:30:27<1:37:04,  8.60s/it, gpt_loss=0.331, loss_mean=0.296][A
+Train step of epoch 0:  89%|████████▉ | 5758/6434 [13:30:27<1:43:46,  9.21s/it, gpt_loss=0.331, loss_mean=0.296][A
+Train step of epoch 0:  89%|████████▉ | 5758/6434 [13:30:36<1:43:46,  9.21s/it, gpt_loss=0.302, loss_mean=0.297][A
+Train step of epoch 0:  90%|████████▉ | 5759/6434 [13:30:36<1:42:20,  9.10s/it, gpt_loss=0.302, loss_mean=0.297][A
+[LID Router Debug] Step: 5760
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [5, 1, 3, 8, 9, 1, 4, 2, 5, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 8, 9}
+
+Train step of epoch 0:  90%|████████▉ | 5759/6434 [13:30:45<1:42:20,  9.10s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  90%|████████▉ | 5760/6434 [13:30:45<1:40:34,  8.95s/it, gpt_loss=0.316, loss_mean=0.299][A
+Train step of epoch 0:  90%|████████▉ | 5760/6434 [13:30:52<1:40:34,  8.95s/it, gpt_loss=0.391, loss_mean=0.308][A
+Train step of epoch 0:  90%|████████▉ | 5761/6434 [13:30:52<1:36:36,  8.61s/it, gpt_loss=0.391, loss_mean=0.308][A
+Train step of epoch 0:  90%|████████▉ | 5761/6434 [13:31:01<1:36:36,  8.61s/it, gpt_loss=0.26, loss_mean=0.303] [A
+Train step of epoch 0:  90%|████████▉ | 5762/6434 [13:31:01<1:37:02,  8.66s/it, gpt_loss=0.26, loss_mean=0.303][A
+Train step of epoch 0:  90%|████████▉ | 5762/6434 [13:31:10<1:37:02,  8.66s/it, gpt_loss=0.246, loss_mean=0.297][A
+Train step of epoch 0:  90%|████████▉ | 5763/6434 [13:31:10<1:36:23,  8.62s/it, gpt_loss=0.246, loss_mean=0.297][A
+Train step of epoch 0:  90%|████████▉ | 5763/6434 [13:31:19<1:36:23,  8.62s/it, gpt_loss=0.334, loss_mean=0.301][A
+Train step of epoch 0:  90%|████████▉ | 5764/6434 [13:31:19<1:38:06,  8.79s/it, gpt_loss=0.334, loss_mean=0.301][A
+Train step of epoch 0:  90%|████████▉ | 5764/6434 [13:31:27<1:38:06,  8.79s/it, gpt_loss=0.38, loss_mean=0.309] [A
+Train step of epoch 0:  90%|████████▉ | 5765/6434 [13:31:27<1:35:02,  8.52s/it, gpt_loss=0.38, loss_mean=0.309][A
+Train step of epoch 0:  90%|████████▉ | 5765/6434 [13:31:35<1:35:02,  8.52s/it, gpt_loss=0.249, loss_mean=0.303][A
+Train step of epoch 0:  90%|████████▉ | 5766/6434 [13:31:35<1:33:46,  8.42s/it, gpt_loss=0.249, loss_mean=0.303][A
+Train step of epoch 0:  90%|████████▉ | 5766/6434 [13:31:44<1:33:46,  8.42s/it, gpt_loss=0.323, loss_mean=0.305][A
+Train step of epoch 0:  90%|████████▉ | 5767/6434 [13:31:44<1:34:32,  8.51s/it, gpt_loss=0.323, loss_mean=0.305][A
+Train step of epoch 0:  90%|████████▉ | 5767/6434 [13:31:51<1:34:32,  8.51s/it, gpt_loss=0.322, loss_mean=0.307][A
+Train step of epoch 0:  90%|████████▉ | 5768/6434 [13:31:51<1:30:51,  8.19s/it, gpt_loss=0.322, loss_mean=0.307][A
+Train step of epoch 0:  90%|████████▉ | 5768/6434 [13:31:59<1:30:51,  8.19s/it, gpt_loss=0.359, loss_mean=0.312][A
+Train step of epoch 0:  90%|████████▉ | 5769/6434 [13:31:59<1:30:53,  8.20s/it, gpt_loss=0.359, loss_mean=0.312][A
+[LID Router Debug] Step: 5770
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [1, 5, 6, 1, 5, 4, 2, 0, 1, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  90%|████████▉ | 5769/6434 [13:32:08<1:30:53,  8.20s/it, gpt_loss=0.307, loss_mean=0.311][A
+Train step of epoch 0:  90%|████████▉ | 5770/6434 [13:32:08<1:31:32,  8.27s/it, gpt_loss=0.307, loss_mean=0.311][A
+Train step of epoch 0:  90%|████████▉ | 5770/6434 [13:32:17<1:31:32,  8.27s/it, gpt_loss=0.263, loss_mean=0.307][A
+Train step of epoch 0:  90%|████████▉ | 5771/6434 [13:32:17<1:33:41,  8.48s/it, gpt_loss=0.263, loss_mean=0.307][A
+Train step of epoch 0:  90%|████████▉ | 5771/6434 [13:32:25<1:33:41,  8.48s/it, gpt_loss=0.23, loss_mean=0.299] [A
+Train step of epoch 0:  90%|████████▉ | 5772/6434 [13:32:25<1:33:00,  8.43s/it, gpt_loss=0.23, loss_mean=0.299][A
+Train step of epoch 0:  90%|████████▉ | 5772/6434 [13:32:32<1:33:00,  8.43s/it, gpt_loss=0.225, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5773/6434 [13:32:32<1:29:05,  8.09s/it, gpt_loss=0.225, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5773/6434 [13:32:42<1:29:05,  8.09s/it, gpt_loss=0.32, loss_mean=0.294] [A
+Train step of epoch 0:  90%|████████▉ | 5774/6434 [13:32:42<1:32:31,  8.41s/it, gpt_loss=0.32, loss_mean=0.294][A
+Train step of epoch 0:  90%|████████▉ | 5774/6434 [13:32:49<1:32:31,  8.41s/it, gpt_loss=0.271, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5775/6434 [13:32:49<1:30:32,  8.24s/it, gpt_loss=0.271, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5775/6434 [13:32:57<1:30:32,  8.24s/it, gpt_loss=0.234, loss_mean=0.286][A
+Train step of epoch 0:  90%|████████▉ | 5776/6434 [13:32:57<1:26:54,  7.93s/it, gpt_loss=0.234, loss_mean=0.286][A
+Train step of epoch 0:  90%|████████▉ | 5776/6434 [13:33:05<1:26:54,  7.93s/it, gpt_loss=0.312, loss_mean=0.289][A
+Train step of epoch 0:  90%|████████▉ | 5777/6434 [13:33:05<1:29:13,  8.15s/it, gpt_loss=0.312, loss_mean=0.289][A
+Train step of epoch 0:  90%|████████▉ | 5777/6434 [13:33:13<1:29:13,  8.15s/it, gpt_loss=0.26, loss_mean=0.286] [A
+Train step of epoch 0:  90%|████████▉ | 5778/6434 [13:33:13<1:28:34,  8.10s/it, gpt_loss=0.26, loss_mean=0.286][A
+Train step of epoch 0:  90%|████████▉ | 5778/6434 [13:33:21<1:28:34,  8.10s/it, gpt_loss=0.287, loss_mean=0.286][A
+Train step of epoch 0:  90%|████████▉ | 5779/6434 [13:33:21<1:28:30,  8.11s/it, gpt_loss=0.287, loss_mean=0.286][A
+[LID Router Debug] Step: 5780
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [4, 6, 9, 3, 9, 5, 1, 3, 5, 3]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  90%|████████▉ | 5779/6434 [13:33:30<1:28:30,  8.11s/it, gpt_loss=0.302, loss_mean=0.288][A
+Train step of epoch 0:  90%|████████▉ | 5780/6434 [13:33:30<1:31:04,  8.36s/it, gpt_loss=0.302, loss_mean=0.288][A
+Train step of epoch 0:  90%|████████▉ | 5780/6434 [13:33:39<1:31:04,  8.36s/it, gpt_loss=0.312, loss_mean=0.29] [A
+Train step of epoch 0:  90%|████████▉ | 5781/6434 [13:33:39<1:32:13,  8.47s/it, gpt_loss=0.312, loss_mean=0.29][A
+Train step of epoch 0:  90%|████████▉ | 5781/6434 [13:33:47<1:32:13,  8.47s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  90%|████████▉ | 5782/6434 [13:33:47<1:30:25,  8.32s/it, gpt_loss=0.272, loss_mean=0.288][A
+Train step of epoch 0:  90%|████████▉ | 5782/6434 [13:33:56<1:30:25,  8.32s/it, gpt_loss=0.388, loss_mean=0.298][A
+Train step of epoch 0:  90%|████████▉ | 5783/6434 [13:33:56<1:31:26,  8.43s/it, gpt_loss=0.388, loss_mean=0.298][A
+Train step of epoch 0:  90%|████████▉ | 5783/6434 [13:34:05<1:31:26,  8.43s/it, gpt_loss=0.217, loss_mean=0.29] [A
+Train step of epoch 0:  90%|████████▉ | 5784/6434 [13:34:05<1:34:19,  8.71s/it, gpt_loss=0.217, loss_mean=0.29][A
+Train step of epoch 0:  90%|████████▉ | 5784/6434 [13:34:13<1:34:19,  8.71s/it, gpt_loss=0.311, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5785/6434 [13:34:13<1:30:54,  8.40s/it, gpt_loss=0.311, loss_mean=0.292][A
+Train step of epoch 0:  90%|████████▉ | 5785/6434 [13:34:21<1:30:54,  8.40s/it, gpt_loss=0.412, loss_mean=0.304][A
+Train step of epoch 0:  90%|████████▉ | 5786/6434 [13:34:21<1:31:55,  8.51s/it, gpt_loss=0.412, loss_mean=0.304][A
+Train step of epoch 0:  90%|████████▉ | 5786/6434 [13:34:29<1:31:55,  8.51s/it, gpt_loss=0.27, loss_mean=0.301] [A
+Train step of epoch 0:  90%|████████▉ | 5787/6434 [13:34:29<1:28:37,  8.22s/it, gpt_loss=0.27, loss_mean=0.301][A
+Train step of epoch 0:  90%|████████▉ | 5787/6434 [13:34:38<1:28:37,  8.22s/it, gpt_loss=0.344, loss_mean=0.305][A
+Train step of epoch 0:  90%|████████▉ | 5788/6434 [13:34:38<1:32:19,  8.58s/it, gpt_loss=0.344, loss_mean=0.305][A
+Train step of epoch 0:  90%|████████▉ | 5788/6434 [13:34:47<1:32:19,  8.58s/it, gpt_loss=0.263, loss_mean=0.301][A
+Train step of epoch 0:  90%|████████▉ | 5789/6434 [13:34:47<1:31:46,  8.54s/it, gpt_loss=0.263, loss_mean=0.301][A
+[LID Router Debug] Step: 5790
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [1, 1, 1, 3, 6, 4, 0, 1, 0, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+Train step of epoch 0:  90%|████████▉ | 5789/6434 [13:34:55<1:31:46,  8.54s/it, gpt_loss=0.325, loss_mean=0.303][A
+Train step of epoch 0:  90%|████████▉ | 5790/6434 [13:34:55<1:29:52,  8.37s/it, gpt_loss=0.325, loss_mean=0.303][A
+Train step of epoch 0:  90%|████████▉ | 5790/6434 [13:35:05<1:29:52,  8.37s/it, gpt_loss=0.244, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5791/6434 [13:35:05<1:34:04,  8.78s/it, gpt_loss=0.244, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5791/6434 [13:35:14<1:34:04,  8.78s/it, gpt_loss=0.319, loss_mean=0.3]  [A
+Train step of epoch 0:  90%|█████████ | 5792/6434 [13:35:14<1:35:31,  8.93s/it, gpt_loss=0.319, loss_mean=0.3][A
+Train step of epoch 0:  90%|█████████ | 5792/6434 [13:35:24<1:35:31,  8.93s/it, gpt_loss=0.246, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5793/6434 [13:35:24<1:37:52,  9.16s/it, gpt_loss=0.246, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5793/6434 [13:35:32<1:37:52,  9.16s/it, gpt_loss=0.29, loss_mean=0.294] [A
+Train step of epoch 0:  90%|█████████ | 5794/6434 [13:35:32<1:34:28,  8.86s/it, gpt_loss=0.29, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5794/6434 [13:35:39<1:34:28,  8.86s/it, gpt_loss=0.255, loss_mean=0.29][A
+Train step of epoch 0:  90%|█████████ | 5795/6434 [13:35:39<1:29:22,  8.39s/it, gpt_loss=0.255, loss_mean=0.29][A
+Train step of epoch 0:  90%|█████████ | 5795/6434 [13:35:48<1:29:22,  8.39s/it, gpt_loss=0.317, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5796/6434 [13:35:48<1:31:08,  8.57s/it, gpt_loss=0.317, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5796/6434 [13:35:57<1:31:08,  8.57s/it, gpt_loss=0.298, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5797/6434 [13:35:57<1:32:28,  8.71s/it, gpt_loss=0.298, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5797/6434 [13:36:07<1:32:28,  8.71s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5798/6434 [13:36:07<1:35:21,  9.00s/it, gpt_loss=0.292, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5798/6434 [13:36:16<1:35:21,  9.00s/it, gpt_loss=0.325, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5799/6434 [13:36:16<1:35:51,  9.06s/it, gpt_loss=0.325, loss_mean=0.296][A
+[LID Router Debug] Step: 5800
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [4, 5, 9, 9, 5, 1, 2, 0, 4, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+[2026-02-07 05:32:28,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=0, lr=[1.6044698936164263e-05, 1.6044698936164263e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 05:32:28,006] [INFO] [timer.py:260:stop] epoch=0/micro_step=5800/global_step=2900, RunningAvgSamplesPerSec=4.746236306011785, CurrSamplesPerSec=4.7687948092918075, MemAllocated=12.81GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  90%|█████████ | 5799/6434 [13:36:24<1:35:51,  9.06s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5800/6434 [13:36:24<1:31:06,  8.62s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5800/6434 [13:36:31<1:31:06,  8.62s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5801/6434 [13:36:31<1:28:06,  8.35s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5801/6434 [13:36:40<1:28:06,  8.35s/it, gpt_loss=0.375, loss_mean=0.304][A
+Train step of epoch 0:  90%|█████████ | 5802/6434 [13:36:40<1:30:08,  8.56s/it, gpt_loss=0.375, loss_mean=0.304][A
+Train step of epoch 0:  90%|█████████ | 5802/6434 [13:36:49<1:30:08,  8.56s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  90%|█████████ | 5803/6434 [13:36:49<1:30:38,  8.62s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  90%|█████████ | 5803/6434 [13:36:57<1:30:38,  8.62s/it, gpt_loss=0.324, loss_mean=0.304][A
+Train step of epoch 0:  90%|█████████ | 5804/6434 [13:36:57<1:29:51,  8.56s/it, gpt_loss=0.324, loss_mean=0.304][A
+Train step of epoch 0:  90%|█████████ | 5804/6434 [13:37:07<1:29:51,  8.56s/it, gpt_loss=0.295, loss_mean=0.303][A
+Train step of epoch 0:  90%|█████████ | 5805/6434 [13:37:07<1:31:39,  8.74s/it, gpt_loss=0.295, loss_mean=0.303][A
+Train step of epoch 0:  90%|█████████ | 5805/6434 [13:37:15<1:31:39,  8.74s/it, gpt_loss=0.299, loss_mean=0.303][A
+Train step of epoch 0:  90%|█████████ | 5806/6434 [13:37:15<1:30:39,  8.66s/it, gpt_loss=0.299, loss_mean=0.303][A
+Train step of epoch 0:  90%|█████████ | 5806/6434 [13:37:24<1:30:39,  8.66s/it, gpt_loss=0.235, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5807/6434 [13:37:24<1:32:23,  8.84s/it, gpt_loss=0.235, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5807/6434 [13:37:32<1:32:23,  8.84s/it, gpt_loss=0.301, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5808/6434 [13:37:32<1:29:23,  8.57s/it, gpt_loss=0.301, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5808/6434 [13:37:40<1:29:23,  8.57s/it, gpt_loss=0.258, loss_mean=0.293][A
+Train step of epoch 0:  90%|█████████ | 5809/6434 [13:37:40<1:24:57,  8.16s/it, gpt_loss=0.258, loss_mean=0.293][A
+[LID Router Debug] Step: 5810
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [1, 9, 0, 4, 9, 4, 4, 4, 5, 9]
+Active Experts in Batch: {0, 1, 4, 5, 9}
+
+Train step of epoch 0:  90%|█████████ | 5809/6434 [13:37:49<1:24:57,  8.16s/it, gpt_loss=0.349, loss_mean=0.298][A
+Train step of epoch 0:  90%|█████████ | 5810/6434 [13:37:49<1:28:18,  8.49s/it, gpt_loss=0.349, loss_mean=0.298][A
+Train step of epoch 0:  90%|█████████ | 5810/6434 [13:37:57<1:28:18,  8.49s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5811/6434 [13:37:57<1:25:52,  8.27s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5811/6434 [13:38:04<1:25:52,  8.27s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5812/6434 [13:38:04<1:24:44,  8.17s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  90%|█████████ | 5812/6434 [13:38:13<1:24:44,  8.17s/it, gpt_loss=0.316, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5813/6434 [13:38:13<1:26:30,  8.36s/it, gpt_loss=0.316, loss_mean=0.296][A
+Train step of epoch 0:  90%|█████████ | 5813/6434 [13:38:22<1:26:30,  8.36s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5814/6434 [13:38:22<1:28:52,  8.60s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0:  90%|█████████ | 5814/6434 [13:38:31<1:28:52,  8.60s/it, gpt_loss=0.235, loss_mean=0.291][A
+Train step of epoch 0:  90%|█████████ | 5815/6434 [13:38:31<1:27:22,  8.47s/it, gpt_loss=0.235, loss_mean=0.291][A
+Train step of epoch 0:  90%|█████████ | 5815/6434 [13:38:39<1:27:22,  8.47s/it, gpt_loss=0.212, loss_mean=0.283][A
+Train step of epoch 0:  90%|█████████ | 5816/6434 [13:38:39<1:27:08,  8.46s/it, gpt_loss=0.212, loss_mean=0.283][A
+Train step of epoch 0:  90%|█████████ | 5816/6434 [13:38:48<1:27:08,  8.46s/it, gpt_loss=0.257, loss_mean=0.28] [A
+Train step of epoch 0:  90%|█████████ | 5817/6434 [13:38:48<1:28:34,  8.61s/it, gpt_loss=0.257, loss_mean=0.28][A
+Train step of epoch 0:  90%|█████████ | 5817/6434 [13:38:56<1:28:34,  8.61s/it, gpt_loss=0.299, loss_mean=0.282][A
+Train step of epoch 0:  90%|█████████ | 5818/6434 [13:38:56<1:25:47,  8.36s/it, gpt_loss=0.299, loss_mean=0.282][A
+Train step of epoch 0:  90%|█████████ | 5818/6434 [13:39:04<1:25:47,  8.36s/it, gpt_loss=0.287, loss_mean=0.283][A
+Train step of epoch 0:  90%|█████████ | 5819/6434 [13:39:04<1:25:29,  8.34s/it, gpt_loss=0.287, loss_mean=0.283][A
+[LID Router Debug] Step: 5820
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 6, 9, 0, 0, 2, 9, 1, 0, 5]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9}
+
+Train step of epoch 0:  90%|█████████ | 5819/6434 [13:39:14<1:25:29,  8.34s/it, gpt_loss=0.293, loss_mean=0.284][A
+Train step of epoch 0:  90%|█████████ | 5820/6434 [13:39:14<1:29:09,  8.71s/it, gpt_loss=0.293, loss_mean=0.284][A
+Train step of epoch 0:  90%|█████████ | 5820/6434 [13:39:21<1:29:09,  8.71s/it, gpt_loss=0.265, loss_mean=0.282][A
+Train step of epoch 0:  90%|█████████ | 5821/6434 [13:39:21<1:25:22,  8.36s/it, gpt_loss=0.265, loss_mean=0.282][A
+Train step of epoch 0:  90%|█████████ | 5821/6434 [13:39:29<1:25:22,  8.36s/it, gpt_loss=0.277, loss_mean=0.281][A
+Train step of epoch 0:  90%|█████████ | 5822/6434 [13:39:29<1:22:09,  8.05s/it, gpt_loss=0.277, loss_mean=0.281][A
+Train step of epoch 0:  90%|█████████ | 5822/6434 [13:39:37<1:22:09,  8.05s/it, gpt_loss=0.318, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5823/6434 [13:39:37<1:22:08,  8.07s/it, gpt_loss=0.318, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5823/6434 [13:39:46<1:22:08,  8.07s/it, gpt_loss=0.373, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5824/6434 [13:39:46<1:25:28,  8.41s/it, gpt_loss=0.373, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5824/6434 [13:39:56<1:25:28,  8.41s/it, gpt_loss=0.196, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5825/6434 [13:39:56<1:29:36,  8.83s/it, gpt_loss=0.196, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5825/6434 [13:40:05<1:29:36,  8.83s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  91%|█████████ | 5826/6434 [13:40:05<1:29:44,  8.86s/it, gpt_loss=0.268, loss_mean=0.282][A
+Train step of epoch 0:  91%|█████████ | 5826/6434 [13:40:14<1:29:44,  8.86s/it, gpt_loss=0.285, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████ | 5827/6434 [13:40:14<1:30:08,  8.91s/it, gpt_loss=0.285, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████ | 5827/6434 [13:40:21<1:30:08,  8.91s/it, gpt_loss=0.263, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████ | 5828/6434 [13:40:21<1:26:00,  8.52s/it, gpt_loss=0.263, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████ | 5828/6434 [13:40:30<1:26:00,  8.52s/it, gpt_loss=0.403, loss_mean=0.293][A
+Train step of epoch 0:  91%|█████████ | 5829/6434 [13:40:30<1:25:47,  8.51s/it, gpt_loss=0.403, loss_mean=0.293][A
+[LID Router Debug] Step: 5830
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 2, 9, 0, 4, 1, 2, 0, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+Train step of epoch 0:  91%|█████████ | 5829/6434 [13:40:38<1:25:47,  8.51s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  91%|█████████ | 5830/6434 [13:40:38<1:24:12,  8.37s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  91%|█████████ | 5830/6434 [13:40:46<1:24:12,  8.37s/it, gpt_loss=0.408, loss_mean=0.304][A
+Train step of epoch 0:  91%|█████████ | 5831/6434 [13:40:46<1:23:26,  8.30s/it, gpt_loss=0.408, loss_mean=0.304][A
+Train step of epoch 0:  91%|█████████ | 5831/6434 [13:40:54<1:23:26,  8.30s/it, gpt_loss=0.268, loss_mean=0.301][A
+Train step of epoch 0:  91%|█████████ | 5832/6434 [13:40:54<1:22:32,  8.23s/it, gpt_loss=0.268, loss_mean=0.301][A
+Train step of epoch 0:  91%|█████████ | 5832/6434 [13:41:02<1:22:32,  8.23s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  91%|█████████ | 5833/6434 [13:41:02<1:22:05,  8.20s/it, gpt_loss=0.283, loss_mean=0.299][A
+Train step of epoch 0:  91%|█████████ | 5833/6434 [13:41:10<1:22:05,  8.20s/it, gpt_loss=0.264, loss_mean=0.295][A
+Train step of epoch 0:  91%|█████████ | 5834/6434 [13:41:10<1:19:54,  7.99s/it, gpt_loss=0.264, loss_mean=0.295][A
+Train step of epoch 0:  91%|█████████ | 5834/6434 [13:41:19<1:19:54,  7.99s/it, gpt_loss=0.248, loss_mean=0.291][A
+Train step of epoch 0:  91%|█████████ | 5835/6434 [13:41:19<1:23:56,  8.41s/it, gpt_loss=0.248, loss_mean=0.291][A
+Train step of epoch 0:  91%|█████████ | 5835/6434 [13:41:27<1:23:56,  8.41s/it, gpt_loss=0.243, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5836/6434 [13:41:27<1:22:57,  8.32s/it, gpt_loss=0.243, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5836/6434 [13:41:36<1:22:57,  8.32s/it, gpt_loss=0.291, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5837/6434 [13:41:36<1:24:25,  8.48s/it, gpt_loss=0.291, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5837/6434 [13:41:44<1:24:25,  8.48s/it, gpt_loss=0.308, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5838/6434 [13:41:44<1:23:26,  8.40s/it, gpt_loss=0.308, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5838/6434 [13:41:53<1:23:26,  8.40s/it, gpt_loss=0.258, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5839/6434 [13:41:53<1:23:35,  8.43s/it, gpt_loss=0.258, loss_mean=0.285][A
+[LID Router Debug] Step: 5840
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [6, 4, 3, 0, 0, 1, 5, 2, 3, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  91%|█████████ | 5839/6434 [13:42:02<1:23:35,  8.43s/it, gpt_loss=0.295, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5840/6434 [13:42:02<1:25:09,  8.60s/it, gpt_loss=0.295, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5840/6434 [13:42:10<1:25:09,  8.60s/it, gpt_loss=0.28, loss_mean=0.286] [A
+Train step of epoch 0:  91%|█████████ | 5841/6434 [13:42:10<1:25:38,  8.66s/it, gpt_loss=0.28, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5841/6434 [13:42:18<1:25:38,  8.66s/it, gpt_loss=0.196, loss_mean=0.277][A
+Train step of epoch 0:  91%|█████████ | 5842/6434 [13:42:18<1:21:21,  8.25s/it, gpt_loss=0.196, loss_mean=0.277][A
+Train step of epoch 0:  91%|█████████ | 5842/6434 [13:42:25<1:21:21,  8.25s/it, gpt_loss=0.347, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5843/6434 [13:42:25<1:19:50,  8.11s/it, gpt_loss=0.347, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5843/6434 [13:42:35<1:19:50,  8.11s/it, gpt_loss=0.332, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5844/6434 [13:42:35<1:24:49,  8.63s/it, gpt_loss=0.332, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5844/6434 [13:42:44<1:24:49,  8.63s/it, gpt_loss=0.24, loss_mean=0.284] [A
+Train step of epoch 0:  91%|█████████ | 5845/6434 [13:42:44<1:24:19,  8.59s/it, gpt_loss=0.24, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5845/6434 [13:42:53<1:24:19,  8.59s/it, gpt_loss=0.388, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5846/6434 [13:42:53<1:27:01,  8.88s/it, gpt_loss=0.388, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5846/6434 [13:43:02<1:27:01,  8.88s/it, gpt_loss=0.261, loss_mean=0.291][A
+Train step of epoch 0:  91%|█████████ | 5847/6434 [13:43:02<1:26:34,  8.85s/it, gpt_loss=0.261, loss_mean=0.291][A
+Train step of epoch 0:  91%|█████████ | 5847/6434 [13:43:10<1:26:34,  8.85s/it, gpt_loss=0.258, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5848/6434 [13:43:10<1:24:36,  8.66s/it, gpt_loss=0.258, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5848/6434 [13:43:19<1:24:36,  8.66s/it, gpt_loss=0.314, loss_mean=0.29] [A
+Train step of epoch 0:  91%|█████████ | 5849/6434 [13:43:19<1:23:22,  8.55s/it, gpt_loss=0.314, loss_mean=0.29][A
+[LID Router Debug] Step: 5850
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [2, 9, 3, 7, 2, 4, 1, 4, 9, 3]
+Active Experts in Batch: {1, 2, 3, 4, 7, 9}
+
+Train step of epoch 0:  91%|█████████ | 5849/6434 [13:43:28<1:23:22,  8.55s/it, gpt_loss=0.266, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5850/6434 [13:43:28<1:24:25,  8.67s/it, gpt_loss=0.266, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5850/6434 [13:43:36<1:24:25,  8.67s/it, gpt_loss=0.288, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5851/6434 [13:43:36<1:23:26,  8.59s/it, gpt_loss=0.288, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5851/6434 [13:43:44<1:23:26,  8.59s/it, gpt_loss=0.272, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5852/6434 [13:43:44<1:22:14,  8.48s/it, gpt_loss=0.272, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5852/6434 [13:43:53<1:22:14,  8.48s/it, gpt_loss=0.256, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████ | 5853/6434 [13:43:53<1:22:55,  8.56s/it, gpt_loss=0.256, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████ | 5853/6434 [13:44:01<1:22:55,  8.56s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5854/6434 [13:44:01<1:20:04,  8.28s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5854/6434 [13:44:09<1:20:04,  8.28s/it, gpt_loss=0.333, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5855/6434 [13:44:09<1:20:27,  8.34s/it, gpt_loss=0.333, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5855/6434 [13:44:17<1:20:27,  8.34s/it, gpt_loss=0.247, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5856/6434 [13:44:17<1:18:18,  8.13s/it, gpt_loss=0.247, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████ | 5856/6434 [13:44:24<1:18:18,  8.13s/it, gpt_loss=0.302, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████ | 5857/6434 [13:44:24<1:15:30,  7.85s/it, gpt_loss=0.302, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████ | 5857/6434 [13:44:32<1:15:30,  7.85s/it, gpt_loss=0.291, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████ | 5858/6434 [13:44:32<1:14:58,  7.81s/it, gpt_loss=0.291, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████ | 5858/6434 [13:44:41<1:14:58,  7.81s/it, gpt_loss=0.335, loss_mean=0.292][A
+Train step of epoch 0:  91%|█████████ | 5859/6434 [13:44:41<1:18:48,  8.22s/it, gpt_loss=0.335, loss_mean=0.292][A
+[LID Router Debug] Step: 5860
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [0, 9, 4, 0, 3, 9, 2, 2, 3, 6]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  91%|█████████ | 5859/6434 [13:44:50<1:18:48,  8.22s/it, gpt_loss=0.235, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5860/6434 [13:44:50<1:20:11,  8.38s/it, gpt_loss=0.235, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5860/6434 [13:44:58<1:20:11,  8.38s/it, gpt_loss=0.278, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5861/6434 [13:44:58<1:20:34,  8.44s/it, gpt_loss=0.278, loss_mean=0.286][A
+Train step of epoch 0:  91%|█████████ | 5861/6434 [13:45:07<1:20:34,  8.44s/it, gpt_loss=0.244, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████ | 5862/6434 [13:45:07<1:22:16,  8.63s/it, gpt_loss=0.244, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████ | 5862/6434 [13:45:16<1:22:16,  8.63s/it, gpt_loss=0.304, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5863/6434 [13:45:16<1:23:31,  8.78s/it, gpt_loss=0.304, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5863/6434 [13:45:24<1:23:31,  8.78s/it, gpt_loss=0.335, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5864/6434 [13:45:24<1:20:21,  8.46s/it, gpt_loss=0.335, loss_mean=0.289][A
+Train step of epoch 0:  91%|█████████ | 5864/6434 [13:45:33<1:20:21,  8.46s/it, gpt_loss=0.245, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5865/6434 [13:45:33<1:22:45,  8.73s/it, gpt_loss=0.245, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████ | 5865/6434 [13:45:41<1:22:45,  8.73s/it, gpt_loss=0.316, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5866/6434 [13:45:41<1:20:37,  8.52s/it, gpt_loss=0.316, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████ | 5866/6434 [13:45:50<1:20:37,  8.52s/it, gpt_loss=0.473, loss_mean=0.306][A
+Train step of epoch 0:  91%|█████████ | 5867/6434 [13:45:50<1:20:05,  8.48s/it, gpt_loss=0.473, loss_mean=0.306][A
+Train step of epoch 0:  91%|█████████ | 5867/6434 [13:45:57<1:20:05,  8.48s/it, gpt_loss=0.238, loss_mean=0.299][A
+Train step of epoch 0:  91%|█████████ | 5868/6434 [13:45:57<1:17:11,  8.18s/it, gpt_loss=0.238, loss_mean=0.299][A
+Train step of epoch 0:  91%|█████████ | 5868/6434 [13:46:07<1:17:11,  8.18s/it, gpt_loss=0.234, loss_mean=0.293][A
+Train step of epoch 0:  91%|█████████ | 5869/6434 [13:46:07<1:21:13,  8.63s/it, gpt_loss=0.234, loss_mean=0.293][A
+[LID Router Debug] Step: 5870
+Batch Size: 10
+Audio Batch Size: 141
+LID Assignments: [5, 2, 9, 5, 9, 9, 3, 3, 9, 9]
+Active Experts in Batch: {9, 2, 3, 5}
+
+Train step of epoch 0:  91%|█████████ | 5869/6434 [13:46:16<1:21:13,  8.63s/it, gpt_loss=0.304, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5870/6434 [13:46:16<1:22:14,  8.75s/it, gpt_loss=0.304, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5870/6434 [13:46:24<1:22:14,  8.75s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5871/6434 [13:46:24<1:20:05,  8.54s/it, gpt_loss=0.293, loss_mean=0.294][A
+Train step of epoch 0:  91%|█████████ | 5871/6434 [13:46:33<1:20:05,  8.54s/it, gpt_loss=0.237, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████▏| 5872/6434 [13:46:33<1:20:50,  8.63s/it, gpt_loss=0.237, loss_mean=0.288][A
+Train step of epoch 0:  91%|█████████▏| 5872/6434 [13:46:43<1:20:50,  8.63s/it, gpt_loss=0.275, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5873/6434 [13:46:43<1:23:28,  8.93s/it, gpt_loss=0.275, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5873/6434 [13:46:51<1:23:28,  8.93s/it, gpt_loss=0.319, loss_mean=0.29] [A
+Train step of epoch 0:  91%|█████████▏| 5874/6434 [13:46:51<1:20:48,  8.66s/it, gpt_loss=0.319, loss_mean=0.29][A
+Train step of epoch 0:  91%|█████████▏| 5874/6434 [13:47:00<1:20:48,  8.66s/it, gpt_loss=0.232, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████▏| 5875/6434 [13:47:00<1:22:45,  8.88s/it, gpt_loss=0.232, loss_mean=0.284][A
+Train step of epoch 0:  91%|█████████▏| 5875/6434 [13:47:09<1:22:45,  8.88s/it, gpt_loss=0.276, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████▏| 5876/6434 [13:47:09<1:21:36,  8.77s/it, gpt_loss=0.276, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████▏| 5876/6434 [13:47:18<1:21:36,  8.77s/it, gpt_loss=0.264, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████▏| 5877/6434 [13:47:18<1:24:20,  9.08s/it, gpt_loss=0.264, loss_mean=0.281][A
+Train step of epoch 0:  91%|█████████▏| 5877/6434 [13:47:28<1:24:20,  9.08s/it, gpt_loss=0.337, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5878/6434 [13:47:28<1:26:49,  9.37s/it, gpt_loss=0.337, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5878/6434 [13:47:36<1:26:49,  9.37s/it, gpt_loss=0.328, loss_mean=0.291][A
+Train step of epoch 0:  91%|█████████▏| 5879/6434 [13:47:36<1:22:58,  8.97s/it, gpt_loss=0.328, loss_mean=0.291][A
+[LID Router Debug] Step: 5880
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [0, 4, 5, 3, 1, 5, 3, 2, 4, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  91%|█████████▏| 5879/6434 [13:47:45<1:22:58,  8.97s/it, gpt_loss=0.251, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5880/6434 [13:47:45<1:20:48,  8.75s/it, gpt_loss=0.251, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5880/6434 [13:47:53<1:20:48,  8.75s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5881/6434 [13:47:53<1:19:11,  8.59s/it, gpt_loss=0.286, loss_mean=0.287][A
+Train step of epoch 0:  91%|█████████▏| 5881/6434 [13:48:01<1:19:11,  8.59s/it, gpt_loss=0.321, loss_mean=0.29] [A
+Train step of epoch 0:  91%|█████████▏| 5882/6434 [13:48:01<1:18:05,  8.49s/it, gpt_loss=0.321, loss_mean=0.29][A
+Train step of epoch 0:  91%|█████████▏| 5882/6434 [13:48:08<1:18:05,  8.49s/it, gpt_loss=0.231, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████▏| 5883/6434 [13:48:08<1:14:26,  8.11s/it, gpt_loss=0.231, loss_mean=0.285][A
+Train step of epoch 0:  91%|█████████▏| 5883/6434 [13:48:18<1:14:26,  8.11s/it, gpt_loss=0.24, loss_mean=0.28]  [A
+Train step of epoch 0:  91%|█████████▏| 5884/6434 [13:48:18<1:18:12,  8.53s/it, gpt_loss=0.24, loss_mean=0.28][A
+Train step of epoch 0:  91%|█████████▏| 5884/6434 [13:48:26<1:18:12,  8.53s/it, gpt_loss=0.266, loss_mean=0.279][A
+Train step of epoch 0:  91%|█████████▏| 5885/6434 [13:48:26<1:16:36,  8.37s/it, gpt_loss=0.266, loss_mean=0.279][A
+Train step of epoch 0:  91%|█████████▏| 5885/6434 [13:48:34<1:16:36,  8.37s/it, gpt_loss=0.318, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████▏| 5886/6434 [13:48:34<1:15:48,  8.30s/it, gpt_loss=0.318, loss_mean=0.283][A
+Train step of epoch 0:  91%|█████████▏| 5886/6434 [13:48:42<1:15:48,  8.30s/it, gpt_loss=0.259, loss_mean=0.28] [A
+Train step of epoch 0:  91%|█████████▏| 5887/6434 [13:48:42<1:14:05,  8.13s/it, gpt_loss=0.259, loss_mean=0.28][A
+Train step of epoch 0:  91%|█████████▏| 5887/6434 [13:48:50<1:14:05,  8.13s/it, gpt_loss=0.295, loss_mean=0.282][A
+Train step of epoch 0:  92%|█████████▏| 5888/6434 [13:48:50<1:15:14,  8.27s/it, gpt_loss=0.295, loss_mean=0.282][A
+Train step of epoch 0:  92%|█████████▏| 5888/6434 [13:48:59<1:15:14,  8.27s/it, gpt_loss=0.315, loss_mean=0.285][A
+Train step of epoch 0:  92%|█████████▏| 5889/6434 [13:48:59<1:16:12,  8.39s/it, gpt_loss=0.315, loss_mean=0.285][A
+[LID Router Debug] Step: 5890
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 3, 9, 2, 1, 1, 5, 3, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  92%|█████████▏| 5889/6434 [13:49:07<1:16:12,  8.39s/it, gpt_loss=0.26, loss_mean=0.283] [A
+Train step of epoch 0:  92%|█████████▏| 5890/6434 [13:49:07<1:15:22,  8.31s/it, gpt_loss=0.26, loss_mean=0.283][A
+Train step of epoch 0:  92%|█████████▏| 5890/6434 [13:49:16<1:15:22,  8.31s/it, gpt_loss=0.296, loss_mean=0.284][A
+Train step of epoch 0:  92%|█████████▏| 5891/6434 [13:49:16<1:16:02,  8.40s/it, gpt_loss=0.296, loss_mean=0.284][A
+Train step of epoch 0:  92%|█████████▏| 5891/6434 [13:49:24<1:16:02,  8.40s/it, gpt_loss=0.265, loss_mean=0.282][A
+Train step of epoch 0:  92%|█████████▏| 5892/6434 [13:49:24<1:14:35,  8.26s/it, gpt_loss=0.265, loss_mean=0.282][A
+Train step of epoch 0:  92%|█████████▏| 5892/6434 [13:49:34<1:14:35,  8.26s/it, gpt_loss=0.324, loss_mean=0.286][A
+Train step of epoch 0:  92%|█████████▏| 5893/6434 [13:49:34<1:18:55,  8.75s/it, gpt_loss=0.324, loss_mean=0.286][A
+Train step of epoch 0:  92%|█████████▏| 5893/6434 [13:49:42<1:18:55,  8.75s/it, gpt_loss=0.303, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5894/6434 [13:49:42<1:16:41,  8.52s/it, gpt_loss=0.303, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5894/6434 [13:49:51<1:16:41,  8.52s/it, gpt_loss=0.25, loss_mean=0.284] [A
+Train step of epoch 0:  92%|█████████▏| 5895/6434 [13:49:51<1:18:11,  8.70s/it, gpt_loss=0.25, loss_mean=0.284][A
+Train step of epoch 0:  92%|█████████▏| 5895/6434 [13:49:59<1:18:11,  8.70s/it, gpt_loss=0.292, loss_mean=0.285][A
+Train step of epoch 0:  92%|█████████▏| 5896/6434 [13:49:59<1:16:21,  8.52s/it, gpt_loss=0.292, loss_mean=0.285][A
+Train step of epoch 0:  92%|█████████▏| 5896/6434 [13:50:07<1:16:21,  8.52s/it, gpt_loss=0.434, loss_mean=0.3]  [A
+Train step of epoch 0:  92%|█████████▏| 5897/6434 [13:50:07<1:16:33,  8.55s/it, gpt_loss=0.434, loss_mean=0.3][A
+Train step of epoch 0:  92%|█████████▏| 5897/6434 [13:50:16<1:16:33,  8.55s/it, gpt_loss=0.331, loss_mean=0.303][A
+Train step of epoch 0:  92%|█████████▏| 5898/6434 [13:50:16<1:17:38,  8.69s/it, gpt_loss=0.331, loss_mean=0.303][A
+Train step of epoch 0:  92%|█████████▏| 5898/6434 [13:50:25<1:17:38,  8.69s/it, gpt_loss=0.28, loss_mean=0.301] [A
+Train step of epoch 0:  92%|█████████▏| 5899/6434 [13:50:25<1:16:50,  8.62s/it, gpt_loss=0.28, loss_mean=0.301][A
+[LID Router Debug] Step: 5900
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 1, 3, 1, 4, 9, 4, 1, 5, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  92%|█████████▏| 5899/6434 [13:50:33<1:16:50,  8.62s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  92%|█████████▏| 5900/6434 [13:50:33<1:15:01,  8.43s/it, gpt_loss=0.287, loss_mean=0.299][A
+Train step of epoch 0:  92%|█████████▏| 5900/6434 [13:50:41<1:15:01,  8.43s/it, gpt_loss=0.224, loss_mean=0.292][A
+Train step of epoch 0:  92%|█████████▏| 5901/6434 [13:50:41<1:13:33,  8.28s/it, gpt_loss=0.224, loss_mean=0.292][A
+Train step of epoch 0:  92%|█████████▏| 5901/6434 [13:50:50<1:13:33,  8.28s/it, gpt_loss=0.287, loss_mean=0.291][A
+Train step of epoch 0:  92%|█████████▏| 5902/6434 [13:50:50<1:16:21,  8.61s/it, gpt_loss=0.287, loss_mean=0.291][A
+Train step of epoch 0:  92%|█████████▏| 5902/6434 [13:50:58<1:16:21,  8.61s/it, gpt_loss=0.249, loss_mean=0.287][A
+Train step of epoch 0:  92%|█████████▏| 5903/6434 [13:50:58<1:13:03,  8.25s/it, gpt_loss=0.249, loss_mean=0.287][A
+Train step of epoch 0:  92%|█████████▏| 5903/6434 [13:51:05<1:13:03,  8.25s/it, gpt_loss=0.315, loss_mean=0.29] [A
+Train step of epoch 0:  92%|█████████▏| 5904/6434 [13:51:05<1:11:16,  8.07s/it, gpt_loss=0.315, loss_mean=0.29][A
+Train step of epoch 0:  92%|█████████▏| 5904/6434 [13:51:13<1:11:16,  8.07s/it, gpt_loss=0.258, loss_mean=0.287][A
+Train step of epoch 0:  92%|█████████▏| 5905/6434 [13:51:13<1:10:39,  8.01s/it, gpt_loss=0.258, loss_mean=0.287][A
+Train step of epoch 0:  92%|█████████▏| 5905/6434 [13:51:21<1:10:39,  8.01s/it, gpt_loss=0.304, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5906/6434 [13:51:21<1:11:23,  8.11s/it, gpt_loss=0.304, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5906/6434 [13:51:30<1:11:23,  8.11s/it, gpt_loss=0.2, loss_mean=0.28]   [A
+Train step of epoch 0:  92%|█████████▏| 5907/6434 [13:51:30<1:11:14,  8.11s/it, gpt_loss=0.2, loss_mean=0.28][A
+Train step of epoch 0:  92%|█████████▏| 5907/6434 [13:51:37<1:11:14,  8.11s/it, gpt_loss=0.25, loss_mean=0.277][A
+Train step of epoch 0:  92%|█████████▏| 5908/6434 [13:51:37<1:10:24,  8.03s/it, gpt_loss=0.25, loss_mean=0.277][A
+Train step of epoch 0:  92%|█████████▏| 5908/6434 [13:51:45<1:10:24,  8.03s/it, gpt_loss=0.271, loss_mean=0.276][A
+Train step of epoch 0:  92%|█████████▏| 5909/6434 [13:51:45<1:09:02,  7.89s/it, gpt_loss=0.271, loss_mean=0.276][A
+[LID Router Debug] Step: 5910
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [5, 2, 5, 2, 3, 9, 5, 3, 9, 2]
+Active Experts in Batch: {9, 2, 3, 5}
+
+Train step of epoch 0:  92%|█████████▏| 5909/6434 [13:51:53<1:09:02,  7.89s/it, gpt_loss=0.247, loss_mean=0.273][A
+Train step of epoch 0:  92%|█████████▏| 5910/6434 [13:51:53<1:10:02,  8.02s/it, gpt_loss=0.247, loss_mean=0.273][A
+Train step of epoch 0:  92%|█████████▏| 5910/6434 [13:52:01<1:10:02,  8.02s/it, gpt_loss=0.223, loss_mean=0.268][A
+Train step of epoch 0:  92%|█████████▏| 5911/6434 [13:52:01<1:10:11,  8.05s/it, gpt_loss=0.223, loss_mean=0.268][A
+Train step of epoch 0:  92%|█████████▏| 5911/6434 [13:52:09<1:10:11,  8.05s/it, gpt_loss=0.286, loss_mean=0.27] [A
+Train step of epoch 0:  92%|█████████▏| 5912/6434 [13:52:09<1:09:52,  8.03s/it, gpt_loss=0.286, loss_mean=0.27][A
+Train step of epoch 0:  92%|█████████▏| 5912/6434 [13:52:17<1:09:52,  8.03s/it, gpt_loss=0.349, loss_mean=0.278][A
+Train step of epoch 0:  92%|█████████▏| 5913/6434 [13:52:17<1:08:41,  7.91s/it, gpt_loss=0.349, loss_mean=0.278][A
+Train step of epoch 0:  92%|█████████▏| 5913/6434 [13:52:25<1:08:41,  7.91s/it, gpt_loss=0.266, loss_mean=0.277][A
+Train step of epoch 0:  92%|█████████▏| 5914/6434 [13:52:25<1:09:26,  8.01s/it, gpt_loss=0.266, loss_mean=0.277][A
+Train step of epoch 0:  92%|█████████▏| 5914/6434 [13:52:33<1:09:26,  8.01s/it, gpt_loss=0.38, loss_mean=0.287] [A
+Train step of epoch 0:  92%|█████████▏| 5915/6434 [13:52:33<1:07:27,  7.80s/it, gpt_loss=0.38, loss_mean=0.287][A
+Train step of epoch 0:  92%|█████████▏| 5915/6434 [13:52:40<1:07:27,  7.80s/it, gpt_loss=0.292, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5916/6434 [13:52:40<1:05:25,  7.58s/it, gpt_loss=0.292, loss_mean=0.288][A
+Train step of epoch 0:  92%|█████████▏| 5916/6434 [13:52:48<1:05:25,  7.58s/it, gpt_loss=0.385, loss_mean=0.297][A
+Train step of epoch 0:  92%|█████████▏| 5917/6434 [13:52:48<1:08:31,  7.95s/it, gpt_loss=0.385, loss_mean=0.297][A
+Train step of epoch 0:  92%|█████████▏| 5917/6434 [13:52:57<1:08:31,  7.95s/it, gpt_loss=0.258, loss_mean=0.293][A
+Train step of epoch 0:  92%|█████████▏| 5918/6434 [13:52:57<1:09:36,  8.09s/it, gpt_loss=0.258, loss_mean=0.293][A
+Train step of epoch 0:  92%|█████████▏| 5918/6434 [13:53:05<1:09:36,  8.09s/it, gpt_loss=0.319, loss_mean=0.296][A
+Train step of epoch 0:  92%|█████████▏| 5919/6434 [13:53:05<1:09:25,  8.09s/it, gpt_loss=0.319, loss_mean=0.296][A
+[LID Router Debug] Step: 5920
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [4, 5, 3, 4, 9, 1, 9, 5, 3, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  92%|█████████▏| 5919/6434 [13:53:13<1:09:25,  8.09s/it, gpt_loss=0.296, loss_mean=0.296][A
+Train step of epoch 0:  92%|█████████▏| 5920/6434 [13:53:13<1:09:10,  8.08s/it, gpt_loss=0.296, loss_mean=0.296][A
+Train step of epoch 0:  92%|█████████▏| 5920/6434 [13:53:22<1:09:10,  8.08s/it, gpt_loss=0.305, loss_mean=0.297][A
+Train step of epoch 0:  92%|█████████▏| 5921/6434 [13:53:22<1:10:36,  8.26s/it, gpt_loss=0.305, loss_mean=0.297][A
+Train step of epoch 0:  92%|█████████▏| 5921/6434 [13:53:30<1:10:36,  8.26s/it, gpt_loss=0.318, loss_mean=0.299][A
+Train step of epoch 0:  92%|█████████▏| 5922/6434 [13:53:30<1:11:35,  8.39s/it, gpt_loss=0.318, loss_mean=0.299][A
+Train step of epoch 0:  92%|█████████▏| 5922/6434 [13:53:38<1:11:35,  8.39s/it, gpt_loss=0.481, loss_mean=0.317][A
+Train step of epoch 0:  92%|█████████▏| 5923/6434 [13:53:38<1:10:30,  8.28s/it, gpt_loss=0.481, loss_mean=0.317][A
+Train step of epoch 0:  92%|█████████▏| 5923/6434 [13:53:47<1:10:30,  8.28s/it, gpt_loss=0.309, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5924/6434 [13:53:47<1:10:06,  8.25s/it, gpt_loss=0.309, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5924/6434 [13:53:55<1:10:06,  8.25s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  92%|█████████▏| 5925/6434 [13:53:55<1:11:03,  8.38s/it, gpt_loss=0.319, loss_mean=0.317][A
+Train step of epoch 0:  92%|█████████▏| 5925/6434 [13:54:03<1:11:03,  8.38s/it, gpt_loss=0.309, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5926/6434 [13:54:03<1:09:37,  8.22s/it, gpt_loss=0.309, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5926/6434 [13:54:11<1:09:37,  8.22s/it, gpt_loss=0.285, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5927/6434 [13:54:11<1:08:23,  8.09s/it, gpt_loss=0.285, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5927/6434 [13:54:18<1:08:23,  8.09s/it, gpt_loss=0.373, loss_mean=0.319][A
+Train step of epoch 0:  92%|█████████▏| 5928/6434 [13:54:18<1:06:46,  7.92s/it, gpt_loss=0.373, loss_mean=0.319][A
+Train step of epoch 0:  92%|█████████▏| 5928/6434 [13:54:26<1:06:46,  7.92s/it, gpt_loss=0.266, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5929/6434 [13:54:26<1:06:07,  7.86s/it, gpt_loss=0.266, loss_mean=0.313][A
+[LID Router Debug] Step: 5930
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [9, 5, 9, 5, 1, 5, 9, 9, 5, 1]
+Active Experts in Batch: {9, 5, 1}
+
+Train step of epoch 0:  92%|█████████▏| 5929/6434 [13:54:36<1:06:07,  7.86s/it, gpt_loss=0.35, loss_mean=0.317] [A
+Train step of epoch 0:  92%|█████████▏| 5930/6434 [13:54:36<1:10:08,  8.35s/it, gpt_loss=0.35, loss_mean=0.317][A
+Train step of epoch 0:  92%|█████████▏| 5930/6434 [13:54:45<1:10:08,  8.35s/it, gpt_loss=0.364, loss_mean=0.322][A
+Train step of epoch 0:  92%|█████████▏| 5931/6434 [13:54:45<1:11:51,  8.57s/it, gpt_loss=0.364, loss_mean=0.322][A
+Train step of epoch 0:  92%|█████████▏| 5931/6434 [13:54:53<1:11:51,  8.57s/it, gpt_loss=0.312, loss_mean=0.321][A
+Train step of epoch 0:  92%|█████████▏| 5932/6434 [13:54:53<1:10:56,  8.48s/it, gpt_loss=0.312, loss_mean=0.321][A
+Train step of epoch 0:  92%|█████████▏| 5932/6434 [13:55:03<1:10:56,  8.48s/it, gpt_loss=0.275, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5933/6434 [13:55:03<1:15:26,  9.04s/it, gpt_loss=0.275, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5933/6434 [13:55:12<1:15:26,  9.04s/it, gpt_loss=0.237, loss_mean=0.308][A
+Train step of epoch 0:  92%|█████████▏| 5934/6434 [13:55:12<1:14:08,  8.90s/it, gpt_loss=0.237, loss_mean=0.308][A
+Train step of epoch 0:  92%|█████████▏| 5934/6434 [13:55:20<1:14:08,  8.90s/it, gpt_loss=0.269, loss_mean=0.304][A
+Train step of epoch 0:  92%|█████████▏| 5935/6434 [13:55:20<1:10:51,  8.52s/it, gpt_loss=0.269, loss_mean=0.304][A
+Train step of epoch 0:  92%|█████████▏| 5935/6434 [13:55:27<1:10:51,  8.52s/it, gpt_loss=0.261, loss_mean=0.3]  [A
+Train step of epoch 0:  92%|█████████▏| 5936/6434 [13:55:27<1:09:18,  8.35s/it, gpt_loss=0.261, loss_mean=0.3][A
+Train step of epoch 0:  92%|█████████▏| 5936/6434 [13:55:35<1:09:18,  8.35s/it, gpt_loss=0.255, loss_mean=0.295][A
+Train step of epoch 0:  92%|█████████▏| 5937/6434 [13:55:35<1:07:22,  8.13s/it, gpt_loss=0.255, loss_mean=0.295][A
+Train step of epoch 0:  92%|█████████▏| 5937/6434 [13:55:44<1:07:22,  8.13s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0:  92%|█████████▏| 5938/6434 [13:55:44<1:09:45,  8.44s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0:  92%|█████████▏| 5938/6434 [13:55:52<1:09:45,  8.44s/it, gpt_loss=0.331, loss_mean=0.3]  [A
+Train step of epoch 0:  92%|█████████▏| 5939/6434 [13:55:52<1:09:06,  8.38s/it, gpt_loss=0.331, loss_mean=0.3][A
+[LID Router Debug] Step: 5940
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [1, 9, 0, 3, 3, 2, 1, 2, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+Train step of epoch 0:  92%|█████████▏| 5939/6434 [13:56:00<1:09:06,  8.38s/it, gpt_loss=0.358, loss_mean=0.305][A
+Train step of epoch 0:  92%|█████████▏| 5940/6434 [13:56:00<1:06:49,  8.12s/it, gpt_loss=0.358, loss_mean=0.305][A
+Train step of epoch 0:  92%|█████████▏| 5940/6434 [13:56:08<1:06:49,  8.12s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  92%|█████████▏| 5941/6434 [13:56:08<1:06:39,  8.11s/it, gpt_loss=0.323, loss_mean=0.307][A
+Train step of epoch 0:  92%|█████████▏| 5941/6434 [13:56:17<1:06:39,  8.11s/it, gpt_loss=0.364, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5942/6434 [13:56:17<1:08:21,  8.34s/it, gpt_loss=0.364, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5942/6434 [13:56:25<1:08:21,  8.34s/it, gpt_loss=0.401, loss_mean=0.322][A
+Train step of epoch 0:  92%|█████████▏| 5943/6434 [13:56:25<1:07:34,  8.26s/it, gpt_loss=0.401, loss_mean=0.322][A
+Train step of epoch 0:  92%|█████████▏| 5943/6434 [13:56:36<1:07:34,  8.26s/it, gpt_loss=0.261, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5944/6434 [13:56:36<1:13:26,  8.99s/it, gpt_loss=0.261, loss_mean=0.316][A
+Train step of epoch 0:  92%|█████████▏| 5944/6434 [13:56:46<1:13:26,  8.99s/it, gpt_loss=0.293, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5945/6434 [13:56:46<1:15:58,  9.32s/it, gpt_loss=0.293, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5945/6434 [13:56:54<1:15:58,  9.32s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5946/6434 [13:56:54<1:14:07,  9.11s/it, gpt_loss=0.314, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5946/6434 [13:57:03<1:14:07,  9.11s/it, gpt_loss=0.374, loss_mean=0.32] [A
+Train step of epoch 0:  92%|█████████▏| 5947/6434 [13:57:03<1:12:32,  8.94s/it, gpt_loss=0.374, loss_mean=0.32][A
+Train step of epoch 0:  92%|█████████▏| 5947/6434 [13:57:12<1:12:32,  8.94s/it, gpt_loss=0.272, loss_mean=0.315][A
+Train step of epoch 0:  92%|█████████▏| 5948/6434 [13:57:12<1:11:52,  8.87s/it, gpt_loss=0.272, loss_mean=0.315][A
+Train step of epoch 0:  92%|█████████▏| 5948/6434 [13:57:20<1:11:52,  8.87s/it, gpt_loss=0.293, loss_mean=0.313][A
+Train step of epoch 0:  92%|█████████▏| 5949/6434 [13:57:20<1:11:23,  8.83s/it, gpt_loss=0.293, loss_mean=0.313][A
+[LID Router Debug] Step: 5950
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [6, 6, 2, 2, 0, 9, 4, 5, 9, 0]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  92%|█████████▏| 5949/6434 [13:57:29<1:11:23,  8.83s/it, gpt_loss=0.217, loss_mean=0.303][A
+Train step of epoch 0:  92%|█████████▏| 5950/6434 [13:57:29<1:09:26,  8.61s/it, gpt_loss=0.217, loss_mean=0.303][A
+Train step of epoch 0:  92%|█████████▏| 5950/6434 [13:57:37<1:09:26,  8.61s/it, gpt_loss=0.341, loss_mean=0.307][A
+Train step of epoch 0:  92%|█████████▏| 5951/6434 [13:57:37<1:08:43,  8.54s/it, gpt_loss=0.341, loss_mean=0.307][A
+Train step of epoch 0:  92%|█████████▏| 5951/6434 [13:57:46<1:08:43,  8.54s/it, gpt_loss=0.262, loss_mean=0.302][A
+Train step of epoch 0:  93%|█████████▎| 5952/6434 [13:57:46<1:09:07,  8.60s/it, gpt_loss=0.262, loss_mean=0.302][A
+Train step of epoch 0:  93%|█████████▎| 5952/6434 [13:57:53<1:09:07,  8.60s/it, gpt_loss=0.255, loss_mean=0.298][A
+Train step of epoch 0:  93%|█████████▎| 5953/6434 [13:57:53<1:07:07,  8.37s/it, gpt_loss=0.255, loss_mean=0.298][A
+Train step of epoch 0:  93%|█████████▎| 5953/6434 [13:58:02<1:07:07,  8.37s/it, gpt_loss=0.307, loss_mean=0.299][A
+Train step of epoch 0:  93%|█████████▎| 5954/6434 [13:58:02<1:06:10,  8.27s/it, gpt_loss=0.307, loss_mean=0.299][A
+Train step of epoch 0:  93%|█████████▎| 5954/6434 [13:58:10<1:06:10,  8.27s/it, gpt_loss=0.226, loss_mean=0.291][A
+Train step of epoch 0:  93%|█████████▎| 5955/6434 [13:58:10<1:06:29,  8.33s/it, gpt_loss=0.226, loss_mean=0.291][A
+Train step of epoch 0:  93%|█████████▎| 5955/6434 [13:58:18<1:06:29,  8.33s/it, gpt_loss=0.277, loss_mean=0.29] [A
+Train step of epoch 0:  93%|█████████▎| 5956/6434 [13:58:18<1:05:34,  8.23s/it, gpt_loss=0.277, loss_mean=0.29][A
+Train step of epoch 0:  93%|█████████▎| 5956/6434 [13:58:26<1:05:34,  8.23s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  93%|█████████▎| 5957/6434 [13:58:26<1:05:17,  8.21s/it, gpt_loss=0.305, loss_mean=0.292][A
+Train step of epoch 0:  93%|█████████▎| 5957/6434 [13:58:34<1:05:17,  8.21s/it, gpt_loss=0.291, loss_mean=0.291][A
+Train step of epoch 0:  93%|█████████▎| 5958/6434 [13:58:34<1:04:56,  8.19s/it, gpt_loss=0.291, loss_mean=0.291][A
+Train step of epoch 0:  93%|█████████▎| 5958/6434 [13:58:43<1:04:56,  8.19s/it, gpt_loss=0.288, loss_mean=0.291][A
+Train step of epoch 0:  93%|█████████▎| 5959/6434 [13:58:43<1:04:59,  8.21s/it, gpt_loss=0.288, loss_mean=0.291][A
+[LID Router Debug] Step: 5960
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [1, 4, 2, 4, 3, 2, 3, 3, 2, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+Train step of epoch 0:  93%|█████████▎| 5959/6434 [13:58:52<1:04:59,  8.21s/it, gpt_loss=0.296, loss_mean=0.292][A
+Train step of epoch 0:  93%|█████████▎| 5960/6434 [13:58:52<1:07:48,  8.58s/it, gpt_loss=0.296, loss_mean=0.292][A
+Train step of epoch 0:  93%|█████████▎| 5960/6434 [13:59:00<1:07:48,  8.58s/it, gpt_loss=0.278, loss_mean=0.29] [A
+Train step of epoch 0:  93%|█████████▎| 5961/6434 [13:59:00<1:06:40,  8.46s/it, gpt_loss=0.278, loss_mean=0.29][A
+Train step of epoch 0:  93%|█████████▎| 5961/6434 [13:59:09<1:06:40,  8.46s/it, gpt_loss=0.277, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5962/6434 [13:59:09<1:06:25,  8.44s/it, gpt_loss=0.277, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5962/6434 [13:59:17<1:06:25,  8.44s/it, gpt_loss=0.3, loss_mean=0.29]   [A
+Train step of epoch 0:  93%|█████████▎| 5963/6434 [13:59:17<1:07:02,  8.54s/it, gpt_loss=0.3, loss_mean=0.29][A
+Train step of epoch 0:  93%|█████████▎| 5963/6434 [13:59:26<1:07:02,  8.54s/it, gpt_loss=0.283, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5964/6434 [13:59:26<1:06:40,  8.51s/it, gpt_loss=0.283, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5964/6434 [13:59:35<1:06:40,  8.51s/it, gpt_loss=0.201, loss_mean=0.281][A
+Train step of epoch 0:  93%|█████████▎| 5965/6434 [13:59:35<1:08:05,  8.71s/it, gpt_loss=0.201, loss_mean=0.281][A
+Train step of epoch 0:  93%|█████████▎| 5965/6434 [13:59:43<1:08:05,  8.71s/it, gpt_loss=0.28, loss_mean=0.281] [A
+Train step of epoch 0:  93%|█████████▎| 5966/6434 [13:59:43<1:06:10,  8.48s/it, gpt_loss=0.28, loss_mean=0.281][A
+Train step of epoch 0:  93%|█████████▎| 5966/6434 [13:59:51<1:06:10,  8.48s/it, gpt_loss=0.348, loss_mean=0.287][A
+Train step of epoch 0:  93%|█████████▎| 5967/6434 [13:59:51<1:04:47,  8.32s/it, gpt_loss=0.348, loss_mean=0.287][A
+Train step of epoch 0:  93%|█████████▎| 5967/6434 [13:59:58<1:04:47,  8.32s/it, gpt_loss=0.245, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5968/6434 [13:59:58<1:02:01,  7.99s/it, gpt_loss=0.245, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5968/6434 [14:00:06<1:02:01,  7.99s/it, gpt_loss=0.276, loss_mean=0.282][A
+Train step of epoch 0:  93%|█████████▎| 5969/6434 [14:00:06<1:02:26,  8.06s/it, gpt_loss=0.276, loss_mean=0.282][A
+[LID Router Debug] Step: 5970
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [9, 1, 1, 3, 3, 6, 1, 5, 4, 9]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  93%|█████████▎| 5969/6434 [14:00:15<1:02:26,  8.06s/it, gpt_loss=0.284, loss_mean=0.282][A
+Train step of epoch 0:  93%|█████████▎| 5970/6434 [14:00:15<1:04:46,  8.38s/it, gpt_loss=0.284, loss_mean=0.282][A
+Train step of epoch 0:  93%|█████████▎| 5970/6434 [14:00:24<1:04:46,  8.38s/it, gpt_loss=0.357, loss_mean=0.29] [A
+Train step of epoch 0:  93%|█████████▎| 5971/6434 [14:00:24<1:05:39,  8.51s/it, gpt_loss=0.357, loss_mean=0.29][A
+Train step of epoch 0:  93%|█████████▎| 5971/6434 [14:00:34<1:05:39,  8.51s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  93%|█████████▎| 5972/6434 [14:00:34<1:08:23,  8.88s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  93%|█████████▎| 5972/6434 [14:00:42<1:08:23,  8.88s/it, gpt_loss=0.278, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5973/6434 [14:00:42<1:06:28,  8.65s/it, gpt_loss=0.278, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5973/6434 [14:00:50<1:06:28,  8.65s/it, gpt_loss=0.249, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5974/6434 [14:00:50<1:04:23,  8.40s/it, gpt_loss=0.249, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5974/6434 [14:00:58<1:04:23,  8.40s/it, gpt_loss=0.283, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5975/6434 [14:00:59<1:04:41,  8.46s/it, gpt_loss=0.283, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5975/6434 [14:01:07<1:04:41,  8.46s/it, gpt_loss=0.276, loss_mean=0.282][A
+Train step of epoch 0:  93%|█████████▎| 5976/6434 [14:01:07<1:03:31,  8.32s/it, gpt_loss=0.276, loss_mean=0.282][A
+Train step of epoch 0:  93%|█████████▎| 5976/6434 [14:01:16<1:03:31,  8.32s/it, gpt_loss=0.36, loss_mean=0.29]  [A
+Train step of epoch 0:  93%|█████████▎| 5977/6434 [14:01:16<1:05:15,  8.57s/it, gpt_loss=0.36, loss_mean=0.29][A
+Train step of epoch 0:  93%|█████████▎| 5977/6434 [14:01:24<1:05:15,  8.57s/it, gpt_loss=0.24, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 5978/6434 [14:01:24<1:04:27,  8.48s/it, gpt_loss=0.24, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 5978/6434 [14:01:33<1:04:27,  8.48s/it, gpt_loss=0.28, loss_mean=0.284][A
+Train step of epoch 0:  93%|█████████▎| 5979/6434 [14:01:33<1:05:47,  8.68s/it, gpt_loss=0.28, loss_mean=0.284][A
+[LID Router Debug] Step: 5980
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [0, 4, 4, 2, 4, 1, 3, 5, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  93%|█████████▎| 5979/6434 [14:01:41<1:05:47,  8.68s/it, gpt_loss=0.373, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 5980/6434 [14:01:41<1:04:55,  8.58s/it, gpt_loss=0.373, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 5980/6434 [14:01:49<1:04:55,  8.58s/it, gpt_loss=0.256, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5981/6434 [14:01:49<1:01:41,  8.17s/it, gpt_loss=0.256, loss_mean=0.289][A
+Train step of epoch 0:  93%|█████████▎| 5981/6434 [14:01:56<1:01:41,  8.17s/it, gpt_loss=0.419, loss_mean=0.302][A
+Train step of epoch 0:  93%|█████████▎| 5982/6434 [14:01:56<1:00:47,  8.07s/it, gpt_loss=0.419, loss_mean=0.302][A
+Train step of epoch 0:  93%|█████████▎| 5982/6434 [14:02:06<1:00:47,  8.07s/it, gpt_loss=0.272, loss_mean=0.299][A
+Train step of epoch 0:  93%|█████████▎| 5983/6434 [14:02:06<1:04:10,  8.54s/it, gpt_loss=0.272, loss_mean=0.299][A
+Train step of epoch 0:  93%|█████████▎| 5983/6434 [14:02:14<1:04:10,  8.54s/it, gpt_loss=0.26, loss_mean=0.295] [A
+Train step of epoch 0:  93%|█████████▎| 5984/6434 [14:02:14<1:02:54,  8.39s/it, gpt_loss=0.26, loss_mean=0.295][A
+Train step of epoch 0:  93%|█████████▎| 5984/6434 [14:02:23<1:02:54,  8.39s/it, gpt_loss=0.324, loss_mean=0.298][A
+Train step of epoch 0:  93%|█████████▎| 5985/6434 [14:02:23<1:03:18,  8.46s/it, gpt_loss=0.324, loss_mean=0.298][A
+Train step of epoch 0:  93%|█████████▎| 5985/6434 [14:02:32<1:03:18,  8.46s/it, gpt_loss=0.34, loss_mean=0.302] [A
+Train step of epoch 0:  93%|█████████▎| 5986/6434 [14:02:32<1:04:58,  8.70s/it, gpt_loss=0.34, loss_mean=0.302][A
+Train step of epoch 0:  93%|█████████▎| 5986/6434 [14:02:41<1:04:58,  8.70s/it, gpt_loss=0.238, loss_mean=0.296][A
+Train step of epoch 0:  93%|█████████▎| 5987/6434 [14:02:41<1:04:31,  8.66s/it, gpt_loss=0.238, loss_mean=0.296][A
+Train step of epoch 0:  93%|█████████▎| 5987/6434 [14:02:49<1:04:31,  8.66s/it, gpt_loss=0.266, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 5988/6434 [14:02:49<1:04:37,  8.70s/it, gpt_loss=0.266, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 5988/6434 [14:02:59<1:04:37,  8.70s/it, gpt_loss=0.222, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5989/6434 [14:02:59<1:05:47,  8.87s/it, gpt_loss=0.222, loss_mean=0.286][A
+[LID Router Debug] Step: 5990
+Batch Size: 10
+Audio Batch Size: 146
+LID Assignments: [3, 2, 5, 3, 2, 5, 9, 3, 5, 2]
+Active Experts in Batch: {9, 2, 3, 5}
+
+Train step of epoch 0:  93%|█████████▎| 5989/6434 [14:03:08<1:05:47,  8.87s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5990/6434 [14:03:08<1:06:21,  8.97s/it, gpt_loss=0.284, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5990/6434 [14:03:17<1:06:21,  8.97s/it, gpt_loss=0.232, loss_mean=0.28] [A
+Train step of epoch 0:  93%|█████████▎| 5991/6434 [14:03:17<1:06:18,  8.98s/it, gpt_loss=0.232, loss_mean=0.28][A
+Train step of epoch 0:  93%|█████████▎| 5991/6434 [14:03:26<1:06:18,  8.98s/it, gpt_loss=0.253, loss_mean=0.278][A
+Train step of epoch 0:  93%|█████████▎| 5992/6434 [14:03:26<1:06:05,  8.97s/it, gpt_loss=0.253, loss_mean=0.278][A
+Train step of epoch 0:  93%|█████████▎| 5992/6434 [14:03:33<1:06:05,  8.97s/it, gpt_loss=0.359, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5993/6434 [14:03:33<1:02:46,  8.54s/it, gpt_loss=0.359, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5993/6434 [14:03:42<1:02:46,  8.54s/it, gpt_loss=0.288, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5994/6434 [14:03:42<1:03:20,  8.64s/it, gpt_loss=0.288, loss_mean=0.286][A
+Train step of epoch 0:  93%|█████████▎| 5994/6434 [14:03:50<1:03:20,  8.64s/it, gpt_loss=0.261, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5995/6434 [14:03:50<1:00:33,  8.28s/it, gpt_loss=0.261, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 5995/6434 [14:03:58<1:00:33,  8.28s/it, gpt_loss=0.296, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 5996/6434 [14:03:58<1:01:20,  8.40s/it, gpt_loss=0.296, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 5996/6434 [14:04:07<1:01:20,  8.40s/it, gpt_loss=0.22, loss_mean=0.278] [A
+Train step of epoch 0:  93%|█████████▎| 5997/6434 [14:04:07<1:02:33,  8.59s/it, gpt_loss=0.22, loss_mean=0.278][A
+Train step of epoch 0:  93%|█████████▎| 5997/6434 [14:04:15<1:02:33,  8.59s/it, gpt_loss=0.228, loss_mean=0.273][A
+Train step of epoch 0:  93%|█████████▎| 5998/6434 [14:04:15<1:01:24,  8.45s/it, gpt_loss=0.228, loss_mean=0.273][A
+Train step of epoch 0:  93%|█████████▎| 5998/6434 [14:04:23<1:01:24,  8.45s/it, gpt_loss=0.244, loss_mean=0.27] [A
+Train step of epoch 0:  93%|█████████▎| 5999/6434 [14:04:23<58:44,  8.10s/it, gpt_loss=0.244, loss_mean=0.27]  [A
+[LID Router Debug] Step: 6000
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [2, 6, 1, 6, 9, 5, 5, 5, 2, 1]
+Active Experts in Batch: {1, 2, 5, 6, 9}
+[2026-02-07 06:00:35,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=0, lr=[1.5779527414110363e-05, 1.5779527414110363e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 06:00:35,635] [INFO] [timer.py:260:stop] epoch=0/micro_step=6000/global_step=3000, RunningAvgSamplesPerSec=4.746327525948466, CurrSamplesPerSec=5.11282826288266, MemAllocated=12.64GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  93%|█████████▎| 5999/6434 [14:04:31<58:44,  8.10s/it, gpt_loss=0.263, loss_mean=0.27][A
+Train step of epoch 0:  93%|█████████▎| 6000/6434 [14:04:31<59:13,  8.19s/it, gpt_loss=0.263, loss_mean=0.27][A
+Train step of epoch 0:  93%|█████████▎| 6000/6434 [14:04:40<59:13,  8.19s/it, gpt_loss=0.262, loss_mean=0.269][A
+Train step of epoch 0:  93%|█████████▎| 6001/6434 [14:04:40<1:00:23,  8.37s/it, gpt_loss=0.262, loss_mean=0.269][A
+Train step of epoch 0:  93%|█████████▎| 6001/6434 [14:04:49<1:00:23,  8.37s/it, gpt_loss=0.306, loss_mean=0.273][A
+Train step of epoch 0:  93%|█████████▎| 6002/6434 [14:04:49<1:01:08,  8.49s/it, gpt_loss=0.306, loss_mean=0.273][A
+Train step of epoch 0:  93%|█████████▎| 6002/6434 [14:04:56<1:01:08,  8.49s/it, gpt_loss=0.258, loss_mean=0.271][A
+Train step of epoch 0:  93%|█████████▎| 6003/6434 [14:04:56<59:19,  8.26s/it, gpt_loss=0.258, loss_mean=0.271]  [A
+Train step of epoch 0:  93%|█████████▎| 6003/6434 [14:05:04<59:19,  8.26s/it, gpt_loss=0.261, loss_mean=0.27] [A
+Train step of epoch 0:  93%|█████████▎| 6004/6434 [14:05:04<57:14,  7.99s/it, gpt_loss=0.261, loss_mean=0.27][A
+Train step of epoch 0:  93%|█████████▎| 6004/6434 [14:05:13<57:14,  7.99s/it, gpt_loss=0.33, loss_mean=0.276][A
+Train step of epoch 0:  93%|█████████▎| 6005/6434 [14:05:13<1:00:04,  8.40s/it, gpt_loss=0.33, loss_mean=0.276][A
+Train step of epoch 0:  93%|█████████▎| 6005/6434 [14:05:22<1:00:04,  8.40s/it, gpt_loss=0.196, loss_mean=0.268][A
+Train step of epoch 0:  93%|█████████▎| 6006/6434 [14:05:22<1:00:26,  8.47s/it, gpt_loss=0.196, loss_mean=0.268][A
+Train step of epoch 0:  93%|█████████▎| 6006/6434 [14:05:30<1:00:26,  8.47s/it, gpt_loss=0.305, loss_mean=0.272][A
+Train step of epoch 0:  93%|█████████▎| 6007/6434 [14:05:30<59:21,  8.34s/it, gpt_loss=0.305, loss_mean=0.272]  [A
+Train step of epoch 0:  93%|█████████▎| 6007/6434 [14:05:39<59:21,  8.34s/it, gpt_loss=0.316, loss_mean=0.276][A
+Train step of epoch 0:  93%|█████████▎| 6008/6434 [14:05:39<1:00:22,  8.50s/it, gpt_loss=0.316, loss_mean=0.276][A
+Train step of epoch 0:  93%|█████████▎| 6008/6434 [14:05:46<1:00:22,  8.50s/it, gpt_loss=0.22, loss_mean=0.271] [A
+Train step of epoch 0:  93%|█████████▎| 6009/6434 [14:05:46<58:06,  8.20s/it, gpt_loss=0.22, loss_mean=0.271]  [A
+[LID Router Debug] Step: 6010
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [3, 9, 9, 5, 6, 3, 2, 4, 1, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  93%|█████████▎| 6009/6434 [14:05:55<58:06,  8.20s/it, gpt_loss=0.248, loss_mean=0.268][A
+Train step of epoch 0:  93%|█████████▎| 6010/6434 [14:05:55<59:31,  8.42s/it, gpt_loss=0.248, loss_mean=0.268][A
+Train step of epoch 0:  93%|█████████▎| 6010/6434 [14:06:04<59:31,  8.42s/it, gpt_loss=0.288, loss_mean=0.27] [A
+Train step of epoch 0:  93%|█████████▎| 6011/6434 [14:06:04<59:16,  8.41s/it, gpt_loss=0.288, loss_mean=0.27][A
+Train step of epoch 0:  93%|█████████▎| 6011/6434 [14:06:12<59:16,  8.41s/it, gpt_loss=0.402, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 6012/6434 [14:06:12<59:08,  8.41s/it, gpt_loss=0.402, loss_mean=0.283][A
+Train step of epoch 0:  93%|█████████▎| 6012/6434 [14:06:20<59:08,  8.41s/it, gpt_loss=0.416, loss_mean=0.297][A
+Train step of epoch 0:  93%|█████████▎| 6013/6434 [14:06:20<58:43,  8.37s/it, gpt_loss=0.416, loss_mean=0.297][A
+Train step of epoch 0:  93%|█████████▎| 6013/6434 [14:06:29<58:43,  8.37s/it, gpt_loss=0.262, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 6014/6434 [14:06:29<1:00:07,  8.59s/it, gpt_loss=0.262, loss_mean=0.293][A
+Train step of epoch 0:  93%|█████████▎| 6014/6434 [14:06:39<1:00:07,  8.59s/it, gpt_loss=0.211, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 6015/6434 [14:06:39<1:01:38,  8.83s/it, gpt_loss=0.211, loss_mean=0.285][A
+Train step of epoch 0:  93%|█████████▎| 6015/6434 [14:06:46<1:01:38,  8.83s/it, gpt_loss=0.376, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▎| 6016/6434 [14:06:46<59:14,  8.50s/it, gpt_loss=0.376, loss_mean=0.294]  [A
+Train step of epoch 0:  94%|█████████▎| 6016/6434 [14:06:54<59:14,  8.50s/it, gpt_loss=0.291, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▎| 6017/6434 [14:06:54<58:07,  8.36s/it, gpt_loss=0.291, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▎| 6017/6434 [14:07:02<58:07,  8.36s/it, gpt_loss=0.32, loss_mean=0.296] [A
+Train step of epoch 0:  94%|█████████▎| 6018/6434 [14:07:02<57:14,  8.25s/it, gpt_loss=0.32, loss_mean=0.296][A
+Train step of epoch 0:  94%|█████████▎| 6018/6434 [14:07:12<57:14,  8.25s/it, gpt_loss=0.305, loss_mean=0.297][A
+Train step of epoch 0:  94%|█████████▎| 6019/6434 [14:07:12<59:34,  8.61s/it, gpt_loss=0.305, loss_mean=0.297][A
+[LID Router Debug] Step: 6020
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [2, 3, 3, 4, 5, 1, 1, 9, 9, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  94%|█████████▎| 6019/6434 [14:07:21<59:34,  8.61s/it, gpt_loss=0.275, loss_mean=0.295][A
+Train step of epoch 0:  94%|█████████▎| 6020/6434 [14:07:21<59:46,  8.66s/it, gpt_loss=0.275, loss_mean=0.295][A
+Train step of epoch 0:  94%|█████████▎| 6020/6434 [14:07:29<59:46,  8.66s/it, gpt_loss=0.252, loss_mean=0.291][A
+Train step of epoch 0:  94%|█████████▎| 6021/6434 [14:07:29<58:33,  8.51s/it, gpt_loss=0.252, loss_mean=0.291][A
+Train step of epoch 0:  94%|█████████▎| 6021/6434 [14:07:37<58:33,  8.51s/it, gpt_loss=0.371, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6022/6434 [14:07:37<58:30,  8.52s/it, gpt_loss=0.371, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6022/6434 [14:07:46<58:30,  8.52s/it, gpt_loss=0.305, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6023/6434 [14:07:46<58:34,  8.55s/it, gpt_loss=0.305, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6023/6434 [14:07:55<58:34,  8.55s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  94%|█████████▎| 6024/6434 [14:07:55<58:41,  8.59s/it, gpt_loss=0.368, loss_mean=0.306][A
+Train step of epoch 0:  94%|█████████▎| 6024/6434 [14:08:03<58:41,  8.59s/it, gpt_loss=0.211, loss_mean=0.297][A
+Train step of epoch 0:  94%|█████████▎| 6025/6434 [14:08:03<57:18,  8.41s/it, gpt_loss=0.211, loss_mean=0.297][A
+Train step of epoch 0:  94%|█████████▎| 6025/6434 [14:08:11<57:18,  8.41s/it, gpt_loss=0.32, loss_mean=0.299] [A
+Train step of epoch 0:  94%|█████████▎| 6026/6434 [14:08:11<57:31,  8.46s/it, gpt_loss=0.32, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6026/6434 [14:08:20<57:31,  8.46s/it, gpt_loss=0.306, loss_mean=0.3] [A
+Train step of epoch 0:  94%|█████████▎| 6027/6434 [14:08:20<57:05,  8.42s/it, gpt_loss=0.306, loss_mean=0.3][A
+Train step of epoch 0:  94%|█████████▎| 6027/6434 [14:08:29<57:05,  8.42s/it, gpt_loss=0.288, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6028/6434 [14:08:29<58:13,  8.61s/it, gpt_loss=0.288, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▎| 6028/6434 [14:08:37<58:13,  8.61s/it, gpt_loss=0.226, loss_mean=0.291][A
+Train step of epoch 0:  94%|█████████▎| 6029/6434 [14:08:37<58:15,  8.63s/it, gpt_loss=0.226, loss_mean=0.291][A
+[LID Router Debug] Step: 6030
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [0, 1, 1, 0, 3, 0, 6, 1, 2, 2]
+Active Experts in Batch: {0, 1, 2, 3, 6}
+
+Train step of epoch 0:  94%|█████████▎| 6029/6434 [14:08:47<58:15,  8.63s/it, gpt_loss=0.24, loss_mean=0.286] [A
+Train step of epoch 0:  94%|█████████▎| 6030/6434 [14:08:47<1:00:43,  9.02s/it, gpt_loss=0.24, loss_mean=0.286][A
+Train step of epoch 0:  94%|█████████▎| 6030/6434 [14:08:57<1:00:43,  9.02s/it, gpt_loss=0.298, loss_mean=0.287][A
+Train step of epoch 0:  94%|█████████▎| 6031/6434 [14:08:57<1:01:56,  9.22s/it, gpt_loss=0.298, loss_mean=0.287][A
+Train step of epoch 0:  94%|█████████▎| 6031/6434 [14:09:05<1:01:56,  9.22s/it, gpt_loss=0.301, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6032/6434 [14:09:05<59:16,  8.85s/it, gpt_loss=0.301, loss_mean=0.289]  [A
+Train step of epoch 0:  94%|█████████▍| 6032/6434 [14:09:14<59:16,  8.85s/it, gpt_loss=0.287, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6033/6434 [14:09:14<59:45,  8.94s/it, gpt_loss=0.287, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6033/6434 [14:09:23<59:45,  8.94s/it, gpt_loss=0.247, loss_mean=0.284][A
+Train step of epoch 0:  94%|█████████▍| 6034/6434 [14:09:23<58:38,  8.80s/it, gpt_loss=0.247, loss_mean=0.284][A
+Train step of epoch 0:  94%|█████████▍| 6034/6434 [14:09:32<58:38,  8.80s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  94%|█████████▍| 6035/6434 [14:09:32<58:49,  8.85s/it, gpt_loss=0.271, loss_mean=0.283][A
+Train step of epoch 0:  94%|█████████▍| 6035/6434 [14:09:40<58:49,  8.85s/it, gpt_loss=0.235, loss_mean=0.278][A
+Train step of epoch 0:  94%|█████████▍| 6036/6434 [14:09:40<57:09,  8.62s/it, gpt_loss=0.235, loss_mean=0.278][A
+Train step of epoch 0:  94%|█████████▍| 6036/6434 [14:09:48<57:09,  8.62s/it, gpt_loss=0.205, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6037/6434 [14:09:48<57:27,  8.68s/it, gpt_loss=0.205, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6037/6434 [14:09:57<57:27,  8.68s/it, gpt_loss=0.253, loss_mean=0.269][A
+Train step of epoch 0:  94%|█████████▍| 6038/6434 [14:09:57<57:09,  8.66s/it, gpt_loss=0.253, loss_mean=0.269][A
+Train step of epoch 0:  94%|█████████▍| 6038/6434 [14:10:06<57:09,  8.66s/it, gpt_loss=0.292, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6039/6434 [14:10:06<56:38,  8.60s/it, gpt_loss=0.292, loss_mean=0.271][A
+[LID Router Debug] Step: 6040
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [5, 9, 0, 3, 9, 5, 9, 3, 3, 9]
+Active Experts in Batch: {0, 9, 3, 5}
+
+Train step of epoch 0:  94%|█████████▍| 6039/6434 [14:10:14<56:38,  8.60s/it, gpt_loss=0.28, loss_mean=0.272] [A
+Train step of epoch 0:  94%|█████████▍| 6040/6434 [14:10:14<56:13,  8.56s/it, gpt_loss=0.28, loss_mean=0.272][A
+Train step of epoch 0:  94%|█████████▍| 6040/6434 [14:10:23<56:13,  8.56s/it, gpt_loss=0.259, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6041/6434 [14:10:23<56:46,  8.67s/it, gpt_loss=0.259, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6041/6434 [14:10:31<56:46,  8.67s/it, gpt_loss=0.266, loss_mean=0.27] [A
+Train step of epoch 0:  94%|█████████▍| 6042/6434 [14:10:31<55:22,  8.48s/it, gpt_loss=0.266, loss_mean=0.27][A
+Train step of epoch 0:  94%|█████████▍| 6042/6434 [14:10:40<55:22,  8.48s/it, gpt_loss=0.222, loss_mean=0.266][A
+Train step of epoch 0:  94%|█████████▍| 6043/6434 [14:10:40<56:00,  8.60s/it, gpt_loss=0.222, loss_mean=0.266][A
+Train step of epoch 0:  94%|█████████▍| 6043/6434 [14:10:48<56:00,  8.60s/it, gpt_loss=0.317, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6044/6434 [14:10:48<55:03,  8.47s/it, gpt_loss=0.317, loss_mean=0.271][A
+Train step of epoch 0:  94%|█████████▍| 6044/6434 [14:10:57<55:03,  8.47s/it, gpt_loss=0.326, loss_mean=0.276][A
+Train step of epoch 0:  94%|█████████▍| 6045/6434 [14:10:57<55:25,  8.55s/it, gpt_loss=0.326, loss_mean=0.276][A
+Train step of epoch 0:  94%|█████████▍| 6045/6434 [14:11:05<55:25,  8.55s/it, gpt_loss=0.368, loss_mean=0.285][A
+Train step of epoch 0:  94%|█████████▍| 6046/6434 [14:11:05<54:32,  8.44s/it, gpt_loss=0.368, loss_mean=0.285][A
+Train step of epoch 0:  94%|█████████▍| 6046/6434 [14:11:14<54:32,  8.44s/it, gpt_loss=0.259, loss_mean=0.283][A
+Train step of epoch 0:  94%|█████████▍| 6047/6434 [14:11:14<54:52,  8.51s/it, gpt_loss=0.259, loss_mean=0.283][A
+Train step of epoch 0:  94%|█████████▍| 6047/6434 [14:11:21<54:52,  8.51s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6048/6434 [14:11:21<52:58,  8.23s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6048/6434 [14:11:29<52:58,  8.23s/it, gpt_loss=0.312, loss_mean=0.284][A
+Train step of epoch 0:  94%|█████████▍| 6049/6434 [14:11:29<51:56,  8.09s/it, gpt_loss=0.312, loss_mean=0.284][A
+[LID Router Debug] Step: 6050
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [0, 9, 8, 3, 2, 5, 5, 9, 9, 5]
+Active Experts in Batch: {0, 2, 3, 5, 8, 9}
+
+Train step of epoch 0:  94%|█████████▍| 6049/6434 [14:11:38<51:56,  8.09s/it, gpt_loss=0.318, loss_mean=0.288][A
+Train step of epoch 0:  94%|█████████▍| 6050/6434 [14:11:38<53:48,  8.41s/it, gpt_loss=0.318, loss_mean=0.288][A
+Train step of epoch 0:  94%|█████████▍| 6050/6434 [14:11:47<53:48,  8.41s/it, gpt_loss=0.267, loss_mean=0.286][A
+Train step of epoch 0:  94%|█████████▍| 6051/6434 [14:11:47<55:20,  8.67s/it, gpt_loss=0.267, loss_mean=0.286][A
+Train step of epoch 0:  94%|█████████▍| 6051/6434 [14:11:56<55:20,  8.67s/it, gpt_loss=0.307, loss_mean=0.288][A
+Train step of epoch 0:  94%|█████████▍| 6052/6434 [14:11:56<56:06,  8.81s/it, gpt_loss=0.307, loss_mean=0.288][A
+Train step of epoch 0:  94%|█████████▍| 6052/6434 [14:12:05<56:06,  8.81s/it, gpt_loss=0.233, loss_mean=0.282][A
+Train step of epoch 0:  94%|█████████▍| 6053/6434 [14:12:05<55:22,  8.72s/it, gpt_loss=0.233, loss_mean=0.282][A
+Train step of epoch 0:  94%|█████████▍| 6053/6434 [14:12:13<55:22,  8.72s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6054/6434 [14:12:13<54:23,  8.59s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6054/6434 [14:12:23<54:23,  8.59s/it, gpt_loss=0.269, loss_mean=0.28] [A
+Train step of epoch 0:  94%|█████████▍| 6055/6434 [14:12:23<56:35,  8.96s/it, gpt_loss=0.269, loss_mean=0.28][A
+Train step of epoch 0:  94%|█████████▍| 6055/6434 [14:12:31<56:35,  8.96s/it, gpt_loss=0.253, loss_mean=0.277][A
+Train step of epoch 0:  94%|█████████▍| 6056/6434 [14:12:31<54:41,  8.68s/it, gpt_loss=0.253, loss_mean=0.277][A
+Train step of epoch 0:  94%|█████████▍| 6056/6434 [14:12:38<54:41,  8.68s/it, gpt_loss=0.369, loss_mean=0.286][A
+Train step of epoch 0:  94%|█████████▍| 6057/6434 [14:12:38<51:38,  8.22s/it, gpt_loss=0.369, loss_mean=0.286][A
+Train step of epoch 0:  94%|█████████▍| 6057/6434 [14:12:47<51:38,  8.22s/it, gpt_loss=0.262, loss_mean=0.284][A
+Train step of epoch 0:  94%|█████████▍| 6058/6434 [14:12:47<52:58,  8.45s/it, gpt_loss=0.262, loss_mean=0.284][A
+Train step of epoch 0:  94%|█████████▍| 6058/6434 [14:12:56<52:58,  8.45s/it, gpt_loss=0.271, loss_mean=0.282][A
+Train step of epoch 0:  94%|█████████▍| 6059/6434 [14:12:56<52:34,  8.41s/it, gpt_loss=0.271, loss_mean=0.282][A
+[LID Router Debug] Step: 6060
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [3, 4, 0, 5, 2, 0, 0, 3, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+Train step of epoch 0:  94%|█████████▍| 6059/6434 [14:13:03<52:34,  8.41s/it, gpt_loss=0.254, loss_mean=0.28] [A
+Train step of epoch 0:  94%|█████████▍| 6060/6434 [14:13:03<51:28,  8.26s/it, gpt_loss=0.254, loss_mean=0.28][A
+Train step of epoch 0:  94%|█████████▍| 6060/6434 [14:13:12<51:28,  8.26s/it, gpt_loss=0.293, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6061/6434 [14:13:12<52:31,  8.45s/it, gpt_loss=0.293, loss_mean=0.281][A
+Train step of epoch 0:  94%|█████████▍| 6061/6434 [14:13:20<52:31,  8.45s/it, gpt_loss=0.366, loss_mean=0.29] [A
+Train step of epoch 0:  94%|█████████▍| 6062/6434 [14:13:20<50:33,  8.15s/it, gpt_loss=0.366, loss_mean=0.29][A
+Train step of epoch 0:  94%|█████████▍| 6062/6434 [14:13:28<50:33,  8.15s/it, gpt_loss=0.283, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6063/6434 [14:13:28<50:52,  8.23s/it, gpt_loss=0.283, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6063/6434 [14:13:38<50:52,  8.23s/it, gpt_loss=0.327, loss_mean=0.293][A
+Train step of epoch 0:  94%|█████████▍| 6064/6434 [14:13:38<52:58,  8.59s/it, gpt_loss=0.327, loss_mean=0.293][A
+Train step of epoch 0:  94%|█████████▍| 6064/6434 [14:13:45<52:58,  8.59s/it, gpt_loss=0.355, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▍| 6065/6434 [14:13:45<50:56,  8.28s/it, gpt_loss=0.355, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▍| 6065/6434 [14:13:54<50:56,  8.28s/it, gpt_loss=0.253, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▍| 6066/6434 [14:13:54<50:50,  8.29s/it, gpt_loss=0.253, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▍| 6066/6434 [14:14:02<50:50,  8.29s/it, gpt_loss=0.243, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6067/6434 [14:14:02<51:25,  8.41s/it, gpt_loss=0.243, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6067/6434 [14:14:10<51:25,  8.41s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  94%|█████████▍| 6068/6434 [14:14:10<49:42,  8.15s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  94%|█████████▍| 6068/6434 [14:14:18<49:42,  8.15s/it, gpt_loss=0.313, loss_mean=0.289][A
+Train step of epoch 0:  94%|█████████▍| 6069/6434 [14:14:18<49:33,  8.15s/it, gpt_loss=0.313, loss_mean=0.289][A
+[LID Router Debug] Step: 6070
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [5, 4, 3, 5, 2, 11, 5, 4, 3, 9]
+Active Experts in Batch: {2, 3, 4, 5, 9, 11}
+
+Train step of epoch 0:  94%|█████████▍| 6069/6434 [14:14:26<49:33,  8.15s/it, gpt_loss=0.449, loss_mean=0.305][A
+Train step of epoch 0:  94%|█████████▍| 6070/6434 [14:14:26<49:39,  8.18s/it, gpt_loss=0.449, loss_mean=0.305][A
+Train step of epoch 0:  94%|█████████▍| 6070/6434 [14:14:35<49:39,  8.18s/it, gpt_loss=0.24, loss_mean=0.299] [A
+Train step of epoch 0:  94%|█████████▍| 6071/6434 [14:14:35<51:24,  8.50s/it, gpt_loss=0.24, loss_mean=0.299][A
+Train step of epoch 0:  94%|█████████▍| 6071/6434 [14:14:45<51:24,  8.50s/it, gpt_loss=0.325, loss_mean=0.301][A
+Train step of epoch 0:  94%|█████████▍| 6072/6434 [14:14:45<52:32,  8.71s/it, gpt_loss=0.325, loss_mean=0.301][A
+Train step of epoch 0:  94%|█████████▍| 6072/6434 [14:14:53<52:32,  8.71s/it, gpt_loss=0.316, loss_mean=0.303][A
+Train step of epoch 0:  94%|█████████▍| 6073/6434 [14:14:53<50:59,  8.47s/it, gpt_loss=0.316, loss_mean=0.303][A
+Train step of epoch 0:  94%|█████████▍| 6073/6434 [14:15:01<50:59,  8.47s/it, gpt_loss=0.223, loss_mean=0.295][A
+Train step of epoch 0:  94%|█████████▍| 6074/6434 [14:15:01<51:35,  8.60s/it, gpt_loss=0.223, loss_mean=0.295][A
+Train step of epoch 0:  94%|█████████▍| 6074/6434 [14:15:11<51:35,  8.60s/it, gpt_loss=0.305, loss_mean=0.296][A
+Train step of epoch 0:  94%|█████████▍| 6075/6434 [14:15:11<52:29,  8.77s/it, gpt_loss=0.305, loss_mean=0.296][A
+Train step of epoch 0:  94%|█████████▍| 6075/6434 [14:15:19<52:29,  8.77s/it, gpt_loss=0.27, loss_mean=0.293] [A
+Train step of epoch 0:  94%|█████████▍| 6076/6434 [14:15:19<50:52,  8.53s/it, gpt_loss=0.27, loss_mean=0.293][A
+Train step of epoch 0:  94%|█████████▍| 6076/6434 [14:15:27<50:52,  8.53s/it, gpt_loss=0.301, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▍| 6077/6434 [14:15:27<50:01,  8.41s/it, gpt_loss=0.301, loss_mean=0.294][A
+Train step of epoch 0:  94%|█████████▍| 6077/6434 [14:15:35<50:01,  8.41s/it, gpt_loss=0.278, loss_mean=0.292][A
+Train step of epoch 0:  94%|█████████▍| 6078/6434 [14:15:35<50:29,  8.51s/it, gpt_loss=0.278, loss_mean=0.292][A
+Train step of epoch 0:  94%|█████████▍| 6078/6434 [14:15:44<50:29,  8.51s/it, gpt_loss=0.23, loss_mean=0.286] [A
+Train step of epoch 0:  94%|█████████▍| 6079/6434 [14:15:44<49:47,  8.42s/it, gpt_loss=0.23, loss_mean=0.286][A
+[LID Router Debug] Step: 6080
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [5, 1, 2, 1, 4, 0, 10, 1, 0, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 10}
+
+Train step of epoch 0:  94%|█████████▍| 6079/6434 [14:15:51<49:47,  8.42s/it, gpt_loss=0.338, loss_mean=0.291][A
+Train step of epoch 0:  94%|█████████▍| 6080/6434 [14:15:51<47:56,  8.13s/it, gpt_loss=0.338, loss_mean=0.291][A
+Train step of epoch 0:  94%|█████████▍| 6080/6434 [14:16:00<47:56,  8.13s/it, gpt_loss=0.21, loss_mean=0.283] [A
+Train step of epoch 0:  95%|█████████▍| 6081/6434 [14:16:00<48:30,  8.25s/it, gpt_loss=0.21, loss_mean=0.283][A
+Train step of epoch 0:  95%|█████████▍| 6081/6434 [14:16:08<48:30,  8.25s/it, gpt_loss=0.295, loss_mean=0.284][A
+Train step of epoch 0:  95%|█████████▍| 6082/6434 [14:16:08<48:08,  8.21s/it, gpt_loss=0.295, loss_mean=0.284][A
+Train step of epoch 0:  95%|█████████▍| 6082/6434 [14:16:15<48:08,  8.21s/it, gpt_loss=0.299, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▍| 6083/6434 [14:16:15<46:58,  8.03s/it, gpt_loss=0.299, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▍| 6083/6434 [14:16:24<46:58,  8.03s/it, gpt_loss=0.323, loss_mean=0.29] [A
+Train step of epoch 0:  95%|█████████▍| 6084/6434 [14:16:24<47:58,  8.23s/it, gpt_loss=0.323, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▍| 6084/6434 [14:16:32<47:58,  8.23s/it, gpt_loss=0.335, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▍| 6085/6434 [14:16:32<47:48,  8.22s/it, gpt_loss=0.335, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▍| 6085/6434 [14:16:40<47:48,  8.22s/it, gpt_loss=0.208, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▍| 6086/6434 [14:16:40<47:23,  8.17s/it, gpt_loss=0.208, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▍| 6086/6434 [14:16:49<47:23,  8.17s/it, gpt_loss=0.301, loss_mean=0.287][A
+Train step of epoch 0:  95%|█████████▍| 6087/6434 [14:16:49<48:45,  8.43s/it, gpt_loss=0.301, loss_mean=0.287][A
+Train step of epoch 0:  95%|█████████▍| 6087/6434 [14:16:57<48:45,  8.43s/it, gpt_loss=0.241, loss_mean=0.282][A
+Train step of epoch 0:  95%|█████████▍| 6088/6434 [14:16:57<47:15,  8.19s/it, gpt_loss=0.241, loss_mean=0.282][A
+Train step of epoch 0:  95%|█████████▍| 6088/6434 [14:17:05<47:15,  8.19s/it, gpt_loss=0.345, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▍| 6089/6434 [14:17:05<47:18,  8.23s/it, gpt_loss=0.345, loss_mean=0.289][A
+[LID Router Debug] Step: 6090
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [1, 3, 4, 5, 3, 5, 1, 4, 1, 9]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  95%|█████████▍| 6089/6434 [14:17:13<47:18,  8.23s/it, gpt_loss=0.334, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▍| 6090/6434 [14:17:13<46:02,  8.03s/it, gpt_loss=0.334, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▍| 6090/6434 [14:17:21<46:02,  8.03s/it, gpt_loss=0.257, loss_mean=0.29] [A
+Train step of epoch 0:  95%|█████████▍| 6091/6434 [14:17:21<46:05,  8.06s/it, gpt_loss=0.257, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▍| 6091/6434 [14:17:30<46:05,  8.06s/it, gpt_loss=0.296, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▍| 6092/6434 [14:17:30<47:09,  8.27s/it, gpt_loss=0.296, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▍| 6092/6434 [14:17:37<47:09,  8.27s/it, gpt_loss=0.301, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6093/6434 [14:17:37<45:24,  7.99s/it, gpt_loss=0.301, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6093/6434 [14:17:46<45:24,  7.99s/it, gpt_loss=0.341, loss_mean=0.296][A
+Train step of epoch 0:  95%|█████████▍| 6094/6434 [14:17:46<46:11,  8.15s/it, gpt_loss=0.341, loss_mean=0.296][A
+Train step of epoch 0:  95%|█████████▍| 6094/6434 [14:17:54<46:11,  8.15s/it, gpt_loss=0.352, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6095/6434 [14:17:54<46:22,  8.21s/it, gpt_loss=0.352, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6095/6434 [14:18:02<46:22,  8.21s/it, gpt_loss=0.265, loss_mean=0.298][A
+Train step of epoch 0:  95%|█████████▍| 6096/6434 [14:18:02<46:16,  8.21s/it, gpt_loss=0.265, loss_mean=0.298][A
+Train step of epoch 0:  95%|█████████▍| 6096/6434 [14:18:10<46:16,  8.21s/it, gpt_loss=0.312, loss_mean=0.3]  [A
+Train step of epoch 0:  95%|█████████▍| 6097/6434 [14:18:10<45:55,  8.18s/it, gpt_loss=0.312, loss_mean=0.3][A
+Train step of epoch 0:  95%|█████████▍| 6097/6434 [14:18:19<45:55,  8.18s/it, gpt_loss=0.277, loss_mean=0.297][A
+Train step of epoch 0:  95%|█████████▍| 6098/6434 [14:18:19<46:45,  8.35s/it, gpt_loss=0.277, loss_mean=0.297][A
+Train step of epoch 0:  95%|█████████▍| 6098/6434 [14:18:28<46:45,  8.35s/it, gpt_loss=0.295, loss_mean=0.297][A
+Train step of epoch 0:  95%|█████████▍| 6099/6434 [14:18:28<47:39,  8.54s/it, gpt_loss=0.295, loss_mean=0.297][A
+[LID Router Debug] Step: 6100
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [3, 0, 5, 1, 1, 2, 9, 3, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  95%|█████████▍| 6099/6434 [14:18:36<47:39,  8.54s/it, gpt_loss=0.347, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6100/6434 [14:18:36<46:36,  8.37s/it, gpt_loss=0.347, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6100/6434 [14:18:45<46:36,  8.37s/it, gpt_loss=0.364, loss_mean=0.308][A
+Train step of epoch 0:  95%|█████████▍| 6101/6434 [14:18:45<47:42,  8.59s/it, gpt_loss=0.364, loss_mean=0.308][A
+Train step of epoch 0:  95%|█████████▍| 6101/6434 [14:18:54<47:42,  8.59s/it, gpt_loss=0.241, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6102/6434 [14:18:54<48:28,  8.76s/it, gpt_loss=0.241, loss_mean=0.302][A
+Train step of epoch 0:  95%|█████████▍| 6102/6434 [14:19:02<48:28,  8.76s/it, gpt_loss=0.293, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▍| 6103/6434 [14:19:02<46:07,  8.36s/it, gpt_loss=0.293, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▍| 6103/6434 [14:19:11<46:07,  8.36s/it, gpt_loss=0.301, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▍| 6104/6434 [14:19:11<47:53,  8.71s/it, gpt_loss=0.301, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▍| 6104/6434 [14:19:21<47:53,  8.71s/it, gpt_loss=0.322, loss_mean=0.303][A
+Train step of epoch 0:  95%|█████████▍| 6105/6434 [14:19:21<49:13,  8.98s/it, gpt_loss=0.322, loss_mean=0.303][A
+Train step of epoch 0:  95%|█████████▍| 6105/6434 [14:19:30<49:13,  8.98s/it, gpt_loss=0.187, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6106/6434 [14:19:30<50:08,  9.17s/it, gpt_loss=0.187, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6106/6434 [14:19:39<50:08,  9.17s/it, gpt_loss=0.299, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▍| 6107/6434 [14:19:39<48:41,  8.93s/it, gpt_loss=0.299, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▍| 6107/6434 [14:19:47<48:41,  8.93s/it, gpt_loss=0.286, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6108/6434 [14:19:47<47:43,  8.78s/it, gpt_loss=0.286, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▍| 6108/6434 [14:19:56<47:43,  8.78s/it, gpt_loss=0.317, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▍| 6109/6434 [14:19:56<47:28,  8.76s/it, gpt_loss=0.317, loss_mean=0.294][A
+[LID Router Debug] Step: 6110
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [3, 4, 3, 5, 3, 3, 1, 9, 9, 5]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+Train step of epoch 0:  95%|█████████▍| 6109/6434 [14:20:04<47:28,  8.76s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▍| 6110/6434 [14:20:04<46:48,  8.67s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▍| 6110/6434 [14:20:12<46:48,  8.67s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  95%|█████████▍| 6111/6434 [14:20:12<45:18,  8.42s/it, gpt_loss=0.312, loss_mean=0.295][A
+Train step of epoch 0:  95%|█████████▍| 6111/6434 [14:20:21<45:18,  8.42s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▍| 6112/6434 [14:20:21<45:13,  8.43s/it, gpt_loss=0.263, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▍| 6112/6434 [14:20:29<45:13,  8.43s/it, gpt_loss=0.278, loss_mean=0.29] [A
+Train step of epoch 0:  95%|█████████▌| 6113/6434 [14:20:29<44:39,  8.35s/it, gpt_loss=0.278, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▌| 6113/6434 [14:20:37<44:39,  8.35s/it, gpt_loss=0.233, loss_mean=0.285][A
+Train step of epoch 0:  95%|█████████▌| 6114/6434 [14:20:37<43:33,  8.17s/it, gpt_loss=0.233, loss_mean=0.285][A
+Train step of epoch 0:  95%|█████████▌| 6114/6434 [14:20:45<43:33,  8.17s/it, gpt_loss=0.399, loss_mean=0.296][A
+Train step of epoch 0:  95%|█████████▌| 6115/6434 [14:20:45<43:55,  8.26s/it, gpt_loss=0.399, loss_mean=0.296][A
+Train step of epoch 0:  95%|█████████▌| 6115/6434 [14:20:53<43:55,  8.26s/it, gpt_loss=0.235, loss_mean=0.29] [A
+Train step of epoch 0:  95%|█████████▌| 6116/6434 [14:20:53<42:40,  8.05s/it, gpt_loss=0.235, loss_mean=0.29][A
+Train step of epoch 0:  95%|█████████▌| 6116/6434 [14:21:01<42:40,  8.05s/it, gpt_loss=0.329, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▌| 6117/6434 [14:21:01<42:48,  8.10s/it, gpt_loss=0.329, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▌| 6117/6434 [14:21:10<42:48,  8.10s/it, gpt_loss=0.396, loss_mean=0.304][A
+Train step of epoch 0:  95%|█████████▌| 6118/6434 [14:21:10<44:29,  8.45s/it, gpt_loss=0.396, loss_mean=0.304][A
+Train step of epoch 0:  95%|█████████▌| 6118/6434 [14:21:18<44:29,  8.45s/it, gpt_loss=0.251, loss_mean=0.299][A
+Train step of epoch 0:  95%|█████████▌| 6119/6434 [14:21:18<43:49,  8.35s/it, gpt_loss=0.251, loss_mean=0.299][A
+[LID Router Debug] Step: 6120
+Batch Size: 10
+Audio Batch Size: 72
+LID Assignments: [1, 6, 1, 1, 1, 1, 1, 6, 6, 4]
+Active Experts in Batch: {1, 4, 6}
+
+Train step of epoch 0:  95%|█████████▌| 6119/6434 [14:21:27<43:49,  8.35s/it, gpt_loss=0.204, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6120/6434 [14:21:27<45:02,  8.61s/it, gpt_loss=0.204, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6120/6434 [14:21:36<45:02,  8.61s/it, gpt_loss=0.254, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▌| 6121/6434 [14:21:36<44:54,  8.61s/it, gpt_loss=0.254, loss_mean=0.286][A
+Train step of epoch 0:  95%|█████████▌| 6121/6434 [14:21:44<44:54,  8.61s/it, gpt_loss=0.306, loss_mean=0.288][A
+Train step of epoch 0:  95%|█████████▌| 6122/6434 [14:21:44<43:54,  8.44s/it, gpt_loss=0.306, loss_mean=0.288][A
+Train step of epoch 0:  95%|█████████▌| 6122/6434 [14:21:52<43:54,  8.44s/it, gpt_loss=0.335, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▌| 6123/6434 [14:21:52<43:33,  8.40s/it, gpt_loss=0.335, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▌| 6123/6434 [14:22:02<43:33,  8.40s/it, gpt_loss=0.28, loss_mean=0.291] [A
+Train step of epoch 0:  95%|█████████▌| 6124/6434 [14:22:02<45:00,  8.71s/it, gpt_loss=0.28, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6124/6434 [14:22:09<45:00,  8.71s/it, gpt_loss=0.322, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▌| 6125/6434 [14:22:09<43:04,  8.36s/it, gpt_loss=0.322, loss_mean=0.294][A
+Train step of epoch 0:  95%|█████████▌| 6125/6434 [14:22:19<43:04,  8.36s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  95%|█████████▌| 6126/6434 [14:22:19<44:46,  8.72s/it, gpt_loss=0.298, loss_mean=0.295][A
+Train step of epoch 0:  95%|█████████▌| 6126/6434 [14:22:27<44:46,  8.72s/it, gpt_loss=0.262, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6127/6434 [14:22:27<43:16,  8.46s/it, gpt_loss=0.262, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6127/6434 [14:22:37<43:16,  8.46s/it, gpt_loss=0.263, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6128/6434 [14:22:37<45:27,  8.91s/it, gpt_loss=0.263, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6128/6434 [14:22:45<45:27,  8.91s/it, gpt_loss=0.295, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6129/6434 [14:22:45<43:55,  8.64s/it, gpt_loss=0.295, loss_mean=0.289][A
+[LID Router Debug] Step: 6130
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [8, 5, 7, 3, 2, 4, 1, 9, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 7, 8, 9}
+
+Train step of epoch 0:  95%|█████████▌| 6129/6434 [14:22:52<43:55,  8.64s/it, gpt_loss=0.33, loss_mean=0.293] [A
+Train step of epoch 0:  95%|█████████▌| 6130/6434 [14:22:52<42:23,  8.37s/it, gpt_loss=0.33, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▌| 6130/6434 [14:23:01<42:23,  8.37s/it, gpt_loss=0.27, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6131/6434 [14:23:01<41:56,  8.30s/it, gpt_loss=0.27, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6131/6434 [14:23:10<41:56,  8.30s/it, gpt_loss=0.29, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6132/6434 [14:23:10<44:07,  8.77s/it, gpt_loss=0.29, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6132/6434 [14:23:18<44:07,  8.77s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6133/6434 [14:23:18<42:23,  8.45s/it, gpt_loss=0.272, loss_mean=0.289][A
+Train step of epoch 0:  95%|█████████▌| 6133/6434 [14:23:26<42:23,  8.45s/it, gpt_loss=0.283, loss_mean=0.288][A
+Train step of epoch 0:  95%|█████████▌| 6134/6434 [14:23:26<41:54,  8.38s/it, gpt_loss=0.283, loss_mean=0.288][A
+Train step of epoch 0:  95%|█████████▌| 6134/6434 [14:23:35<41:54,  8.38s/it, gpt_loss=0.321, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▌| 6135/6434 [14:23:35<42:45,  8.58s/it, gpt_loss=0.321, loss_mean=0.292][A
+Train step of epoch 0:  95%|█████████▌| 6135/6434 [14:23:45<42:45,  8.58s/it, gpt_loss=0.346, loss_mean=0.297][A
+Train step of epoch 0:  95%|█████████▌| 6136/6434 [14:23:45<43:18,  8.72s/it, gpt_loss=0.346, loss_mean=0.297][A
+Train step of epoch 0:  95%|█████████▌| 6136/6434 [14:23:52<43:18,  8.72s/it, gpt_loss=0.322, loss_mean=0.3]  [A
+Train step of epoch 0:  95%|█████████▌| 6137/6434 [14:23:52<41:37,  8.41s/it, gpt_loss=0.322, loss_mean=0.3][A
+Train step of epoch 0:  95%|█████████▌| 6137/6434 [14:24:00<41:37,  8.41s/it, gpt_loss=0.216, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6138/6434 [14:24:00<40:14,  8.16s/it, gpt_loss=0.216, loss_mean=0.291][A
+Train step of epoch 0:  95%|█████████▌| 6138/6434 [14:24:08<40:14,  8.16s/it, gpt_loss=0.232, loss_mean=0.285][A
+Train step of epoch 0:  95%|█████████▌| 6139/6434 [14:24:08<39:45,  8.09s/it, gpt_loss=0.232, loss_mean=0.285][A
+[LID Router Debug] Step: 6140
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [1, 3, 0, 1, 2, 3, 4, 6, 5, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+Train step of epoch 0:  95%|█████████▌| 6139/6434 [14:24:15<39:45,  8.09s/it, gpt_loss=0.358, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▌| 6140/6434 [14:24:15<38:33,  7.87s/it, gpt_loss=0.358, loss_mean=0.293][A
+Train step of epoch 0:  95%|█████████▌| 6140/6434 [14:24:23<38:33,  7.87s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  95%|█████████▌| 6141/6434 [14:24:23<38:16,  7.84s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  95%|█████████▌| 6141/6434 [14:24:33<38:16,  7.84s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▌| 6142/6434 [14:24:33<40:57,  8.42s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  95%|█████████▌| 6142/6434 [14:24:41<40:57,  8.42s/it, gpt_loss=0.396, loss_mean=0.31] [A
+Train step of epoch 0:  95%|█████████▌| 6143/6434 [14:24:41<41:04,  8.47s/it, gpt_loss=0.396, loss_mean=0.31][A
+Train step of epoch 0:  95%|█████████▌| 6143/6434 [14:24:49<41:04,  8.47s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  95%|█████████▌| 6144/6434 [14:24:49<39:39,  8.21s/it, gpt_loss=0.284, loss_mean=0.308][A
+Train step of epoch 0:  95%|█████████▌| 6144/6434 [14:24:57<39:39,  8.21s/it, gpt_loss=0.29, loss_mean=0.306] [A
+Train step of epoch 0:  96%|█████████▌| 6145/6434 [14:24:57<39:32,  8.21s/it, gpt_loss=0.29, loss_mean=0.306][A
+Train step of epoch 0:  96%|█████████▌| 6145/6434 [14:25:05<39:32,  8.21s/it, gpt_loss=0.288, loss_mean=0.304][A
+Train step of epoch 0:  96%|█████████▌| 6146/6434 [14:25:05<39:11,  8.16s/it, gpt_loss=0.288, loss_mean=0.304][A
+Train step of epoch 0:  96%|█████████▌| 6146/6434 [14:25:12<39:11,  8.16s/it, gpt_loss=0.293, loss_mean=0.303][A
+Train step of epoch 0:  96%|█████████▌| 6147/6434 [14:25:12<37:08,  7.76s/it, gpt_loss=0.293, loss_mean=0.303][A
+Train step of epoch 0:  96%|█████████▌| 6147/6434 [14:25:20<37:08,  7.76s/it, gpt_loss=0.244, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6148/6434 [14:25:20<37:29,  7.86s/it, gpt_loss=0.244, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6148/6434 [14:25:31<37:29,  7.86s/it, gpt_loss=0.353, loss_mean=0.303][A
+Train step of epoch 0:  96%|█████████▌| 6149/6434 [14:25:31<41:24,  8.72s/it, gpt_loss=0.353, loss_mean=0.303][A
+[LID Router Debug] Step: 6150
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [0, 9, 4, 0, 4, 1, 1, 0, 5, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+Train step of epoch 0:  96%|█████████▌| 6149/6434 [14:25:38<41:24,  8.72s/it, gpt_loss=0.279, loss_mean=0.3]  [A
+Train step of epoch 0:  96%|█████████▌| 6150/6434 [14:25:38<39:07,  8.27s/it, gpt_loss=0.279, loss_mean=0.3][A
+Train step of epoch 0:  96%|█████████▌| 6150/6434 [14:25:48<39:07,  8.27s/it, gpt_loss=0.264, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6151/6434 [14:25:48<41:16,  8.75s/it, gpt_loss=0.264, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6151/6434 [14:25:55<41:16,  8.75s/it, gpt_loss=0.283, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6152/6434 [14:25:55<39:23,  8.38s/it, gpt_loss=0.283, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6152/6434 [14:26:04<39:23,  8.38s/it, gpt_loss=0.187, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6153/6434 [14:26:04<39:35,  8.46s/it, gpt_loss=0.187, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6153/6434 [14:26:12<39:35,  8.46s/it, gpt_loss=0.291, loss_mean=0.285][A
+Train step of epoch 0:  96%|█████████▌| 6154/6434 [14:26:12<39:07,  8.38s/it, gpt_loss=0.291, loss_mean=0.285][A
+Train step of epoch 0:  96%|█████████▌| 6154/6434 [14:26:21<39:07,  8.38s/it, gpt_loss=0.256, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6155/6434 [14:26:21<39:24,  8.47s/it, gpt_loss=0.256, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6155/6434 [14:26:32<39:24,  8.47s/it, gpt_loss=0.24, loss_mean=0.278] [A
+Train step of epoch 0:  96%|█████████▌| 6156/6434 [14:26:32<42:40,  9.21s/it, gpt_loss=0.24, loss_mean=0.278][A
+Train step of epoch 0:  96%|█████████▌| 6156/6434 [14:26:41<42:40,  9.21s/it, gpt_loss=0.325, loss_mean=0.283][A
+Train step of epoch 0:  96%|█████████▌| 6157/6434 [14:26:41<42:10,  9.13s/it, gpt_loss=0.325, loss_mean=0.283][A
+Train step of epoch 0:  96%|█████████▌| 6157/6434 [14:26:51<42:10,  9.13s/it, gpt_loss=0.327, loss_mean=0.287][A
+Train step of epoch 0:  96%|█████████▌| 6158/6434 [14:26:51<43:12,  9.39s/it, gpt_loss=0.327, loss_mean=0.287][A
+Train step of epoch 0:  96%|█████████▌| 6158/6434 [14:26:59<43:12,  9.39s/it, gpt_loss=0.331, loss_mean=0.292][A
+Train step of epoch 0:  96%|█████████▌| 6159/6434 [14:26:59<40:55,  8.93s/it, gpt_loss=0.331, loss_mean=0.292][A
+[LID Router Debug] Step: 6160
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [4, 0, 3, 5, 9, 2, 2, 3, 0, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  96%|█████████▌| 6159/6434 [14:27:08<40:55,  8.93s/it, gpt_loss=0.196, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6160/6434 [14:27:08<41:09,  9.01s/it, gpt_loss=0.196, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6160/6434 [14:27:16<41:09,  9.01s/it, gpt_loss=0.278, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6161/6434 [14:27:16<40:05,  8.81s/it, gpt_loss=0.278, loss_mean=0.282][A
+Train step of epoch 0:  96%|█████████▌| 6161/6434 [14:27:25<40:05,  8.81s/it, gpt_loss=0.347, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6162/6434 [14:27:25<40:24,  8.91s/it, gpt_loss=0.347, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6162/6434 [14:27:35<40:24,  8.91s/it, gpt_loss=0.212, loss_mean=0.281][A
+Train step of epoch 0:  96%|█████████▌| 6163/6434 [14:27:35<41:01,  9.08s/it, gpt_loss=0.212, loss_mean=0.281][A
+Train step of epoch 0:  96%|█████████▌| 6163/6434 [14:27:43<41:01,  9.08s/it, gpt_loss=0.267, loss_mean=0.279][A
+Train step of epoch 0:  96%|█████████▌| 6164/6434 [14:27:43<39:48,  8.85s/it, gpt_loss=0.267, loss_mean=0.279][A
+Train step of epoch 0:  96%|█████████▌| 6164/6434 [14:27:52<39:48,  8.85s/it, gpt_loss=0.259, loss_mean=0.277][A
+Train step of epoch 0:  96%|█████████▌| 6165/6434 [14:27:52<39:50,  8.89s/it, gpt_loss=0.259, loss_mean=0.277][A
+Train step of epoch 0:  96%|█████████▌| 6165/6434 [14:28:01<39:50,  8.89s/it, gpt_loss=0.28, loss_mean=0.277] [A
+Train step of epoch 0:  96%|█████████▌| 6166/6434 [14:28:01<39:16,  8.79s/it, gpt_loss=0.28, loss_mean=0.277][A
+Train step of epoch 0:  96%|█████████▌| 6166/6434 [14:28:10<39:16,  8.79s/it, gpt_loss=0.25, loss_mean=0.275][A
+Train step of epoch 0:  96%|█████████▌| 6167/6434 [14:28:10<40:30,  9.10s/it, gpt_loss=0.25, loss_mean=0.275][A
+Train step of epoch 0:  96%|█████████▌| 6167/6434 [14:28:19<40:30,  9.10s/it, gpt_loss=0.342, loss_mean=0.281][A
+Train step of epoch 0:  96%|█████████▌| 6168/6434 [14:28:19<39:01,  8.80s/it, gpt_loss=0.342, loss_mean=0.281][A
+Train step of epoch 0:  96%|█████████▌| 6168/6434 [14:28:27<39:01,  8.80s/it, gpt_loss=0.307, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6169/6434 [14:28:27<38:42,  8.77s/it, gpt_loss=0.307, loss_mean=0.284][A
+[LID Router Debug] Step: 6170
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [9, 4, 1, 3, 5, 9, 2, 0, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0:  96%|█████████▌| 6169/6434 [14:28:35<38:42,  8.77s/it, gpt_loss=0.366, loss_mean=0.292][A
+Train step of epoch 0:  96%|█████████▌| 6170/6434 [14:28:35<37:54,  8.62s/it, gpt_loss=0.366, loss_mean=0.292][A
+Train step of epoch 0:  96%|█████████▌| 6170/6434 [14:28:44<37:54,  8.62s/it, gpt_loss=0.269, loss_mean=0.29] [A
+Train step of epoch 0:  96%|█████████▌| 6171/6434 [14:28:44<37:50,  8.63s/it, gpt_loss=0.269, loss_mean=0.29][A
+Train step of epoch 0:  96%|█████████▌| 6171/6434 [14:28:52<37:50,  8.63s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  96%|█████████▌| 6172/6434 [14:28:52<37:07,  8.50s/it, gpt_loss=0.264, loss_mean=0.287][A
+Train step of epoch 0:  96%|█████████▌| 6172/6434 [14:29:00<37:07,  8.50s/it, gpt_loss=0.251, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6173/6434 [14:29:00<36:18,  8.35s/it, gpt_loss=0.251, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6173/6434 [14:29:10<36:18,  8.35s/it, gpt_loss=0.283, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6174/6434 [14:29:10<37:43,  8.70s/it, gpt_loss=0.283, loss_mean=0.284][A
+Train step of epoch 0:  96%|█████████▌| 6174/6434 [14:29:19<37:43,  8.70s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  96%|█████████▌| 6175/6434 [14:29:19<38:06,  8.83s/it, gpt_loss=0.297, loss_mean=0.285][A
+Train step of epoch 0:  96%|█████████▌| 6175/6434 [14:29:27<38:06,  8.83s/it, gpt_loss=0.314, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6176/6434 [14:29:27<37:03,  8.62s/it, gpt_loss=0.314, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6176/6434 [14:29:34<37:03,  8.62s/it, gpt_loss=0.289, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6177/6434 [14:29:34<35:16,  8.23s/it, gpt_loss=0.289, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6177/6434 [14:29:42<35:16,  8.23s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▌| 6178/6434 [14:29:42<34:49,  8.16s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▌| 6178/6434 [14:29:50<34:49,  8.16s/it, gpt_loss=0.291, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▌| 6179/6434 [14:29:50<34:12,  8.05s/it, gpt_loss=0.291, loss_mean=0.289][A
+[LID Router Debug] Step: 6180
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [3, 7, 9, 9, 4, 1, 5, 9, 5, 1]
+Active Experts in Batch: {1, 3, 4, 5, 7, 9}
+
+Train step of epoch 0:  96%|█████████▌| 6179/6434 [14:29:59<34:12,  8.05s/it, gpt_loss=0.264, loss_mean=0.286][A
+Train step of epoch 0:  96%|█████████▌| 6180/6434 [14:29:59<35:04,  8.29s/it, gpt_loss=0.264, loss_mean=0.286][A
+Train step of epoch 0:  96%|█████████▌| 6180/6434 [14:30:07<35:04,  8.29s/it, gpt_loss=0.297, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6181/6434 [14:30:07<34:40,  8.22s/it, gpt_loss=0.297, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6181/6434 [14:30:17<34:40,  8.22s/it, gpt_loss=0.362, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6182/6434 [14:30:17<37:05,  8.83s/it, gpt_loss=0.362, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6182/6434 [14:30:25<37:05,  8.83s/it, gpt_loss=0.28, loss_mean=0.293] [A
+Train step of epoch 0:  96%|█████████▌| 6183/6434 [14:30:25<35:22,  8.46s/it, gpt_loss=0.28, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▌| 6183/6434 [14:30:34<35:22,  8.46s/it, gpt_loss=0.295, loss_mean=0.294][A
+Train step of epoch 0:  96%|█████████▌| 6184/6434 [14:30:34<35:35,  8.54s/it, gpt_loss=0.295, loss_mean=0.294][A
+Train step of epoch 0:  96%|█████████▌| 6184/6434 [14:30:42<35:35,  8.54s/it, gpt_loss=0.344, loss_mean=0.299][A
+Train step of epoch 0:  96%|█████████▌| 6185/6434 [14:30:42<34:52,  8.41s/it, gpt_loss=0.344, loss_mean=0.299][A
+Train step of epoch 0:  96%|█████████▌| 6185/6434 [14:30:50<34:52,  8.41s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6186/6434 [14:30:50<35:04,  8.49s/it, gpt_loss=0.279, loss_mean=0.297][A
+Train step of epoch 0:  96%|█████████▌| 6186/6434 [14:30:59<35:04,  8.49s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6187/6434 [14:30:59<34:38,  8.42s/it, gpt_loss=0.278, loss_mean=0.295][A
+Train step of epoch 0:  96%|█████████▌| 6187/6434 [14:31:08<34:38,  8.42s/it, gpt_loss=0.242, loss_mean=0.29] [A
+Train step of epoch 0:  96%|█████████▌| 6188/6434 [14:31:08<35:02,  8.55s/it, gpt_loss=0.242, loss_mean=0.29][A
+Train step of epoch 0:  96%|█████████▌| 6188/6434 [14:31:16<35:02,  8.55s/it, gpt_loss=0.326, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▌| 6189/6434 [14:31:16<34:16,  8.39s/it, gpt_loss=0.326, loss_mean=0.293][A
+[LID Router Debug] Step: 6190
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [2, 3, 2, 9, 5, 5, 0, 2, 6, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  96%|█████████▌| 6189/6434 [14:31:24<34:16,  8.39s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▌| 6190/6434 [14:31:24<33:42,  8.29s/it, gpt_loss=0.291, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▌| 6190/6434 [14:31:31<33:42,  8.29s/it, gpt_loss=0.247, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6191/6434 [14:31:31<32:59,  8.15s/it, gpt_loss=0.247, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▌| 6191/6434 [14:31:39<32:59,  8.15s/it, gpt_loss=0.308, loss_mean=0.29] [A
+Train step of epoch 0:  96%|█████████▌| 6192/6434 [14:31:39<32:27,  8.05s/it, gpt_loss=0.308, loss_mean=0.29][A
+Train step of epoch 0:  96%|█████████▌| 6192/6434 [14:31:48<32:27,  8.05s/it, gpt_loss=0.281, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▋| 6193/6434 [14:31:48<33:25,  8.32s/it, gpt_loss=0.281, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▋| 6193/6434 [14:31:57<33:25,  8.32s/it, gpt_loss=0.328, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▋| 6194/6434 [14:31:57<33:56,  8.48s/it, gpt_loss=0.328, loss_mean=0.293][A
+Train step of epoch 0:  96%|█████████▋| 6194/6434 [14:32:05<33:56,  8.48s/it, gpt_loss=0.392, loss_mean=0.303][A
+Train step of epoch 0:  96%|█████████▋| 6195/6434 [14:32:05<32:59,  8.28s/it, gpt_loss=0.392, loss_mean=0.303][A
+Train step of epoch 0:  96%|█████████▋| 6195/6434 [14:32:15<32:59,  8.28s/it, gpt_loss=0.351, loss_mean=0.308][A
+Train step of epoch 0:  96%|█████████▋| 6196/6434 [14:32:15<35:00,  8.82s/it, gpt_loss=0.351, loss_mean=0.308][A
+Train step of epoch 0:  96%|█████████▋| 6196/6434 [14:32:24<35:00,  8.82s/it, gpt_loss=0.35, loss_mean=0.312] [A
+Train step of epoch 0:  96%|█████████▋| 6197/6434 [14:32:24<34:50,  8.82s/it, gpt_loss=0.35, loss_mean=0.312][A
+Train step of epoch 0:  96%|█████████▋| 6197/6434 [14:32:32<34:50,  8.82s/it, gpt_loss=0.233, loss_mean=0.304][A
+Train step of epoch 0:  96%|█████████▋| 6198/6434 [14:32:32<33:41,  8.57s/it, gpt_loss=0.233, loss_mean=0.304][A
+Train step of epoch 0:  96%|█████████▋| 6198/6434 [14:32:40<33:41,  8.57s/it, gpt_loss=0.342, loss_mean=0.308][A
+Train step of epoch 0:  96%|█████████▋| 6199/6434 [14:32:40<33:23,  8.52s/it, gpt_loss=0.342, loss_mean=0.308][A
+[LID Router Debug] Step: 6200
+Batch Size: 10
+Audio Batch Size: 133
+LID Assignments: [3, 2, 0, 2, 2, 3, 0, 1, 5, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5}
+[2026-02-07 06:28:53,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=0, lr=[1.550810729575848e-05, 1.550810729575848e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 06:28:53,267] [INFO] [timer.py:260:stop] epoch=0/micro_step=6200/global_step=3100, RunningAvgSamplesPerSec=4.7454995594140055, CurrSamplesPerSec=4.708594594242639, MemAllocated=12.94GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  96%|█████████▋| 6199/6434 [14:32:49<33:23,  8.52s/it, gpt_loss=0.291, loss_mean=0.306][A
+Train step of epoch 0:  96%|█████████▋| 6200/6434 [14:32:49<33:19,  8.54s/it, gpt_loss=0.291, loss_mean=0.306][A
+Train step of epoch 0:  96%|█████████▋| 6200/6434 [14:32:57<33:19,  8.54s/it, gpt_loss=0.264, loss_mean=0.302][A
+Train step of epoch 0:  96%|█████████▋| 6201/6434 [14:32:57<33:15,  8.56s/it, gpt_loss=0.264, loss_mean=0.302][A
+Train step of epoch 0:  96%|█████████▋| 6201/6434 [14:33:07<33:15,  8.56s/it, gpt_loss=0.297, loss_mean=0.301][A
+Train step of epoch 0:  96%|█████████▋| 6202/6434 [14:33:07<33:59,  8.79s/it, gpt_loss=0.297, loss_mean=0.301][A
+Train step of epoch 0:  96%|█████████▋| 6202/6434 [14:33:15<33:59,  8.79s/it, gpt_loss=0.248, loss_mean=0.296][A
+Train step of epoch 0:  96%|█████████▋| 6203/6434 [14:33:15<33:34,  8.72s/it, gpt_loss=0.248, loss_mean=0.296][A
+Train step of epoch 0:  96%|█████████▋| 6203/6434 [14:33:23<33:34,  8.72s/it, gpt_loss=0.335, loss_mean=0.3]  [A
+Train step of epoch 0:  96%|█████████▋| 6204/6434 [14:33:23<32:50,  8.57s/it, gpt_loss=0.335, loss_mean=0.3][A
+Train step of epoch 0:  96%|█████████▋| 6204/6434 [14:33:31<32:50,  8.57s/it, gpt_loss=0.193, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▋| 6205/6434 [14:33:31<31:27,  8.24s/it, gpt_loss=0.193, loss_mean=0.289][A
+Train step of epoch 0:  96%|█████████▋| 6205/6434 [14:33:40<31:27,  8.24s/it, gpt_loss=0.274, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▋| 6206/6434 [14:33:40<31:49,  8.37s/it, gpt_loss=0.274, loss_mean=0.288][A
+Train step of epoch 0:  96%|█████████▋| 6206/6434 [14:33:48<31:49,  8.37s/it, gpt_loss=0.416, loss_mean=0.3]  [A
+Train step of epoch 0:  96%|█████████▋| 6207/6434 [14:33:48<31:55,  8.44s/it, gpt_loss=0.416, loss_mean=0.3][A
+Train step of epoch 0:  96%|█████████▋| 6207/6434 [14:33:58<31:55,  8.44s/it, gpt_loss=0.31, loss_mean=0.301][A
+Train step of epoch 0:  96%|█████████▋| 6208/6434 [14:33:58<32:47,  8.71s/it, gpt_loss=0.31, loss_mean=0.301][A
+Train step of epoch 0:  96%|█████████▋| 6208/6434 [14:34:06<32:47,  8.71s/it, gpt_loss=0.276, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6209/6434 [14:34:06<32:41,  8.72s/it, gpt_loss=0.276, loss_mean=0.299][A
+[LID Router Debug] Step: 6210
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [1, 4, 9, 1, 2, 2, 9, 5, 4, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6209/6434 [14:34:16<32:41,  8.72s/it, gpt_loss=0.335, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6210/6434 [14:34:16<33:18,  8.92s/it, gpt_loss=0.335, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6210/6434 [14:34:24<33:18,  8.92s/it, gpt_loss=0.263, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6211/6434 [14:34:24<32:33,  8.76s/it, gpt_loss=0.263, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6211/6434 [14:34:32<32:33,  8.76s/it, gpt_loss=0.241, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6212/6434 [14:34:32<31:50,  8.60s/it, gpt_loss=0.241, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6212/6434 [14:34:40<31:50,  8.60s/it, gpt_loss=0.282, loss_mean=0.292][A
+Train step of epoch 0:  97%|█████████▋| 6213/6434 [14:34:40<31:09,  8.46s/it, gpt_loss=0.282, loss_mean=0.292][A
+Train step of epoch 0:  97%|█████████▋| 6213/6434 [14:34:48<31:09,  8.46s/it, gpt_loss=0.25, loss_mean=0.288] [A
+Train step of epoch 0:  97%|█████████▋| 6214/6434 [14:34:48<30:23,  8.29s/it, gpt_loss=0.25, loss_mean=0.288][A
+Train step of epoch 0:  97%|█████████▋| 6214/6434 [14:34:56<30:23,  8.29s/it, gpt_loss=0.222, loss_mean=0.281][A
+Train step of epoch 0:  97%|█████████▋| 6215/6434 [14:34:56<29:34,  8.10s/it, gpt_loss=0.222, loss_mean=0.281][A
+Train step of epoch 0:  97%|█████████▋| 6215/6434 [14:35:05<29:34,  8.10s/it, gpt_loss=0.403, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6216/6434 [14:35:05<29:51,  8.22s/it, gpt_loss=0.403, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6216/6434 [14:35:13<29:51,  8.22s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  97%|█████████▋| 6217/6434 [14:35:13<29:32,  8.17s/it, gpt_loss=0.254, loss_mean=0.289][A
+Train step of epoch 0:  97%|█████████▋| 6217/6434 [14:35:21<29:32,  8.17s/it, gpt_loss=0.294, loss_mean=0.29] [A
+Train step of epoch 0:  97%|█████████▋| 6218/6434 [14:35:21<30:00,  8.34s/it, gpt_loss=0.294, loss_mean=0.29][A
+Train step of epoch 0:  97%|█████████▋| 6218/6434 [14:35:29<30:00,  8.34s/it, gpt_loss=0.321, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6219/6434 [14:35:29<29:06,  8.12s/it, gpt_loss=0.321, loss_mean=0.293][A
+[LID Router Debug] Step: 6220
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [5, 3, 9, 4, 4, 1, 6, 5, 3, 5]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6219/6434 [14:35:37<29:06,  8.12s/it, gpt_loss=0.331, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6220/6434 [14:35:37<28:44,  8.06s/it, gpt_loss=0.331, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6220/6434 [14:35:45<28:44,  8.06s/it, gpt_loss=0.263, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6221/6434 [14:35:45<28:29,  8.03s/it, gpt_loss=0.263, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6221/6434 [14:35:53<28:29,  8.03s/it, gpt_loss=0.304, loss_mean=0.294][A
+Train step of epoch 0:  97%|█████████▋| 6222/6434 [14:35:53<28:53,  8.18s/it, gpt_loss=0.304, loss_mean=0.294][A
+Train step of epoch 0:  97%|█████████▋| 6222/6434 [14:36:00<28:53,  8.18s/it, gpt_loss=0.302, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6223/6434 [14:36:00<27:34,  7.84s/it, gpt_loss=0.302, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6223/6434 [14:36:10<27:34,  7.84s/it, gpt_loss=0.326, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6224/6434 [14:36:10<29:36,  8.46s/it, gpt_loss=0.326, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6224/6434 [14:36:19<29:36,  8.46s/it, gpt_loss=0.266, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6225/6434 [14:36:19<29:44,  8.54s/it, gpt_loss=0.266, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6225/6434 [14:36:28<29:44,  8.54s/it, gpt_loss=0.362, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6226/6434 [14:36:28<29:59,  8.65s/it, gpt_loss=0.362, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6226/6434 [14:36:36<29:59,  8.65s/it, gpt_loss=0.282, loss_mean=0.3]  [A
+Train step of epoch 0:  97%|█████████▋| 6227/6434 [14:36:36<29:19,  8.50s/it, gpt_loss=0.282, loss_mean=0.3][A
+Train step of epoch 0:  97%|█████████▋| 6227/6434 [14:36:45<29:19,  8.50s/it, gpt_loss=0.278, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6228/6434 [14:36:45<29:11,  8.50s/it, gpt_loss=0.278, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6228/6434 [14:36:53<29:11,  8.50s/it, gpt_loss=0.308, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6229/6434 [14:36:53<28:42,  8.40s/it, gpt_loss=0.308, loss_mean=0.299][A
+[LID Router Debug] Step: 6230
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 0, 0, 2, 9, 1, 1, 4, 9, 2]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6229/6434 [14:37:01<28:42,  8.40s/it, gpt_loss=0.385, loss_mean=0.307][A
+Train step of epoch 0:  97%|█████████▋| 6230/6434 [14:37:01<28:39,  8.43s/it, gpt_loss=0.385, loss_mean=0.307][A
+Train step of epoch 0:  97%|█████████▋| 6230/6434 [14:37:10<28:39,  8.43s/it, gpt_loss=0.331, loss_mean=0.31] [A
+Train step of epoch 0:  97%|█████████▋| 6231/6434 [14:37:10<28:32,  8.43s/it, gpt_loss=0.331, loss_mean=0.31][A
+Train step of epoch 0:  97%|█████████▋| 6231/6434 [14:37:16<28:32,  8.43s/it, gpt_loss=0.259, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6232/6434 [14:37:16<26:37,  7.91s/it, gpt_loss=0.259, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6232/6434 [14:37:24<26:37,  7.91s/it, gpt_loss=0.257, loss_mean=0.3]  [A
+Train step of epoch 0:  97%|█████████▋| 6233/6434 [14:37:24<26:00,  7.76s/it, gpt_loss=0.257, loss_mean=0.3][A
+Train step of epoch 0:  97%|█████████▋| 6233/6434 [14:37:32<26:00,  7.76s/it, gpt_loss=0.281, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6234/6434 [14:37:32<26:11,  7.86s/it, gpt_loss=0.281, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6234/6434 [14:37:40<26:11,  7.86s/it, gpt_loss=0.268, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6235/6434 [14:37:40<26:03,  7.86s/it, gpt_loss=0.268, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6235/6434 [14:37:48<26:03,  7.86s/it, gpt_loss=0.237, loss_mean=0.289][A
+Train step of epoch 0:  97%|█████████▋| 6236/6434 [14:37:48<26:22,  7.99s/it, gpt_loss=0.237, loss_mean=0.289][A
+Train step of epoch 0:  97%|█████████▋| 6236/6434 [14:37:57<26:22,  7.99s/it, gpt_loss=0.329, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6237/6434 [14:37:57<26:45,  8.15s/it, gpt_loss=0.329, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6237/6434 [14:38:05<26:45,  8.15s/it, gpt_loss=0.32, loss_mean=0.296] [A
+Train step of epoch 0:  97%|█████████▋| 6238/6434 [14:38:05<26:27,  8.10s/it, gpt_loss=0.32, loss_mean=0.296][A
+Train step of epoch 0:  97%|█████████▋| 6238/6434 [14:38:12<26:27,  8.10s/it, gpt_loss=0.262, loss_mean=0.292][A
+Train step of epoch 0:  97%|█████████▋| 6239/6434 [14:38:12<25:25,  7.82s/it, gpt_loss=0.262, loss_mean=0.292][A
+[LID Router Debug] Step: 6240
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [2, 9, 4, 3, 1, 3, 9, 6, 4, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6239/6434 [14:38:20<25:25,  7.82s/it, gpt_loss=0.396, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6240/6434 [14:38:20<25:34,  7.91s/it, gpt_loss=0.396, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6240/6434 [14:38:29<25:34,  7.91s/it, gpt_loss=0.272, loss_mean=0.3]  [A
+Train step of epoch 0:  97%|█████████▋| 6241/6434 [14:38:29<26:34,  8.26s/it, gpt_loss=0.272, loss_mean=0.3][A
+Train step of epoch 0:  97%|█████████▋| 6241/6434 [14:38:37<26:34,  8.26s/it, gpt_loss=0.352, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6242/6434 [14:38:37<26:44,  8.36s/it, gpt_loss=0.352, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6242/6434 [14:38:45<26:44,  8.36s/it, gpt_loss=0.298, loss_mean=0.304][A
+Train step of epoch 0:  97%|█████████▋| 6243/6434 [14:38:45<26:09,  8.22s/it, gpt_loss=0.298, loss_mean=0.304][A
+Train step of epoch 0:  97%|█████████▋| 6243/6434 [14:38:55<26:09,  8.22s/it, gpt_loss=0.335, loss_mean=0.307][A
+Train step of epoch 0:  97%|█████████▋| 6244/6434 [14:38:55<26:59,  8.52s/it, gpt_loss=0.335, loss_mean=0.307][A
+Train step of epoch 0:  97%|█████████▋| 6244/6434 [14:39:02<26:59,  8.52s/it, gpt_loss=0.218, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6245/6434 [14:39:02<26:09,  8.30s/it, gpt_loss=0.218, loss_mean=0.298][A
+Train step of epoch 0:  97%|█████████▋| 6245/6434 [14:39:10<26:09,  8.30s/it, gpt_loss=0.288, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6246/6434 [14:39:10<24:56,  7.96s/it, gpt_loss=0.288, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6246/6434 [14:39:17<24:56,  7.96s/it, gpt_loss=0.248, loss_mean=0.292][A
+Train step of epoch 0:  97%|█████████▋| 6247/6434 [14:39:17<24:01,  7.71s/it, gpt_loss=0.248, loss_mean=0.292][A
+Train step of epoch 0:  97%|█████████▋| 6247/6434 [14:39:26<24:01,  7.71s/it, gpt_loss=0.375, loss_mean=0.301][A
+Train step of epoch 0:  97%|█████████▋| 6248/6434 [14:39:26<25:07,  8.11s/it, gpt_loss=0.375, loss_mean=0.301][A
+Train step of epoch 0:  97%|█████████▋| 6248/6434 [14:39:34<25:07,  8.11s/it, gpt_loss=0.27, loss_mean=0.298] [A
+Train step of epoch 0:  97%|█████████▋| 6249/6434 [14:39:34<25:08,  8.16s/it, gpt_loss=0.27, loss_mean=0.298][A
+[LID Router Debug] Step: 6250
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [5, 3, 1, 5, 5, 0, 8, 9, 0, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 8, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6249/6434 [14:39:44<25:08,  8.16s/it, gpt_loss=0.287, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6250/6434 [14:39:44<26:30,  8.64s/it, gpt_loss=0.287, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6250/6434 [14:39:52<26:30,  8.64s/it, gpt_loss=0.299, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6251/6434 [14:39:52<25:53,  8.49s/it, gpt_loss=0.299, loss_mean=0.297][A
+Train step of epoch 0:  97%|█████████▋| 6251/6434 [14:40:00<25:53,  8.49s/it, gpt_loss=0.326, loss_mean=0.3]  [A
+Train step of epoch 0:  97%|█████████▋| 6252/6434 [14:40:00<25:20,  8.35s/it, gpt_loss=0.326, loss_mean=0.3][A
+Train step of epoch 0:  97%|█████████▋| 6252/6434 [14:40:09<25:20,  8.35s/it, gpt_loss=0.289, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6253/6434 [14:40:09<26:15,  8.70s/it, gpt_loss=0.289, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6253/6434 [14:40:20<26:15,  8.70s/it, gpt_loss=0.333, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6254/6434 [14:40:20<27:42,  9.24s/it, gpt_loss=0.333, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6254/6434 [14:40:29<27:42,  9.24s/it, gpt_loss=0.326, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6255/6434 [14:40:29<27:00,  9.05s/it, gpt_loss=0.326, loss_mean=0.305][A
+Train step of epoch 0:  97%|█████████▋| 6255/6434 [14:40:37<27:00,  9.05s/it, gpt_loss=0.375, loss_mean=0.312][A
+Train step of epoch 0:  97%|█████████▋| 6256/6434 [14:40:37<26:21,  8.89s/it, gpt_loss=0.375, loss_mean=0.312][A
+Train step of epoch 0:  97%|█████████▋| 6256/6434 [14:40:45<26:21,  8.89s/it, gpt_loss=0.273, loss_mean=0.308][A
+Train step of epoch 0:  97%|█████████▋| 6257/6434 [14:40:45<25:44,  8.72s/it, gpt_loss=0.273, loss_mean=0.308][A
+Train step of epoch 0:  97%|█████████▋| 6257/6434 [14:40:54<25:44,  8.72s/it, gpt_loss=0.218, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6258/6434 [14:40:54<25:37,  8.74s/it, gpt_loss=0.218, loss_mean=0.299][A
+Train step of epoch 0:  97%|█████████▋| 6258/6434 [14:41:04<25:37,  8.74s/it, gpt_loss=0.327, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6259/6434 [14:41:04<26:33,  9.10s/it, gpt_loss=0.327, loss_mean=0.302][A
+[LID Router Debug] Step: 6260
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [1, 9, 5, 0, 1, 0, 4, 2, 5, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6259/6434 [14:41:12<26:33,  9.10s/it, gpt_loss=0.45, loss_mean=0.316] [A
+Train step of epoch 0:  97%|█████████▋| 6260/6434 [14:41:12<25:11,  8.69s/it, gpt_loss=0.45, loss_mean=0.316][A
+Train step of epoch 0:  97%|█████████▋| 6260/6434 [14:41:20<25:11,  8.69s/it, gpt_loss=0.393, loss_mean=0.324][A
+Train step of epoch 0:  97%|█████████▋| 6261/6434 [14:41:20<24:36,  8.53s/it, gpt_loss=0.393, loss_mean=0.324][A
+Train step of epoch 0:  97%|█████████▋| 6261/6434 [14:41:29<24:36,  8.53s/it, gpt_loss=0.24, loss_mean=0.316] [A
+Train step of epoch 0:  97%|█████████▋| 6262/6434 [14:41:29<25:04,  8.75s/it, gpt_loss=0.24, loss_mean=0.316][A
+Train step of epoch 0:  97%|█████████▋| 6262/6434 [14:41:39<25:04,  8.75s/it, gpt_loss=0.188, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6263/6434 [14:41:39<25:32,  8.96s/it, gpt_loss=0.188, loss_mean=0.303][A
+Train step of epoch 0:  97%|█████████▋| 6263/6434 [14:41:49<25:32,  8.96s/it, gpt_loss=0.216, loss_mean=0.294][A
+Train step of epoch 0:  97%|█████████▋| 6264/6434 [14:41:49<26:18,  9.29s/it, gpt_loss=0.216, loss_mean=0.294][A
+Train step of epoch 0:  97%|█████████▋| 6264/6434 [14:41:57<26:18,  9.29s/it, gpt_loss=0.304, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6265/6434 [14:41:57<25:14,  8.96s/it, gpt_loss=0.304, loss_mean=0.295][A
+Train step of epoch 0:  97%|█████████▋| 6265/6434 [14:42:06<25:14,  8.96s/it, gpt_loss=0.366, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6266/6434 [14:42:06<25:23,  9.07s/it, gpt_loss=0.366, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6266/6434 [14:42:16<25:23,  9.07s/it, gpt_loss=0.241, loss_mean=0.296][A
+Train step of epoch 0:  97%|█████████▋| 6267/6434 [14:42:16<26:07,  9.39s/it, gpt_loss=0.241, loss_mean=0.296][A
+Train step of epoch 0:  97%|█████████▋| 6267/6434 [14:42:25<26:07,  9.39s/it, gpt_loss=0.295, loss_mean=0.296][A
+Train step of epoch 0:  97%|█████████▋| 6268/6434 [14:42:25<25:25,  9.19s/it, gpt_loss=0.295, loss_mean=0.296][A
+Train step of epoch 0:  97%|█████████▋| 6268/6434 [14:42:33<25:25,  9.19s/it, gpt_loss=0.27, loss_mean=0.293] [A
+Train step of epoch 0:  97%|█████████▋| 6269/6434 [14:42:33<24:22,  8.86s/it, gpt_loss=0.27, loss_mean=0.293][A
+[LID Router Debug] Step: 6270
+Batch Size: 10
+Audio Batch Size: 137
+LID Assignments: [9, 2, 0, 9, 0, 3, 1, 3, 6, 2]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+Train step of epoch 0:  97%|█████████▋| 6269/6434 [14:42:43<24:22,  8.86s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6270/6434 [14:42:43<24:56,  9.12s/it, gpt_loss=0.287, loss_mean=0.293][A
+Train step of epoch 0:  97%|█████████▋| 6270/6434 [14:42:51<24:56,  9.12s/it, gpt_loss=0.381, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6271/6434 [14:42:51<23:57,  8.82s/it, gpt_loss=0.381, loss_mean=0.302][A
+Train step of epoch 0:  97%|█████████▋| 6271/6434 [14:43:00<23:57,  8.82s/it, gpt_loss=0.331, loss_mean=0.304][A
+Train step of epoch 0:  97%|█████████▋| 6272/6434 [14:43:00<23:44,  8.79s/it, gpt_loss=0.331, loss_mean=0.304][A
+Train step of epoch 0:  97%|█████████▋| 6272/6434 [14:43:09<23:44,  8.79s/it, gpt_loss=0.342, loss_mean=0.308][A
+Train step of epoch 0:  97%|█████████▋| 6273/6434 [14:43:09<23:46,  8.86s/it, gpt_loss=0.342, loss_mean=0.308][A
+Train step of epoch 0:  97%|█████████▋| 6273/6434 [14:43:16<23:46,  8.86s/it, gpt_loss=0.367, loss_mean=0.314][A
+Train step of epoch 0:  98%|█████████▊| 6274/6434 [14:43:16<22:34,  8.47s/it, gpt_loss=0.367, loss_mean=0.314][A
+Train step of epoch 0:  98%|█████████▊| 6274/6434 [14:43:25<22:34,  8.47s/it, gpt_loss=0.319, loss_mean=0.315][A
+Train step of epoch 0:  98%|█████████▊| 6275/6434 [14:43:25<22:43,  8.58s/it, gpt_loss=0.319, loss_mean=0.315][A
+Train step of epoch 0:  98%|█████████▊| 6275/6434 [14:43:35<22:43,  8.58s/it, gpt_loss=0.261, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6276/6434 [14:43:35<23:12,  8.81s/it, gpt_loss=0.261, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6276/6434 [14:43:42<23:12,  8.81s/it, gpt_loss=0.277, loss_mean=0.306][A
+Train step of epoch 0:  98%|█████████▊| 6277/6434 [14:43:42<21:57,  8.39s/it, gpt_loss=0.277, loss_mean=0.306][A
+Train step of epoch 0:  98%|█████████▊| 6277/6434 [14:43:49<21:57,  8.39s/it, gpt_loss=0.317, loss_mean=0.307][A
+Train step of epoch 0:  98%|█████████▊| 6278/6434 [14:43:49<20:47,  8.00s/it, gpt_loss=0.317, loss_mean=0.307][A
+Train step of epoch 0:  98%|█████████▊| 6278/6434 [14:43:57<20:47,  8.00s/it, gpt_loss=0.384, loss_mean=0.315][A
+Train step of epoch 0:  98%|█████████▊| 6279/6434 [14:43:57<20:39,  8.00s/it, gpt_loss=0.384, loss_mean=0.315][A
+[LID Router Debug] Step: 6280
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [0, 4, 2, 3, 5, 7, 4, 1, 0, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 7}
+
+Train step of epoch 0:  98%|█████████▊| 6279/6434 [14:44:05<20:39,  8.00s/it, gpt_loss=0.35, loss_mean=0.318] [A
+Train step of epoch 0:  98%|█████████▊| 6280/6434 [14:44:05<20:37,  8.04s/it, gpt_loss=0.35, loss_mean=0.318][A
+Train step of epoch 0:  98%|█████████▊| 6280/6434 [14:44:15<20:37,  8.04s/it, gpt_loss=0.245, loss_mean=0.311][A
+Train step of epoch 0:  98%|█████████▊| 6281/6434 [14:44:15<21:51,  8.57s/it, gpt_loss=0.245, loss_mean=0.311][A
+Train step of epoch 0:  98%|█████████▊| 6281/6434 [14:44:23<21:51,  8.57s/it, gpt_loss=0.295, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6282/6434 [14:44:23<21:38,  8.54s/it, gpt_loss=0.295, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6282/6434 [14:44:32<21:38,  8.54s/it, gpt_loss=0.24, loss_mean=0.302] [A
+Train step of epoch 0:  98%|█████████▊| 6283/6434 [14:44:32<21:34,  8.57s/it, gpt_loss=0.24, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6283/6434 [14:44:41<21:34,  8.57s/it, gpt_loss=0.295, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6284/6434 [14:44:41<21:56,  8.78s/it, gpt_loss=0.295, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6284/6434 [14:44:49<21:56,  8.78s/it, gpt_loss=0.298, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6285/6434 [14:44:49<20:56,  8.43s/it, gpt_loss=0.298, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6285/6434 [14:44:56<20:56,  8.43s/it, gpt_loss=0.271, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6286/6434 [14:44:56<19:59,  8.10s/it, gpt_loss=0.271, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6286/6434 [14:45:05<19:59,  8.10s/it, gpt_loss=0.294, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6287/6434 [14:45:05<20:08,  8.22s/it, gpt_loss=0.294, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6287/6434 [14:45:14<20:08,  8.22s/it, gpt_loss=0.253, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6288/6434 [14:45:14<20:29,  8.42s/it, gpt_loss=0.253, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6288/6434 [14:45:21<20:29,  8.42s/it, gpt_loss=0.272, loss_mean=0.291][A
+Train step of epoch 0:  98%|█████████▊| 6289/6434 [14:45:21<19:32,  8.09s/it, gpt_loss=0.272, loss_mean=0.291][A
+[LID Router Debug] Step: 6290
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [9, 4, 4, 5, 1, 9, 9, 2, 0, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  98%|█████████▊| 6289/6434 [14:45:29<19:32,  8.09s/it, gpt_loss=0.333, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6290/6434 [14:45:29<19:05,  7.95s/it, gpt_loss=0.333, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6290/6434 [14:45:36<19:05,  7.95s/it, gpt_loss=0.296, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6291/6434 [14:45:36<18:49,  7.90s/it, gpt_loss=0.296, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6291/6434 [14:45:44<18:49,  7.90s/it, gpt_loss=0.295, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6292/6434 [14:45:44<18:38,  7.88s/it, gpt_loss=0.295, loss_mean=0.295][A
+Train step of epoch 0:  98%|█████████▊| 6292/6434 [14:45:52<18:38,  7.88s/it, gpt_loss=0.384, loss_mean=0.304][A
+Train step of epoch 0:  98%|█████████▊| 6293/6434 [14:45:52<18:15,  7.77s/it, gpt_loss=0.384, loss_mean=0.304][A
+Train step of epoch 0:  98%|█████████▊| 6293/6434 [14:46:00<18:15,  7.77s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6294/6434 [14:46:00<18:43,  8.02s/it, gpt_loss=0.278, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6294/6434 [14:46:08<18:43,  8.02s/it, gpt_loss=0.248, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6295/6434 [14:46:08<18:28,  7.98s/it, gpt_loss=0.248, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6295/6434 [14:46:17<18:28,  7.98s/it, gpt_loss=0.262, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6296/6434 [14:46:17<18:34,  8.08s/it, gpt_loss=0.262, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6296/6434 [14:46:24<18:34,  8.08s/it, gpt_loss=0.308, loss_mean=0.294][A
+Train step of epoch 0:  98%|█████████▊| 6297/6434 [14:46:24<18:11,  7.97s/it, gpt_loss=0.308, loss_mean=0.294][A
+Train step of epoch 0:  98%|█████████▊| 6297/6434 [14:46:33<18:11,  7.97s/it, gpt_loss=0.307, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6298/6434 [14:46:33<18:15,  8.05s/it, gpt_loss=0.307, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6298/6434 [14:46:41<18:15,  8.05s/it, gpt_loss=0.242, loss_mean=0.29] [A
+Train step of epoch 0:  98%|█████████▊| 6299/6434 [14:46:41<18:27,  8.20s/it, gpt_loss=0.242, loss_mean=0.29][A
+[LID Router Debug] Step: 6300
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [2, 9, 6, 4, 3, 1, 2, 9, 5, 6]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  98%|█████████▊| 6299/6434 [14:46:50<18:27,  8.20s/it, gpt_loss=0.268, loss_mean=0.288][A
+Train step of epoch 0:  98%|█████████▊| 6300/6434 [14:46:50<18:29,  8.28s/it, gpt_loss=0.268, loss_mean=0.288][A
+Train step of epoch 0:  98%|█████████▊| 6300/6434 [14:46:57<18:29,  8.28s/it, gpt_loss=0.382, loss_mean=0.297][A
+Train step of epoch 0:  98%|█████████▊| 6301/6434 [14:46:57<18:01,  8.13s/it, gpt_loss=0.382, loss_mean=0.297][A
+Train step of epoch 0:  98%|█████████▊| 6301/6434 [14:47:06<18:01,  8.13s/it, gpt_loss=0.334, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6302/6434 [14:47:06<18:25,  8.37s/it, gpt_loss=0.334, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6302/6434 [14:47:14<18:25,  8.37s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6303/6434 [14:47:14<18:01,  8.26s/it, gpt_loss=0.312, loss_mean=0.302][A
+Train step of epoch 0:  98%|█████████▊| 6303/6434 [14:47:23<18:01,  8.26s/it, gpt_loss=0.374, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6304/6434 [14:47:23<18:28,  8.53s/it, gpt_loss=0.374, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6304/6434 [14:47:32<18:28,  8.53s/it, gpt_loss=0.343, loss_mean=0.313][A
+Train step of epoch 0:  98%|█████████▊| 6305/6434 [14:47:32<18:26,  8.58s/it, gpt_loss=0.343, loss_mean=0.313][A
+Train step of epoch 0:  98%|█████████▊| 6305/6434 [14:47:41<18:26,  8.58s/it, gpt_loss=0.285, loss_mean=0.31] [A
+Train step of epoch 0:  98%|█████████▊| 6306/6434 [14:47:41<18:35,  8.72s/it, gpt_loss=0.285, loss_mean=0.31][A
+Train step of epoch 0:  98%|█████████▊| 6306/6434 [14:47:49<18:35,  8.72s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6307/6434 [14:47:49<17:34,  8.31s/it, gpt_loss=0.301, loss_mean=0.309][A
+Train step of epoch 0:  98%|█████████▊| 6307/6434 [14:47:57<17:34,  8.31s/it, gpt_loss=0.297, loss_mean=0.308][A
+Train step of epoch 0:  98%|█████████▊| 6308/6434 [14:47:57<17:49,  8.49s/it, gpt_loss=0.297, loss_mean=0.308][A
+Train step of epoch 0:  98%|█████████▊| 6308/6434 [14:48:06<17:49,  8.49s/it, gpt_loss=0.291, loss_mean=0.306][A
+Train step of epoch 0:  98%|█████████▊| 6309/6434 [14:48:06<17:57,  8.62s/it, gpt_loss=0.291, loss_mean=0.306][A
+[LID Router Debug] Step: 6310
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 5, 4, 0, 9, 0, 5, 0, 1, 6]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+Train step of epoch 0:  98%|█████████▊| 6309/6434 [14:48:13<17:57,  8.62s/it, gpt_loss=0.292, loss_mean=0.305][A
+Train step of epoch 0:  98%|█████████▊| 6310/6434 [14:48:13<16:47,  8.12s/it, gpt_loss=0.292, loss_mean=0.305][A
+Train step of epoch 0:  98%|█████████▊| 6310/6434 [14:48:22<16:47,  8.12s/it, gpt_loss=0.242, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6311/6434 [14:48:22<17:08,  8.36s/it, gpt_loss=0.242, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6311/6434 [14:48:30<17:08,  8.36s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6312/6434 [14:48:30<16:50,  8.28s/it, gpt_loss=0.327, loss_mean=0.301][A
+Train step of epoch 0:  98%|█████████▊| 6312/6434 [14:48:38<16:50,  8.28s/it, gpt_loss=0.284, loss_mean=0.299][A
+Train step of epoch 0:  98%|█████████▊| 6313/6434 [14:48:38<16:35,  8.23s/it, gpt_loss=0.284, loss_mean=0.299][A
+Train step of epoch 0:  98%|█████████▊| 6313/6434 [14:48:47<16:35,  8.23s/it, gpt_loss=0.22, loss_mean=0.291] [A
+Train step of epoch 0:  98%|█████████▊| 6314/6434 [14:48:47<16:35,  8.30s/it, gpt_loss=0.22, loss_mean=0.291][A
+Train step of epoch 0:  98%|█████████▊| 6314/6434 [14:48:55<16:35,  8.30s/it, gpt_loss=0.306, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6315/6434 [14:48:55<16:13,  8.18s/it, gpt_loss=0.306, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6315/6434 [14:49:02<16:13,  8.18s/it, gpt_loss=0.322, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6316/6434 [14:49:02<15:46,  8.03s/it, gpt_loss=0.322, loss_mean=0.296][A
+Train step of epoch 0:  98%|█████████▊| 6316/6434 [14:49:11<15:46,  8.03s/it, gpt_loss=0.25, loss_mean=0.291] [A
+Train step of epoch 0:  98%|█████████▊| 6317/6434 [14:49:11<15:45,  8.08s/it, gpt_loss=0.25, loss_mean=0.291][A
+Train step of epoch 0:  98%|█████████▊| 6317/6434 [14:49:19<15:45,  8.08s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6318/6434 [14:49:20<16:03,  8.31s/it, gpt_loss=0.304, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6318/6434 [14:49:28<16:03,  8.31s/it, gpt_loss=0.354, loss_mean=0.299][A
+Train step of epoch 0:  98%|█████████▊| 6319/6434 [14:49:28<16:10,  8.44s/it, gpt_loss=0.354, loss_mean=0.299][A
+[LID Router Debug] Step: 6320
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [1, 3, 1, 9, 2, 9, 0, 5, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+Train step of epoch 0:  98%|█████████▊| 6319/6434 [14:49:37<16:10,  8.44s/it, gpt_loss=0.292, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6320/6434 [14:49:37<16:16,  8.57s/it, gpt_loss=0.292, loss_mean=0.298][A
+Train step of epoch 0:  98%|█████████▊| 6320/6434 [14:49:45<16:16,  8.57s/it, gpt_loss=0.241, loss_mean=0.292][A
+Train step of epoch 0:  98%|█████████▊| 6321/6434 [14:49:45<15:58,  8.48s/it, gpt_loss=0.241, loss_mean=0.292][A
+Train step of epoch 0:  98%|█████████▊| 6321/6434 [14:49:54<15:58,  8.48s/it, gpt_loss=0.26, loss_mean=0.289] [A
+Train step of epoch 0:  98%|█████████▊| 6322/6434 [14:49:54<15:39,  8.39s/it, gpt_loss=0.26, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6322/6434 [14:50:02<15:39,  8.39s/it, gpt_loss=0.217, loss_mean=0.282][A
+Train step of epoch 0:  98%|█████████▊| 6323/6434 [14:50:02<15:32,  8.40s/it, gpt_loss=0.217, loss_mean=0.282][A
+Train step of epoch 0:  98%|█████████▊| 6323/6434 [14:50:11<15:32,  8.40s/it, gpt_loss=0.29, loss_mean=0.283] [A
+Train step of epoch 0:  98%|█████████▊| 6324/6434 [14:50:11<15:42,  8.57s/it, gpt_loss=0.29, loss_mean=0.283][A
+Train step of epoch 0:  98%|█████████▊| 6324/6434 [14:50:19<15:42,  8.57s/it, gpt_loss=0.313, loss_mean=0.286][A
+Train step of epoch 0:  98%|█████████▊| 6325/6434 [14:50:19<15:25,  8.49s/it, gpt_loss=0.313, loss_mean=0.286][A
+Train step of epoch 0:  98%|█████████▊| 6325/6434 [14:50:28<15:25,  8.49s/it, gpt_loss=0.354, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6326/6434 [14:50:28<15:10,  8.43s/it, gpt_loss=0.354, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6326/6434 [14:50:36<15:10,  8.43s/it, gpt_loss=0.251, loss_mean=0.288][A
+Train step of epoch 0:  98%|█████████▊| 6327/6434 [14:50:36<15:11,  8.52s/it, gpt_loss=0.251, loss_mean=0.288][A
+Train step of epoch 0:  98%|█████████▊| 6327/6434 [14:50:45<15:11,  8.52s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6328/6434 [14:50:45<15:08,  8.58s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6328/6434 [14:50:53<15:08,  8.58s/it, gpt_loss=0.317, loss_mean=0.292][A
+Train step of epoch 0:  98%|█████████▊| 6329/6434 [14:50:53<14:29,  8.28s/it, gpt_loss=0.317, loss_mean=0.292][A
+[LID Router Debug] Step: 6330
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 2, 6, 4, 2, 9, 4, 3, 9, 0]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+Train step of epoch 0:  98%|█████████▊| 6329/6434 [14:51:01<14:29,  8.28s/it, gpt_loss=0.246, loss_mean=0.287][A
+Train step of epoch 0:  98%|█████████▊| 6330/6434 [14:51:01<14:31,  8.38s/it, gpt_loss=0.246, loss_mean=0.287][A
+Train step of epoch 0:  98%|█████████▊| 6330/6434 [14:51:10<14:31,  8.38s/it, gpt_loss=0.33, loss_mean=0.292] [A
+Train step of epoch 0:  98%|█████████▊| 6331/6434 [14:51:10<14:38,  8.53s/it, gpt_loss=0.33, loss_mean=0.292][A
+Train step of epoch 0:  98%|█████████▊| 6331/6434 [14:51:19<14:38,  8.53s/it, gpt_loss=0.303, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6332/6434 [14:51:19<14:31,  8.55s/it, gpt_loss=0.303, loss_mean=0.293][A
+Train step of epoch 0:  98%|█████████▊| 6332/6434 [14:51:27<14:31,  8.55s/it, gpt_loss=0.272, loss_mean=0.291][A
+Train step of epoch 0:  98%|█████████▊| 6333/6434 [14:51:27<14:02,  8.34s/it, gpt_loss=0.272, loss_mean=0.291][A
+Train step of epoch 0:  98%|█████████▊| 6333/6434 [14:51:36<14:02,  8.34s/it, gpt_loss=0.27, loss_mean=0.289] [A
+Train step of epoch 0:  98%|█████████▊| 6334/6434 [14:51:36<14:27,  8.68s/it, gpt_loss=0.27, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6334/6434 [14:51:45<14:27,  8.68s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6335/6434 [14:51:45<14:34,  8.84s/it, gpt_loss=0.296, loss_mean=0.289][A
+Train step of epoch 0:  98%|█████████▊| 6335/6434 [14:51:53<14:34,  8.84s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  98%|█████████▊| 6336/6434 [14:51:53<13:58,  8.56s/it, gpt_loss=0.223, loss_mean=0.283][A
+Train step of epoch 0:  98%|█████████▊| 6336/6434 [14:52:01<13:58,  8.56s/it, gpt_loss=0.258, loss_mean=0.28] [A
+Train step of epoch 0:  98%|█████████▊| 6337/6434 [14:52:01<13:37,  8.43s/it, gpt_loss=0.258, loss_mean=0.28][A
+Train step of epoch 0:  98%|█████████▊| 6337/6434 [14:52:09<13:37,  8.43s/it, gpt_loss=0.298, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▊| 6338/6434 [14:52:09<13:20,  8.34s/it, gpt_loss=0.298, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▊| 6338/6434 [14:52:18<13:20,  8.34s/it, gpt_loss=0.425, loss_mean=0.296][A
+Train step of epoch 0:  99%|█████████▊| 6339/6434 [14:52:18<13:32,  8.55s/it, gpt_loss=0.425, loss_mean=0.296][A
+[LID Router Debug] Step: 6340
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [2, 0, 9, 2, 4, 1, 5, 4, 5, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  99%|█████████▊| 6339/6434 [14:52:27<13:32,  8.55s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▊| 6340/6434 [14:52:27<13:23,  8.55s/it, gpt_loss=0.271, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▊| 6340/6434 [14:52:34<13:23,  8.55s/it, gpt_loss=0.334, loss_mean=0.298][A
+Train step of epoch 0:  99%|█████████▊| 6341/6434 [14:52:34<12:47,  8.25s/it, gpt_loss=0.334, loss_mean=0.298][A
+Train step of epoch 0:  99%|█████████▊| 6341/6434 [14:52:42<12:47,  8.25s/it, gpt_loss=0.275, loss_mean=0.296][A
+Train step of epoch 0:  99%|█████████▊| 6342/6434 [14:52:42<12:16,  8.00s/it, gpt_loss=0.275, loss_mean=0.296][A
+Train step of epoch 0:  99%|█████████▊| 6342/6434 [14:52:51<12:16,  8.00s/it, gpt_loss=0.25, loss_mean=0.291] [A
+Train step of epoch 0:  99%|█████████▊| 6343/6434 [14:52:51<12:23,  8.17s/it, gpt_loss=0.25, loss_mean=0.291][A
+Train step of epoch 0:  99%|█████████▊| 6343/6434 [14:52:59<12:23,  8.17s/it, gpt_loss=0.269, loss_mean=0.289][A
+Train step of epoch 0:  99%|█████████▊| 6344/6434 [14:52:59<12:14,  8.16s/it, gpt_loss=0.269, loss_mean=0.289][A
+Train step of epoch 0:  99%|█████████▊| 6344/6434 [14:53:08<12:14,  8.16s/it, gpt_loss=0.259, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▊| 6345/6434 [14:53:08<12:28,  8.41s/it, gpt_loss=0.259, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▊| 6345/6434 [14:53:16<12:28,  8.41s/it, gpt_loss=0.368, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▊| 6346/6434 [14:53:16<12:10,  8.30s/it, gpt_loss=0.368, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▊| 6346/6434 [14:53:23<12:10,  8.30s/it, gpt_loss=0.261, loss_mean=0.291][A
+Train step of epoch 0:  99%|█████████▊| 6347/6434 [14:53:23<11:41,  8.07s/it, gpt_loss=0.261, loss_mean=0.291][A
+Train step of epoch 0:  99%|█████████▊| 6347/6434 [14:53:33<11:41,  8.07s/it, gpt_loss=0.29, loss_mean=0.291] [A
+Train step of epoch 0:  99%|█████████▊| 6348/6434 [14:53:33<12:29,  8.72s/it, gpt_loss=0.29, loss_mean=0.291][A
+Train step of epoch 0:  99%|█████████▊| 6348/6434 [14:53:42<12:29,  8.72s/it, gpt_loss=0.263, loss_mean=0.288][A
+Train step of epoch 0:  99%|█████████▊| 6349/6434 [14:53:42<12:21,  8.72s/it, gpt_loss=0.263, loss_mean=0.288][A
+[LID Router Debug] Step: 6350
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [3, 9, 2, 3, 4, 4, 2, 1, 1, 9]
+Active Experts in Batch: {1, 2, 3, 4, 9}
+
+Train step of epoch 0:  99%|█████████▊| 6349/6434 [14:53:51<12:21,  8.72s/it, gpt_loss=0.26, loss_mean=0.285] [A
+Train step of epoch 0:  99%|█████████▊| 6350/6434 [14:53:51<12:21,  8.83s/it, gpt_loss=0.26, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▊| 6350/6434 [14:54:00<12:21,  8.83s/it, gpt_loss=0.236, loss_mean=0.28][A
+Train step of epoch 0:  99%|█████████▊| 6351/6434 [14:54:00<12:13,  8.84s/it, gpt_loss=0.236, loss_mean=0.28][A
+Train step of epoch 0:  99%|█████████▊| 6351/6434 [14:54:08<12:13,  8.84s/it, gpt_loss=0.264, loss_mean=0.279][A
+Train step of epoch 0:  99%|█████████▊| 6352/6434 [14:54:08<11:30,  8.42s/it, gpt_loss=0.264, loss_mean=0.279][A
+Train step of epoch 0:  99%|█████████▊| 6352/6434 [14:54:16<11:30,  8.42s/it, gpt_loss=0.237, loss_mean=0.274][A
+Train step of epoch 0:  99%|█████████▊| 6353/6434 [14:54:16<11:19,  8.38s/it, gpt_loss=0.237, loss_mean=0.274][A
+Train step of epoch 0:  99%|█████████▊| 6353/6434 [14:54:24<11:19,  8.38s/it, gpt_loss=0.354, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6354/6434 [14:54:24<11:07,  8.34s/it, gpt_loss=0.354, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6354/6434 [14:54:33<11:07,  8.34s/it, gpt_loss=0.308, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6355/6434 [14:54:33<11:08,  8.46s/it, gpt_loss=0.308, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6355/6434 [14:54:41<11:08,  8.46s/it, gpt_loss=0.296, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▉| 6356/6434 [14:54:41<10:59,  8.45s/it, gpt_loss=0.296, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▉| 6356/6434 [14:54:49<10:59,  8.45s/it, gpt_loss=0.228, loss_mean=0.28] [A
+Train step of epoch 0:  99%|█████████▉| 6357/6434 [14:54:49<10:38,  8.29s/it, gpt_loss=0.228, loss_mean=0.28][A
+Train step of epoch 0:  99%|█████████▉| 6357/6434 [14:54:59<10:38,  8.29s/it, gpt_loss=0.218, loss_mean=0.274][A
+Train step of epoch 0:  99%|█████████▉| 6358/6434 [14:54:59<11:00,  8.69s/it, gpt_loss=0.218, loss_mean=0.274][A
+Train step of epoch 0:  99%|█████████▉| 6358/6434 [14:55:07<11:00,  8.69s/it, gpt_loss=0.268, loss_mean=0.273][A
+Train step of epoch 0:  99%|█████████▉| 6359/6434 [14:55:07<10:33,  8.44s/it, gpt_loss=0.268, loss_mean=0.273][A
+[LID Router Debug] Step: 6360
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [4, 9, 1, 1, 9, 2, 5, 0, 0, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+Train step of epoch 0:  99%|█████████▉| 6359/6434 [14:55:17<10:33,  8.44s/it, gpt_loss=0.324, loss_mean=0.278][A
+Train step of epoch 0:  99%|█████████▉| 6360/6434 [14:55:17<11:06,  9.00s/it, gpt_loss=0.324, loss_mean=0.278][A
+Train step of epoch 0:  99%|█████████▉| 6360/6434 [14:55:25<11:06,  9.00s/it, gpt_loss=0.33, loss_mean=0.284] [A
+Train step of epoch 0:  99%|█████████▉| 6361/6434 [14:55:25<10:44,  8.83s/it, gpt_loss=0.33, loss_mean=0.284][A
+Train step of epoch 0:  99%|█████████▉| 6361/6434 [14:55:34<10:44,  8.83s/it, gpt_loss=0.283, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6362/6434 [14:55:34<10:20,  8.62s/it, gpt_loss=0.283, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6362/6434 [14:55:42<10:20,  8.62s/it, gpt_loss=0.267, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6363/6434 [14:55:42<10:13,  8.64s/it, gpt_loss=0.267, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6363/6434 [14:55:52<10:13,  8.64s/it, gpt_loss=0.298, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6364/6434 [14:55:52<10:20,  8.87s/it, gpt_loss=0.298, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6364/6434 [14:56:01<10:20,  8.87s/it, gpt_loss=0.275, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6365/6434 [14:56:01<10:17,  8.95s/it, gpt_loss=0.275, loss_mean=0.283][A
+Train step of epoch 0:  99%|█████████▉| 6365/6434 [14:56:10<10:17,  8.95s/it, gpt_loss=0.326, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6366/6434 [14:56:10<10:07,  8.94s/it, gpt_loss=0.326, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6366/6434 [14:56:18<10:07,  8.94s/it, gpt_loss=0.236, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6367/6434 [14:56:18<09:40,  8.66s/it, gpt_loss=0.236, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6367/6434 [14:56:26<09:40,  8.66s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6368/6434 [14:56:26<09:16,  8.43s/it, gpt_loss=0.288, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6368/6434 [14:56:34<09:16,  8.43s/it, gpt_loss=0.282, loss_mean=0.282][A
+Train step of epoch 0:  99%|█████████▉| 6369/6434 [14:56:34<09:14,  8.54s/it, gpt_loss=0.282, loss_mean=0.282][A
+[LID Router Debug] Step: 6370
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [4, 5, 1, 2, 3, 4, 1, 9, 6, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+Train step of epoch 0:  99%|█████████▉| 6369/6434 [14:56:42<09:14,  8.54s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  99%|█████████▉| 6370/6434 [14:56:42<08:46,  8.22s/it, gpt_loss=0.267, loss_mean=0.281][A
+Train step of epoch 0:  99%|█████████▉| 6370/6434 [14:56:51<08:46,  8.22s/it, gpt_loss=0.26, loss_mean=0.279] [A
+Train step of epoch 0:  99%|█████████▉| 6371/6434 [14:56:51<08:46,  8.36s/it, gpt_loss=0.26, loss_mean=0.279][A
+Train step of epoch 0:  99%|█████████▉| 6371/6434 [14:56:59<08:46,  8.36s/it, gpt_loss=0.35, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▉| 6372/6434 [14:56:59<08:45,  8.48s/it, gpt_loss=0.35, loss_mean=0.286][A
+Train step of epoch 0:  99%|█████████▉| 6372/6434 [14:57:07<08:45,  8.48s/it, gpt_loss=0.293, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6373/6434 [14:57:07<08:27,  8.32s/it, gpt_loss=0.293, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6373/6434 [14:57:17<08:27,  8.32s/it, gpt_loss=0.263, loss_mean=0.284][A
+Train step of epoch 0:  99%|█████████▉| 6374/6434 [14:57:17<08:41,  8.68s/it, gpt_loss=0.263, loss_mean=0.284][A
+Train step of epoch 0:  99%|█████████▉| 6374/6434 [14:57:25<08:41,  8.68s/it, gpt_loss=0.294, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6375/6434 [14:57:25<08:22,  8.51s/it, gpt_loss=0.294, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6375/6434 [14:57:34<08:22,  8.51s/it, gpt_loss=0.301, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6376/6434 [14:57:34<08:18,  8.60s/it, gpt_loss=0.301, loss_mean=0.287][A
+Train step of epoch 0:  99%|█████████▉| 6376/6434 [14:57:43<08:18,  8.60s/it, gpt_loss=0.267, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6377/6434 [14:57:43<08:16,  8.72s/it, gpt_loss=0.267, loss_mean=0.285][A
+Train step of epoch 0:  99%|█████████▉| 6377/6434 [14:57:50<08:16,  8.72s/it, gpt_loss=0.312, loss_mean=0.288][A
+Train step of epoch 0:  99%|█████████▉| 6378/6434 [14:57:50<07:51,  8.43s/it, gpt_loss=0.312, loss_mean=0.288][A
+Train step of epoch 0:  99%|█████████▉| 6378/6434 [14:58:00<07:51,  8.43s/it, gpt_loss=0.226, loss_mean=0.281][A
+Train step of epoch 0:  99%|█████████▉| 6379/6434 [14:58:00<08:01,  8.75s/it, gpt_loss=0.226, loss_mean=0.281][A
+[LID Router Debug] Step: 6380
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 4, 3, 2, 2, 10, 6, 4, 3, 4]
+Active Experts in Batch: {2, 3, 4, 6, 10}
+
+Train step of epoch 0:  99%|█████████▉| 6379/6434 [14:58:09<08:01,  8.75s/it, gpt_loss=0.305, loss_mean=0.284][A
+Train step of epoch 0:  99%|█████████▉| 6380/6434 [14:58:09<07:53,  8.78s/it, gpt_loss=0.305, loss_mean=0.284][A
+Train step of epoch 0:  99%|█████████▉| 6380/6434 [14:58:17<07:53,  8.78s/it, gpt_loss=0.42, loss_mean=0.297] [A
+Train step of epoch 0:  99%|█████████▉| 6381/6434 [14:58:17<07:29,  8.48s/it, gpt_loss=0.42, loss_mean=0.297][A
+Train step of epoch 0:  99%|█████████▉| 6381/6434 [14:58:25<07:29,  8.48s/it, gpt_loss=0.313, loss_mean=0.299][A
+Train step of epoch 0:  99%|█████████▉| 6382/6434 [14:58:25<07:25,  8.56s/it, gpt_loss=0.313, loss_mean=0.299][A
+Train step of epoch 0:  99%|█████████▉| 6382/6434 [14:58:33<07:25,  8.56s/it, gpt_loss=0.357, loss_mean=0.305][A
+Train step of epoch 0:  99%|█████████▉| 6383/6434 [14:58:33<07:01,  8.26s/it, gpt_loss=0.357, loss_mean=0.305][A
+Train step of epoch 0:  99%|█████████▉| 6383/6434 [14:58:42<07:01,  8.26s/it, gpt_loss=0.215, loss_mean=0.296][A
+Train step of epoch 0:  99%|█████████▉| 6384/6434 [14:58:42<07:07,  8.56s/it, gpt_loss=0.215, loss_mean=0.296][A
+Train step of epoch 0:  99%|█████████▉| 6384/6434 [14:58:51<07:07,  8.56s/it, gpt_loss=0.29, loss_mean=0.295] [A
+Train step of epoch 0:  99%|█████████▉| 6385/6434 [14:58:51<07:08,  8.74s/it, gpt_loss=0.29, loss_mean=0.295][A
+Train step of epoch 0:  99%|█████████▉| 6385/6434 [14:58:59<07:08,  8.74s/it, gpt_loss=0.28, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▉| 6386/6434 [14:58:59<06:45,  8.46s/it, gpt_loss=0.28, loss_mean=0.294][A
+Train step of epoch 0:  99%|█████████▉| 6386/6434 [14:59:07<06:45,  8.46s/it, gpt_loss=0.277, loss_mean=0.292][A
+Train step of epoch 0:  99%|█████████▉| 6387/6434 [14:59:07<06:25,  8.20s/it, gpt_loss=0.277, loss_mean=0.292][A
+Train step of epoch 0:  99%|█████████▉| 6387/6434 [14:59:16<06:25,  8.20s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  99%|█████████▉| 6388/6434 [14:59:16<06:26,  8.40s/it, gpt_loss=0.348, loss_mean=0.298][A
+Train step of epoch 0:  99%|█████████▉| 6388/6434 [14:59:24<06:26,  8.40s/it, gpt_loss=0.392, loss_mean=0.307][A
+Train step of epoch 0:  99%|█████████▉| 6389/6434 [14:59:24<06:15,  8.34s/it, gpt_loss=0.392, loss_mean=0.307][A
+[LID Router Debug] Step: 6390
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [5, 4, 4, 9, 5, 5, 3, 9, 2, 5]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+Train step of epoch 0:  99%|█████████▉| 6389/6434 [14:59:32<06:15,  8.34s/it, gpt_loss=0.328, loss_mean=0.309][A
+Train step of epoch 0:  99%|█████████▉| 6390/6434 [14:59:32<06:10,  8.43s/it, gpt_loss=0.328, loss_mean=0.309][A
+Train step of epoch 0:  99%|█████████▉| 6390/6434 [14:59:40<06:10,  8.43s/it, gpt_loss=0.255, loss_mean=0.304][A
+Train step of epoch 0:  99%|█████████▉| 6391/6434 [14:59:40<05:51,  8.18s/it, gpt_loss=0.255, loss_mean=0.304][A
+Train step of epoch 0:  99%|█████████▉| 6391/6434 [14:59:49<05:51,  8.18s/it, gpt_loss=0.33, loss_mean=0.306] [A
+Train step of epoch 0:  99%|█████████▉| 6392/6434 [14:59:49<05:51,  8.37s/it, gpt_loss=0.33, loss_mean=0.306][A
+Train step of epoch 0:  99%|█████████▉| 6392/6434 [14:59:58<05:51,  8.37s/it, gpt_loss=0.295, loss_mean=0.305][A
+Train step of epoch 0:  99%|█████████▉| 6393/6434 [14:59:58<05:53,  8.61s/it, gpt_loss=0.295, loss_mean=0.305][A
+Train step of epoch 0:  99%|█████████▉| 6393/6434 [15:00:06<05:53,  8.61s/it, gpt_loss=0.388, loss_mean=0.313][A
+Train step of epoch 0:  99%|█████████▉| 6394/6434 [15:00:06<05:39,  8.48s/it, gpt_loss=0.388, loss_mean=0.313][A
+Train step of epoch 0:  99%|█████████▉| 6394/6434 [15:00:14<05:39,  8.48s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  99%|█████████▉| 6395/6434 [15:00:14<05:20,  8.22s/it, gpt_loss=0.315, loss_mean=0.314][A
+Train step of epoch 0:  99%|█████████▉| 6395/6434 [15:00:22<05:20,  8.22s/it, gpt_loss=0.313, loss_mean=0.314][A
+Train step of epoch 0:  99%|█████████▉| 6396/6434 [15:00:22<05:07,  8.09s/it, gpt_loss=0.313, loss_mean=0.314][A
+Train step of epoch 0:  99%|█████████▉| 6396/6434 [15:00:29<05:07,  8.09s/it, gpt_loss=0.362, loss_mean=0.318][A
+Train step of epoch 0:  99%|█████████▉| 6397/6434 [15:00:29<04:54,  7.95s/it, gpt_loss=0.362, loss_mean=0.318][A
+Train step of epoch 0:  99%|█████████▉| 6397/6434 [15:00:39<04:54,  7.95s/it, gpt_loss=0.227, loss_mean=0.309][A
+Train step of epoch 0:  99%|█████████▉| 6398/6434 [15:00:39<05:02,  8.41s/it, gpt_loss=0.227, loss_mean=0.309][A
+Train step of epoch 0:  99%|█████████▉| 6398/6434 [15:00:47<05:02,  8.41s/it, gpt_loss=0.333, loss_mean=0.312][A
+Train step of epoch 0:  99%|█████████▉| 6399/6434 [15:00:47<04:50,  8.30s/it, gpt_loss=0.333, loss_mean=0.312][A
+[LID Router Debug] Step: 6400
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [1, 4, 4, 4, 9, 3, 2, 9, 5, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+[2026-02-07 06:56:59,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=0, lr=[1.5230732029798301e-05, 1.5230732029798301e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 06:56:59,173] [INFO] [timer.py:260:stop] epoch=0/micro_step=6400/global_step=3200, RunningAvgSamplesPerSec=4.745773697355563, CurrSamplesPerSec=4.987406344272568, MemAllocated=12.6GB, MaxMemAllocated=49.73GB
+
+Train step of epoch 0:  99%|█████████▉| 6399/6434 [15:00:55<04:50,  8.30s/it, gpt_loss=0.305, loss_mean=0.311][A
+Train step of epoch 0:  99%|█████████▉| 6400/6434 [15:00:55<04:39,  8.22s/it, gpt_loss=0.305, loss_mean=0.311][A
+Train step of epoch 0:  99%|█████████▉| 6400/6434 [15:01:02<04:39,  8.22s/it, gpt_loss=0.313, loss_mean=0.311][A
+Train step of epoch 0:  99%|█████████▉| 6401/6434 [15:01:02<04:22,  7.95s/it, gpt_loss=0.313, loss_mean=0.311][A
+Train step of epoch 0:  99%|█████████▉| 6401/6434 [15:01:09<04:22,  7.95s/it, gpt_loss=0.203, loss_mean=0.3]  [A
+Train step of epoch 0: 100%|█████████▉| 6402/6434 [15:01:09<04:06,  7.70s/it, gpt_loss=0.203, loss_mean=0.3][A
+Train step of epoch 0: 100%|█████████▉| 6402/6434 [15:01:18<04:06,  7.70s/it, gpt_loss=0.262, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6403/6434 [15:01:18<04:05,  7.91s/it, gpt_loss=0.262, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6403/6434 [15:01:26<04:05,  7.91s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0: 100%|█████████▉| 6404/6434 [15:01:26<04:05,  8.19s/it, gpt_loss=0.301, loss_mean=0.297][A
+Train step of epoch 0: 100%|█████████▉| 6404/6434 [15:01:35<04:05,  8.19s/it, gpt_loss=0.276, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6405/6434 [15:01:35<03:59,  8.27s/it, gpt_loss=0.276, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6405/6434 [15:01:44<03:59,  8.27s/it, gpt_loss=0.31, loss_mean=0.296] [A
+Train step of epoch 0: 100%|█████████▉| 6406/6434 [15:01:44<03:55,  8.41s/it, gpt_loss=0.31, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6406/6434 [15:01:52<03:55,  8.41s/it, gpt_loss=0.366, loss_mean=0.303][A
+Train step of epoch 0: 100%|█████████▉| 6407/6434 [15:01:52<03:44,  8.33s/it, gpt_loss=0.366, loss_mean=0.303][A
+Train step of epoch 0: 100%|█████████▉| 6407/6434 [15:02:00<03:44,  8.33s/it, gpt_loss=0.224, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6408/6434 [15:02:00<03:36,  8.33s/it, gpt_loss=0.224, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6408/6434 [15:02:08<03:36,  8.33s/it, gpt_loss=0.302, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6409/6434 [15:02:08<03:29,  8.37s/it, gpt_loss=0.302, loss_mean=0.296][A
+[LID Router Debug] Step: 6410
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [1, 4, 9, 4, 6, 1, 6, 7, 6, 2]
+Active Experts in Batch: {1, 2, 4, 6, 7, 9}
+
+Train step of epoch 0: 100%|█████████▉| 6409/6434 [15:02:18<03:29,  8.37s/it, gpt_loss=0.294, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6410/6434 [15:02:18<03:26,  8.60s/it, gpt_loss=0.294, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6410/6434 [15:02:27<03:26,  8.60s/it, gpt_loss=0.354, loss_mean=0.302][A
+Train step of epoch 0: 100%|█████████▉| 6411/6434 [15:02:27<03:19,  8.68s/it, gpt_loss=0.354, loss_mean=0.302][A
+Train step of epoch 0: 100%|█████████▉| 6411/6434 [15:02:36<03:19,  8.68s/it, gpt_loss=0.319, loss_mean=0.303][A
+Train step of epoch 0: 100%|█████████▉| 6412/6434 [15:02:36<03:14,  8.84s/it, gpt_loss=0.319, loss_mean=0.303][A
+Train step of epoch 0: 100%|█████████▉| 6412/6434 [15:02:45<03:14,  8.84s/it, gpt_loss=0.268, loss_mean=0.3]  [A
+Train step of epoch 0: 100%|█████████▉| 6413/6434 [15:02:45<03:06,  8.90s/it, gpt_loss=0.268, loss_mean=0.3][A
+Train step of epoch 0: 100%|█████████▉| 6413/6434 [15:02:54<03:06,  8.90s/it, gpt_loss=0.219, loss_mean=0.292][A
+Train step of epoch 0: 100%|█████████▉| 6414/6434 [15:02:54<03:02,  9.13s/it, gpt_loss=0.219, loss_mean=0.292][A
+Train step of epoch 0: 100%|█████████▉| 6414/6434 [15:03:03<03:02,  9.13s/it, gpt_loss=0.257, loss_mean=0.288][A
+Train step of epoch 0: 100%|█████████▉| 6415/6434 [15:03:03<02:51,  9.01s/it, gpt_loss=0.257, loss_mean=0.288][A
+Train step of epoch 0: 100%|█████████▉| 6415/6434 [15:03:11<02:51,  9.01s/it, gpt_loss=0.338, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6416/6434 [15:03:11<02:37,  8.74s/it, gpt_loss=0.338, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6416/6434 [15:03:20<02:37,  8.74s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6417/6434 [15:03:20<02:28,  8.76s/it, gpt_loss=0.295, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6417/6434 [15:03:29<02:28,  8.76s/it, gpt_loss=0.271, loss_mean=0.291][A
+Train step of epoch 0: 100%|█████████▉| 6418/6434 [15:03:29<02:19,  8.72s/it, gpt_loss=0.271, loss_mean=0.291][A
+Train step of epoch 0: 100%|█████████▉| 6418/6434 [15:03:36<02:19,  8.72s/it, gpt_loss=0.307, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6419/6434 [15:03:36<02:06,  8.42s/it, gpt_loss=0.307, loss_mean=0.293][A
+[LID Router Debug] Step: 6420
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 0, 9, 4, 2, 0, 0, 5, 4, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+Train step of epoch 0: 100%|█████████▉| 6419/6434 [15:03:45<02:06,  8.42s/it, gpt_loss=0.319, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6420/6434 [15:03:45<02:00,  8.61s/it, gpt_loss=0.319, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6420/6434 [15:03:54<02:00,  8.61s/it, gpt_loss=0.29, loss_mean=0.295] [A
+Train step of epoch 0: 100%|█████████▉| 6421/6434 [15:03:54<01:52,  8.66s/it, gpt_loss=0.29, loss_mean=0.295][A
+Train step of epoch 0: 100%|█████████▉| 6421/6434 [15:04:03<01:52,  8.66s/it, gpt_loss=0.267, loss_mean=0.292][A
+Train step of epoch 0: 100%|█████████▉| 6422/6434 [15:04:03<01:42,  8.58s/it, gpt_loss=0.267, loss_mean=0.292][A
+Train step of epoch 0: 100%|█████████▉| 6422/6434 [15:04:12<01:42,  8.58s/it, gpt_loss=0.327, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6423/6434 [15:04:12<01:35,  8.67s/it, gpt_loss=0.327, loss_mean=0.296][A
+Train step of epoch 0: 100%|█████████▉| 6423/6434 [15:04:20<01:35,  8.67s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6424/6434 [15:04:20<01:25,  8.55s/it, gpt_loss=0.271, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6424/6434 [15:04:28<01:25,  8.55s/it, gpt_loss=0.378, loss_mean=0.302][A
+Train step of epoch 0: 100%|█████████▉| 6425/6434 [15:04:28<01:15,  8.44s/it, gpt_loss=0.378, loss_mean=0.302][A
+Train step of epoch 0: 100%|█████████▉| 6425/6434 [15:04:37<01:15,  8.44s/it, gpt_loss=0.269, loss_mean=0.298][A
+Train step of epoch 0: 100%|█████████▉| 6426/6434 [15:04:37<01:08,  8.55s/it, gpt_loss=0.269, loss_mean=0.298][A
+Train step of epoch 0: 100%|█████████▉| 6426/6434 [15:04:45<01:08,  8.55s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0: 100%|█████████▉| 6427/6434 [15:04:45<00:58,  8.32s/it, gpt_loss=0.257, loss_mean=0.294][A
+Train step of epoch 0: 100%|█████████▉| 6427/6434 [15:04:52<00:58,  8.32s/it, gpt_loss=0.28, loss_mean=0.293] [A
+Train step of epoch 0: 100%|█████████▉| 6428/6434 [15:04:52<00:48,  8.06s/it, gpt_loss=0.28, loss_mean=0.293][A
+Train step of epoch 0: 100%|█████████▉| 6428/6434 [15:05:01<00:48,  8.06s/it, gpt_loss=0.252, loss_mean=0.289][A
+Train step of epoch 0: 100%|█████████▉| 6429/6434 [15:05:01<00:41,  8.26s/it, gpt_loss=0.252, loss_mean=0.289][A
+[LID Router Debug] Step: 6430
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [2, 2, 2, 3, 5, 4, 3, 3, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5}
+
+Train step of epoch 0: 100%|█████████▉| 6429/6434 [15:05:10<00:41,  8.26s/it, gpt_loss=0.3, loss_mean=0.29]   [A
+Train step of epoch 0: 100%|█████████▉| 6430/6434 [15:05:10<00:33,  8.48s/it, gpt_loss=0.3, loss_mean=0.29][A
+Train step of epoch 0: 100%|█████████▉| 6430/6434 [15:05:17<00:33,  8.48s/it, gpt_loss=0.277, loss_mean=0.289][A
+Train step of epoch 0: 100%|█████████▉| 6431/6434 [15:05:17<00:24,  8.21s/it, gpt_loss=0.277, loss_mean=0.289][A
+Train step of epoch 0: 100%|█████████▉| 6431/6434 [15:05:24<00:24,  8.21s/it, gpt_loss=0.298, loss_mean=0.29] [A
+Train step of epoch 0: 100%|█████████▉| 6432/6434 [15:05:24<00:15,  7.84s/it, gpt_loss=0.298, loss_mean=0.29][A
+Train step of epoch 0: 100%|█████████▉| 6432/6434 [15:05:33<00:15,  7.84s/it, gpt_loss=0.234, loss_mean=0.284][A
+Train step of epoch 0: 100%|█████████▉| 6433/6434 [15:05:33<00:07,  7.98s/it, gpt_loss=0.234, loss_mean=0.284][A
+Train step of epoch 0: 100%|█████████▉| 6433/6434 [15:05:41<00:07,  7.98s/it, gpt_loss=0.294, loss_mean=0.285][A
+Train step of epoch 0: 100%|██████████| 6434/6434 [15:05:41<00:00,  7.97s/it, gpt_loss=0.294, loss_mean=0.285][ATrain epoch:  33%|███▎      | 1/3 [15:05:42<30:11:25, 54342.61s/it]
+
+Train step of epoch 1:   0%|          | 0/6434 [00:00<?, ?it/s][A[ATrain step of epoch 0: 100%|██████████| 6434/6434 [15:05:42<00:00,  8.45s/it, gpt_loss=0.294, loss_mean=0.285]
+
+
+Train step of epoch 1:   0%|          | 0/6434 [00:17<?, ?it/s, gpt_loss=0.216, loss_mean=0.0216][A[A
+
+Train step of epoch 1:   0%|          | 1/6434 [00:17<32:00:30, 17.91s/it, gpt_loss=0.216, loss_mean=0.0216][A[A
+
+Train step of epoch 1:   0%|          | 1/6434 [00:26<32:00:30, 17.91s/it, gpt_loss=0.266, loss_mean=0.046] [A[A
+
+Train step of epoch 1:   0%|          | 2/6434 [00:26<22:16:19, 12.47s/it, gpt_loss=0.266, loss_mean=0.046][A[A
+
+Train step of epoch 1:   0%|          | 2/6434 [00:34<22:16:19, 12.47s/it, gpt_loss=0.271, loss_mean=0.0686][A[A
+
+Train step of epoch 1:   0%|          | 3/6434 [00:34<18:51:17, 10.55s/it, gpt_loss=0.271, loss_mean=0.0686][A[A
+
+Train step of epoch 1:   0%|          | 3/6434 [00:43<18:51:17, 10.55s/it, gpt_loss=0.323, loss_mean=0.094] [A[A
+
+Train step of epoch 1:   0%|          | 4/6434 [00:43<17:37:47,  9.87s/it, gpt_loss=0.323, loss_mean=0.094][A[A
+
+Train step of epoch 1:   0%|          | 4/6434 [00:51<17:37:47,  9.87s/it, gpt_loss=0.271, loss_mean=0.112][A[A
+
+Train step of epoch 1:   0%|          | 5/6434 [00:51<16:30:57,  9.25s/it, gpt_loss=0.271, loss_mean=0.112][A[A
+[LID Router Debug] Step: 6440
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [5, 5, 9, 2, 3, 3, 4, 6, 4, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   0%|          | 5/6434 [01:00<16:30:57,  9.25s/it, gpt_loss=0.24, loss_mean=0.125] [A[A
+
+Train step of epoch 1:   0%|          | 6/6434 [01:00<16:14:46,  9.10s/it, gpt_loss=0.24, loss_mean=0.125][A[A
+
+Train step of epoch 1:   0%|          | 6/6434 [01:09<16:14:46,  9.10s/it, gpt_loss=0.288, loss_mean=0.141][A[A
+
+Train step of epoch 1:   0%|          | 7/6434 [01:09<15:56:56,  8.93s/it, gpt_loss=0.288, loss_mean=0.141][A[A
+
+Train step of epoch 1:   0%|          | 7/6434 [01:17<15:56:56,  8.93s/it, gpt_loss=0.257, loss_mean=0.153][A[A
+
+Train step of epoch 1:   0%|          | 8/6434 [01:17<15:37:02,  8.75s/it, gpt_loss=0.257, loss_mean=0.153][A[A
+
+Train step of epoch 1:   0%|          | 8/6434 [01:25<15:37:02,  8.75s/it, gpt_loss=0.309, loss_mean=0.168][A[A
+
+Train step of epoch 1:   0%|          | 9/6434 [01:25<15:23:03,  8.62s/it, gpt_loss=0.309, loss_mean=0.168][A[A
+
+Train step of epoch 1:   0%|          | 9/6434 [01:32<15:23:03,  8.62s/it, gpt_loss=0.25, loss_mean=0.176] [A[A
+
+Train step of epoch 1:   0%|          | 10/6434 [01:32<14:28:17,  8.11s/it, gpt_loss=0.25, loss_mean=0.176][A[A
+
+Train step of epoch 1:   0%|          | 10/6434 [01:39<14:28:17,  8.11s/it, gpt_loss=0.208, loss_mean=0.18][A[A
+
+Train step of epoch 1:   0%|          | 11/6434 [01:39<13:54:46,  7.80s/it, gpt_loss=0.208, loss_mean=0.18][A[A
+
+Train step of epoch 1:   0%|          | 11/6434 [01:47<13:54:46,  7.80s/it, gpt_loss=0.292, loss_mean=0.191][A[A
+
+Train step of epoch 1:   0%|          | 12/6434 [01:47<13:59:23,  7.84s/it, gpt_loss=0.292, loss_mean=0.191][A[A
+
+Train step of epoch 1:   0%|          | 12/6434 [01:56<13:59:23,  7.84s/it, gpt_loss=0.249, loss_mean=0.197][A[A
+
+Train step of epoch 1:   0%|          | 13/6434 [01:56<14:08:52,  7.93s/it, gpt_loss=0.249, loss_mean=0.197][A[A
+
+Train step of epoch 1:   0%|          | 13/6434 [02:05<14:08:52,  7.93s/it, gpt_loss=0.229, loss_mean=0.2]  [A[A
+
+Train step of epoch 1:   0%|          | 14/6434 [02:05<15:01:44,  8.43s/it, gpt_loss=0.229, loss_mean=0.2][A[A
+
+Train step of epoch 1:   0%|          | 14/6434 [02:14<15:01:44,  8.43s/it, gpt_loss=0.192, loss_mean=0.199][A[A
+
+Train step of epoch 1:   0%|          | 15/6434 [02:14<15:09:09,  8.50s/it, gpt_loss=0.192, loss_mean=0.199][A[A
+[LID Router Debug] Step: 6450
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [2, 1, 0, 2, 2, 10, 3, 4, 3, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 10}
+
+
+Train step of epoch 1:   0%|          | 15/6434 [02:23<15:09:09,  8.50s/it, gpt_loss=0.252, loss_mean=0.204][A[A
+
+Train step of epoch 1:   0%|          | 16/6434 [02:23<15:25:33,  8.65s/it, gpt_loss=0.252, loss_mean=0.204][A[A
+
+Train step of epoch 1:   0%|          | 16/6434 [02:30<15:25:33,  8.65s/it, gpt_loss=0.22, loss_mean=0.206] [A[A
+
+Train step of epoch 1:   0%|          | 17/6434 [02:30<14:48:38,  8.31s/it, gpt_loss=0.22, loss_mean=0.206][A[A
+
+Train step of epoch 1:   0%|          | 17/6434 [02:40<14:48:38,  8.31s/it, gpt_loss=0.302, loss_mean=0.216][A[A
+
+Train step of epoch 1:   0%|          | 18/6434 [02:40<15:19:34,  8.60s/it, gpt_loss=0.302, loss_mean=0.216][A[A
+
+Train step of epoch 1:   0%|          | 18/6434 [02:49<15:19:34,  8.60s/it, gpt_loss=0.204, loss_mean=0.214][A[A
+
+Train step of epoch 1:   0%|          | 19/6434 [02:49<15:58:55,  8.97s/it, gpt_loss=0.204, loss_mean=0.214][A[A
+
+Train step of epoch 1:   0%|          | 19/6434 [02:58<15:58:55,  8.97s/it, gpt_loss=0.266, loss_mean=0.22] [A[A
+
+Train step of epoch 1:   0%|          | 20/6434 [02:58<16:00:26,  8.98s/it, gpt_loss=0.266, loss_mean=0.22][A[A
+
+Train step of epoch 1:   0%|          | 20/6434 [03:06<16:00:26,  8.98s/it, gpt_loss=0.276, loss_mean=0.225][A[A
+
+Train step of epoch 1:   0%|          | 21/6434 [03:06<15:05:40,  8.47s/it, gpt_loss=0.276, loss_mean=0.225][A[A
+
+Train step of epoch 1:   0%|          | 21/6434 [03:14<15:05:40,  8.47s/it, gpt_loss=0.293, loss_mean=0.232][A[A
+
+Train step of epoch 1:   0%|          | 22/6434 [03:14<15:14:04,  8.55s/it, gpt_loss=0.293, loss_mean=0.232][A[A
+
+Train step of epoch 1:   0%|          | 22/6434 [03:22<15:14:04,  8.55s/it, gpt_loss=0.27, loss_mean=0.236] [A[A
+
+Train step of epoch 1:   0%|          | 23/6434 [03:22<14:32:05,  8.16s/it, gpt_loss=0.27, loss_mean=0.236][A[A
+
+Train step of epoch 1:   0%|          | 23/6434 [03:30<14:32:05,  8.16s/it, gpt_loss=0.297, loss_mean=0.242][A[A
+
+Train step of epoch 1:   0%|          | 24/6434 [03:30<14:52:13,  8.35s/it, gpt_loss=0.297, loss_mean=0.242][A[A
+
+Train step of epoch 1:   0%|          | 24/6434 [03:38<14:52:13,  8.35s/it, gpt_loss=0.292, loss_mean=0.247][A[A
+
+Train step of epoch 1:   0%|          | 25/6434 [03:38<14:36:45,  8.21s/it, gpt_loss=0.292, loss_mean=0.247][A[A
+[LID Router Debug] Step: 6460
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [9, 9, 5, 4, 3, 5, 3, 9, 6, 4]
+Active Experts in Batch: {3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   0%|          | 25/6434 [03:46<14:36:45,  8.21s/it, gpt_loss=0.287, loss_mean=0.251][A[A
+
+Train step of epoch 1:   0%|          | 26/6434 [03:46<14:18:15,  8.04s/it, gpt_loss=0.287, loss_mean=0.251][A[A
+
+Train step of epoch 1:   0%|          | 26/6434 [03:55<14:18:15,  8.04s/it, gpt_loss=0.289, loss_mean=0.255][A[A
+
+Train step of epoch 1:   0%|          | 27/6434 [03:55<14:44:52,  8.29s/it, gpt_loss=0.289, loss_mean=0.255][A[A
+
+Train step of epoch 1:   0%|          | 27/6434 [04:02<14:44:52,  8.29s/it, gpt_loss=0.275, loss_mean=0.257][A[A
+
+Train step of epoch 1:   0%|          | 28/6434 [04:02<14:10:53,  7.97s/it, gpt_loss=0.275, loss_mean=0.257][A[A
+
+Train step of epoch 1:   0%|          | 28/6434 [04:10<14:10:53,  7.97s/it, gpt_loss=0.229, loss_mean=0.254][A[A
+
+Train step of epoch 1:   0%|          | 29/6434 [04:10<14:23:23,  8.09s/it, gpt_loss=0.229, loss_mean=0.254][A[A
+
+Train step of epoch 1:   0%|          | 29/6434 [04:20<14:23:23,  8.09s/it, gpt_loss=0.283, loss_mean=0.257][A[A
+
+Train step of epoch 1:   0%|          | 30/6434 [04:20<14:56:50,  8.40s/it, gpt_loss=0.283, loss_mean=0.257][A[A
+
+Train step of epoch 1:   0%|          | 30/6434 [04:28<14:56:50,  8.40s/it, gpt_loss=0.273, loss_mean=0.259][A[A
+
+Train step of epoch 1:   0%|          | 31/6434 [04:28<14:46:07,  8.30s/it, gpt_loss=0.273, loss_mean=0.259][A[A
+
+Train step of epoch 1:   0%|          | 31/6434 [04:36<14:46:07,  8.30s/it, gpt_loss=0.293, loss_mean=0.262][A[A
+
+Train step of epoch 1:   0%|          | 32/6434 [04:36<14:36:56,  8.22s/it, gpt_loss=0.293, loss_mean=0.262][A[A
+
+Train step of epoch 1:   0%|          | 32/6434 [04:44<14:36:56,  8.22s/it, gpt_loss=0.253, loss_mean=0.261][A[A
+
+Train step of epoch 1:   1%|          | 33/6434 [04:44<14:53:35,  8.38s/it, gpt_loss=0.253, loss_mean=0.261][A[A
+
+Train step of epoch 1:   1%|          | 33/6434 [04:52<14:53:35,  8.38s/it, gpt_loss=0.292, loss_mean=0.264][A[A
+
+Train step of epoch 1:   1%|          | 34/6434 [04:52<14:33:41,  8.19s/it, gpt_loss=0.292, loss_mean=0.264][A[A
+
+Train step of epoch 1:   1%|          | 34/6434 [05:00<14:33:41,  8.19s/it, gpt_loss=0.271, loss_mean=0.265][A[A
+
+Train step of epoch 1:   1%|          | 35/6434 [05:00<14:24:29,  8.11s/it, gpt_loss=0.271, loss_mean=0.265][A[A
+[LID Router Debug] Step: 6470
+Batch Size: 10
+Audio Batch Size: 143
+LID Assignments: [2, 2, 9, 0, 9, 5, 3, 3, 0, 3]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+
+Train step of epoch 1:   1%|          | 35/6434 [05:10<14:24:29,  8.11s/it, gpt_loss=0.293, loss_mean=0.268][A[A
+
+Train step of epoch 1:   1%|          | 36/6434 [05:10<15:17:30,  8.60s/it, gpt_loss=0.293, loss_mean=0.268][A[A
+
+Train step of epoch 1:   1%|          | 36/6434 [05:18<15:17:30,  8.60s/it, gpt_loss=0.322, loss_mean=0.273][A[A
+
+Train step of epoch 1:   1%|          | 37/6434 [05:18<15:06:39,  8.50s/it, gpt_loss=0.322, loss_mean=0.273][A[A
+
+Train step of epoch 1:   1%|          | 37/6434 [05:26<15:06:39,  8.50s/it, gpt_loss=0.32, loss_mean=0.278] [A[A
+
+Train step of epoch 1:   1%|          | 38/6434 [05:27<15:01:50,  8.46s/it, gpt_loss=0.32, loss_mean=0.278][A[A
+
+Train step of epoch 1:   1%|          | 38/6434 [05:35<15:01:50,  8.46s/it, gpt_loss=0.223, loss_mean=0.272][A[A
+
+Train step of epoch 1:   1%|          | 39/6434 [05:35<15:08:09,  8.52s/it, gpt_loss=0.223, loss_mean=0.272][A[A
+
+Train step of epoch 1:   1%|          | 39/6434 [05:43<15:08:09,  8.52s/it, gpt_loss=0.253, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   1%|          | 40/6434 [05:43<14:56:02,  8.41s/it, gpt_loss=0.253, loss_mean=0.27][A[A
+
+Train step of epoch 1:   1%|          | 40/6434 [05:51<14:56:02,  8.41s/it, gpt_loss=0.305, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 41/6434 [05:51<14:47:47,  8.33s/it, gpt_loss=0.305, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 41/6434 [06:00<14:47:47,  8.33s/it, gpt_loss=0.355, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 42/6434 [06:00<15:09:22,  8.54s/it, gpt_loss=0.355, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 42/6434 [06:10<15:09:22,  8.54s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|          | 43/6434 [06:10<15:30:26,  8.74s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|          | 43/6434 [06:17<15:30:26,  8.74s/it, gpt_loss=0.274, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|          | 44/6434 [06:17<14:56:03,  8.41s/it, gpt_loss=0.274, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|          | 44/6434 [06:25<14:56:03,  8.41s/it, gpt_loss=0.223, loss_mean=0.278][A[A
+
+Train step of epoch 1:   1%|          | 45/6434 [06:25<14:24:34,  8.12s/it, gpt_loss=0.223, loss_mean=0.278][A[A
+[LID Router Debug] Step: 6480
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [3, 5, 3, 9, 9, 0, 1, 3, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:   1%|          | 45/6434 [06:33<14:24:34,  8.12s/it, gpt_loss=0.293, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|          | 46/6434 [06:33<14:36:24,  8.23s/it, gpt_loss=0.293, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|          | 46/6434 [06:42<14:36:24,  8.23s/it, gpt_loss=0.262, loss_mean=0.278][A[A
+
+Train step of epoch 1:   1%|          | 47/6434 [06:42<14:37:53,  8.25s/it, gpt_loss=0.262, loss_mean=0.278][A[A
+
+Train step of epoch 1:   1%|          | 47/6434 [06:50<14:37:53,  8.25s/it, gpt_loss=0.247, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 48/6434 [06:50<14:44:07,  8.31s/it, gpt_loss=0.247, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 48/6434 [07:00<14:44:07,  8.31s/it, gpt_loss=0.252, loss_mean=0.272][A[A
+
+Train step of epoch 1:   1%|          | 49/6434 [07:00<15:35:59,  8.80s/it, gpt_loss=0.252, loss_mean=0.272][A[A
+
+Train step of epoch 1:   1%|          | 49/6434 [07:09<15:35:59,  8.80s/it, gpt_loss=0.235, loss_mean=0.269][A[A
+
+Train step of epoch 1:   1%|          | 50/6434 [07:09<15:59:48,  9.02s/it, gpt_loss=0.235, loss_mean=0.269][A[A
+
+Train step of epoch 1:   1%|          | 50/6434 [07:18<15:59:48,  9.02s/it, gpt_loss=0.311, loss_mean=0.273][A[A
+
+Train step of epoch 1:   1%|          | 51/6434 [07:18<15:35:26,  8.79s/it, gpt_loss=0.311, loss_mean=0.273][A[A
+
+Train step of epoch 1:   1%|          | 51/6434 [07:28<15:35:26,  8.79s/it, gpt_loss=0.358, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 52/6434 [07:28<16:22:29,  9.24s/it, gpt_loss=0.358, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 52/6434 [07:37<16:22:29,  9.24s/it, gpt_loss=0.363, loss_mean=0.289][A[A
+
+Train step of epoch 1:   1%|          | 53/6434 [07:37<15:59:39,  9.02s/it, gpt_loss=0.363, loss_mean=0.289][A[A
+
+Train step of epoch 1:   1%|          | 53/6434 [07:43<15:59:39,  9.02s/it, gpt_loss=0.248, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|          | 54/6434 [07:43<14:48:29,  8.36s/it, gpt_loss=0.248, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|          | 54/6434 [07:51<14:48:29,  8.36s/it, gpt_loss=0.263, loss_mean=0.283][A[A
+
+Train step of epoch 1:   1%|          | 55/6434 [07:51<14:35:40,  8.24s/it, gpt_loss=0.263, loss_mean=0.283][A[A
+[LID Router Debug] Step: 6490
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [1, 5, 9, 0, 4, 1, 5, 2, 5, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:   1%|          | 55/6434 [08:00<14:35:40,  8.24s/it, gpt_loss=0.22, loss_mean=0.277] [A[A
+
+Train step of epoch 1:   1%|          | 56/6434 [08:00<14:46:22,  8.34s/it, gpt_loss=0.22, loss_mean=0.277][A[A
+
+Train step of epoch 1:   1%|          | 56/6434 [08:08<14:46:22,  8.34s/it, gpt_loss=0.255, loss_mean=0.275][A[A
+
+Train step of epoch 1:   1%|          | 57/6434 [08:08<14:46:54,  8.34s/it, gpt_loss=0.255, loss_mean=0.275][A[A
+
+Train step of epoch 1:   1%|          | 57/6434 [08:17<14:46:54,  8.34s/it, gpt_loss=0.267, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 58/6434 [08:17<15:00:33,  8.47s/it, gpt_loss=0.267, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 58/6434 [08:25<15:00:33,  8.47s/it, gpt_loss=0.335, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   1%|          | 59/6434 [08:25<14:46:30,  8.34s/it, gpt_loss=0.335, loss_mean=0.28][A[A
+
+Train step of epoch 1:   1%|          | 59/6434 [08:34<14:46:30,  8.34s/it, gpt_loss=0.283, loss_mean=0.28][A[A
+
+Train step of epoch 1:   1%|          | 60/6434 [08:34<15:11:50,  8.58s/it, gpt_loss=0.283, loss_mean=0.28][A[A
+
+Train step of epoch 1:   1%|          | 60/6434 [08:43<15:11:50,  8.58s/it, gpt_loss=0.233, loss_mean=0.275][A[A
+
+Train step of epoch 1:   1%|          | 61/6434 [08:43<15:06:39,  8.54s/it, gpt_loss=0.233, loss_mean=0.275][A[A
+
+Train step of epoch 1:   1%|          | 61/6434 [08:51<15:06:39,  8.54s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 62/6434 [08:51<15:02:48,  8.50s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+
+Train step of epoch 1:   1%|          | 62/6434 [09:00<15:02:48,  8.50s/it, gpt_loss=0.293, loss_mean=0.276][A[A
+
+Train step of epoch 1:   1%|          | 63/6434 [09:00<15:29:54,  8.76s/it, gpt_loss=0.293, loss_mean=0.276][A[A
+
+Train step of epoch 1:   1%|          | 63/6434 [09:08<15:29:54,  8.76s/it, gpt_loss=0.2, loss_mean=0.268]  [A[A
+
+Train step of epoch 1:   1%|          | 64/6434 [09:08<15:03:35,  8.51s/it, gpt_loss=0.2, loss_mean=0.268][A[A
+
+Train step of epoch 1:   1%|          | 64/6434 [09:17<15:03:35,  8.51s/it, gpt_loss=0.265, loss_mean=0.268][A[A
+
+Train step of epoch 1:   1%|          | 65/6434 [09:17<15:13:20,  8.60s/it, gpt_loss=0.265, loss_mean=0.268][A[A
+[LID Router Debug] Step: 6500
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [2, 1, 6, 2, 0, 0, 9, 9, 4, 2]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+
+Train step of epoch 1:   1%|          | 65/6434 [09:26<15:13:20,  8.60s/it, gpt_loss=0.213, loss_mean=0.263][A[A
+
+Train step of epoch 1:   1%|          | 66/6434 [09:26<15:14:03,  8.61s/it, gpt_loss=0.213, loss_mean=0.263][A[A
+
+Train step of epoch 1:   1%|          | 66/6434 [09:35<15:14:03,  8.61s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+
+Train step of epoch 1:   1%|          | 67/6434 [09:35<15:33:35,  8.80s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+
+Train step of epoch 1:   1%|          | 67/6434 [09:43<15:33:35,  8.80s/it, gpt_loss=0.255, loss_mean=0.267][A[A
+
+Train step of epoch 1:   1%|          | 68/6434 [09:43<14:59:23,  8.48s/it, gpt_loss=0.255, loss_mean=0.267][A[A
+
+Train step of epoch 1:   1%|          | 68/6434 [09:51<14:59:23,  8.48s/it, gpt_loss=0.359, loss_mean=0.276][A[A
+
+Train step of epoch 1:   1%|          | 69/6434 [09:51<14:56:02,  8.45s/it, gpt_loss=0.359, loss_mean=0.276][A[A
+
+Train step of epoch 1:   1%|          | 69/6434 [09:59<14:56:02,  8.45s/it, gpt_loss=0.326, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 70/6434 [09:59<14:51:34,  8.41s/it, gpt_loss=0.326, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 70/6434 [10:07<14:51:34,  8.41s/it, gpt_loss=0.335, loss_mean=0.287][A[A
+
+Train step of epoch 1:   1%|          | 71/6434 [10:07<14:39:56,  8.30s/it, gpt_loss=0.335, loss_mean=0.287][A[A
+
+Train step of epoch 1:   1%|          | 71/6434 [10:16<14:39:56,  8.30s/it, gpt_loss=0.24, loss_mean=0.282] [A[A
+
+Train step of epoch 1:   1%|          | 72/6434 [10:16<14:58:58,  8.48s/it, gpt_loss=0.24, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 72/6434 [10:24<14:58:58,  8.48s/it, gpt_loss=0.33, loss_mean=0.287][A[A
+
+Train step of epoch 1:   1%|          | 73/6434 [10:24<14:44:45,  8.35s/it, gpt_loss=0.33, loss_mean=0.287][A[A
+
+Train step of epoch 1:   1%|          | 73/6434 [10:33<14:44:45,  8.35s/it, gpt_loss=0.234, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 74/6434 [10:33<14:43:26,  8.33s/it, gpt_loss=0.234, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 74/6434 [10:42<14:43:26,  8.33s/it, gpt_loss=0.287, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 75/6434 [10:42<15:21:36,  8.70s/it, gpt_loss=0.287, loss_mean=0.282][A[A
+[LID Router Debug] Step: 6510
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [1, 4, 0, 4, 2, 9, 2, 2, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:   1%|          | 75/6434 [10:50<15:21:36,  8.70s/it, gpt_loss=0.282, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 76/6434 [10:50<14:40:54,  8.31s/it, gpt_loss=0.282, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|          | 76/6434 [10:59<14:40:54,  8.31s/it, gpt_loss=0.272, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 77/6434 [10:59<14:59:31,  8.49s/it, gpt_loss=0.272, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 77/6434 [11:07<14:59:31,  8.49s/it, gpt_loss=0.331, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|          | 78/6434 [11:07<15:11:45,  8.61s/it, gpt_loss=0.331, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|          | 78/6434 [11:16<15:11:45,  8.61s/it, gpt_loss=0.269, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|          | 79/6434 [11:16<15:12:22,  8.61s/it, gpt_loss=0.269, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|          | 79/6434 [11:24<15:12:22,  8.61s/it, gpt_loss=0.248, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 80/6434 [11:24<14:50:59,  8.41s/it, gpt_loss=0.248, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|          | 80/6434 [11:32<14:50:59,  8.41s/it, gpt_loss=0.263, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|▏         | 81/6434 [11:32<14:49:34,  8.40s/it, gpt_loss=0.263, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|▏         | 81/6434 [11:41<14:49:34,  8.40s/it, gpt_loss=0.298, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|▏         | 82/6434 [11:41<14:54:21,  8.45s/it, gpt_loss=0.298, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|▏         | 82/6434 [11:50<14:54:21,  8.45s/it, gpt_loss=0.315, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|▏         | 83/6434 [11:50<15:03:09,  8.53s/it, gpt_loss=0.315, loss_mean=0.284][A[A
+
+Train step of epoch 1:   1%|▏         | 83/6434 [11:58<15:03:09,  8.53s/it, gpt_loss=0.33, loss_mean=0.289] [A[A
+
+Train step of epoch 1:   1%|▏         | 84/6434 [11:58<14:51:48,  8.43s/it, gpt_loss=0.33, loss_mean=0.289][A[A
+
+Train step of epoch 1:   1%|▏         | 84/6434 [12:06<14:51:48,  8.43s/it, gpt_loss=0.202, loss_mean=0.28][A[A
+
+Train step of epoch 1:   1%|▏         | 85/6434 [12:06<14:43:28,  8.35s/it, gpt_loss=0.202, loss_mean=0.28][A[A
+[LID Router Debug] Step: 6520
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [1, 2, 2, 0, 5, 10, 0, 5, 6, 2]
+Active Experts in Batch: {0, 1, 2, 5, 6, 10}
+
+
+Train step of epoch 1:   1%|▏         | 85/6434 [12:14<14:43:28,  8.35s/it, gpt_loss=0.266, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|▏         | 86/6434 [12:14<14:31:30,  8.24s/it, gpt_loss=0.266, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|▏         | 86/6434 [12:22<14:31:30,  8.24s/it, gpt_loss=0.315, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|▏         | 87/6434 [12:22<14:32:11,  8.25s/it, gpt_loss=0.315, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|▏         | 87/6434 [12:31<14:32:11,  8.25s/it, gpt_loss=0.308, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|▏         | 88/6434 [12:31<14:41:38,  8.34s/it, gpt_loss=0.308, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|▏         | 88/6434 [12:39<14:41:38,  8.34s/it, gpt_loss=0.267, loss_mean=0.283][A[A
+
+Train step of epoch 1:   1%|▏         | 89/6434 [12:39<14:36:39,  8.29s/it, gpt_loss=0.267, loss_mean=0.283][A[A
+
+Train step of epoch 1:   1%|▏         | 89/6434 [12:47<14:36:39,  8.29s/it, gpt_loss=0.299, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|▏         | 90/6434 [12:47<14:30:21,  8.23s/it, gpt_loss=0.299, loss_mean=0.285][A[A
+
+Train step of epoch 1:   1%|▏         | 90/6434 [12:54<14:30:21,  8.23s/it, gpt_loss=0.293, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|▏         | 91/6434 [12:54<13:56:07,  7.91s/it, gpt_loss=0.293, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|▏         | 91/6434 [13:03<13:56:07,  7.91s/it, gpt_loss=0.285, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|▏         | 92/6434 [13:03<14:13:51,  8.08s/it, gpt_loss=0.285, loss_mean=0.286][A[A
+
+Train step of epoch 1:   1%|▏         | 92/6434 [13:11<14:13:51,  8.08s/it, gpt_loss=0.24, loss_mean=0.281] [A[A
+
+Train step of epoch 1:   1%|▏         | 93/6434 [13:11<14:12:35,  8.07s/it, gpt_loss=0.24, loss_mean=0.281][A[A
+
+Train step of epoch 1:   1%|▏         | 93/6434 [13:20<14:12:35,  8.07s/it, gpt_loss=0.289, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|▏         | 94/6434 [13:20<14:37:39,  8.31s/it, gpt_loss=0.289, loss_mean=0.282][A[A
+
+Train step of epoch 1:   1%|▏         | 94/6434 [13:27<14:37:39,  8.31s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+
+Train step of epoch 1:   1%|▏         | 95/6434 [13:27<14:04:01,  7.99s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+[LID Router Debug] Step: 6530
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [4, 4, 3, 9, 3, 3, 2, 4, 5, 6]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   1%|▏         | 95/6434 [13:36<14:04:01,  7.99s/it, gpt_loss=0.25, loss_mean=0.276] [A[A
+
+Train step of epoch 1:   1%|▏         | 96/6434 [13:36<14:39:07,  8.32s/it, gpt_loss=0.25, loss_mean=0.276][A[A
+
+Train step of epoch 1:   1%|▏         | 96/6434 [13:44<14:39:07,  8.32s/it, gpt_loss=0.23, loss_mean=0.272][A[A
+
+Train step of epoch 1:   2%|▏         | 97/6434 [13:44<14:21:22,  8.16s/it, gpt_loss=0.23, loss_mean=0.272][A[A
+
+Train step of epoch 1:   2%|▏         | 97/6434 [13:52<14:21:22,  8.16s/it, gpt_loss=0.224, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 98/6434 [13:52<14:09:08,  8.04s/it, gpt_loss=0.224, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 98/6434 [14:00<14:09:08,  8.04s/it, gpt_loss=0.193, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 99/6434 [14:00<14:21:35,  8.16s/it, gpt_loss=0.193, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 99/6434 [14:08<14:21:35,  8.16s/it, gpt_loss=0.23, loss_mean=0.257] [A[A
+
+Train step of epoch 1:   2%|▏         | 100/6434 [14:08<14:06:38,  8.02s/it, gpt_loss=0.23, loss_mean=0.257][A[A
+
+Train step of epoch 1:   2%|▏         | 100/6434 [14:16<14:06:38,  8.02s/it, gpt_loss=0.218, loss_mean=0.253][A[A
+
+Train step of epoch 1:   2%|▏         | 101/6434 [14:16<14:09:50,  8.05s/it, gpt_loss=0.218, loss_mean=0.253][A[A
+
+Train step of epoch 1:   2%|▏         | 101/6434 [14:24<14:09:50,  8.05s/it, gpt_loss=0.225, loss_mean=0.25] [A[A
+
+Train step of epoch 1:   2%|▏         | 102/6434 [14:24<14:20:20,  8.15s/it, gpt_loss=0.225, loss_mean=0.25][A[A
+
+Train step of epoch 1:   2%|▏         | 102/6434 [14:32<14:20:20,  8.15s/it, gpt_loss=0.32, loss_mean=0.257][A[A
+
+Train step of epoch 1:   2%|▏         | 103/6434 [14:32<14:15:51,  8.11s/it, gpt_loss=0.32, loss_mean=0.257][A[A
+
+Train step of epoch 1:   2%|▏         | 103/6434 [14:40<14:15:51,  8.11s/it, gpt_loss=0.312, loss_mean=0.262][A[A
+
+Train step of epoch 1:   2%|▏         | 104/6434 [14:40<13:58:05,  7.94s/it, gpt_loss=0.312, loss_mean=0.262][A[A
+
+Train step of epoch 1:   2%|▏         | 104/6434 [14:49<13:58:05,  7.94s/it, gpt_loss=0.272, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 105/6434 [14:49<14:52:28,  8.46s/it, gpt_loss=0.272, loss_mean=0.263][A[A
+[LID Router Debug] Step: 6540
+Batch Size: 10
+Audio Batch Size: 153
+LID Assignments: [9, 0, 3, 4, 5, 3, 5, 3, 9, 3]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   2%|▏         | 105/6434 [14:59<14:52:28,  8.46s/it, gpt_loss=0.289, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 106/6434 [14:59<15:26:45,  8.79s/it, gpt_loss=0.289, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 106/6434 [15:07<15:26:45,  8.79s/it, gpt_loss=0.283, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 107/6434 [15:07<15:07:52,  8.61s/it, gpt_loss=0.283, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 107/6434 [15:16<15:07:52,  8.61s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:   2%|▏         | 108/6434 [15:16<15:27:48,  8.80s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:   2%|▏         | 108/6434 [15:24<15:27:48,  8.80s/it, gpt_loss=0.3, loss_mean=0.272]  [A[A
+
+Train step of epoch 1:   2%|▏         | 109/6434 [15:24<14:47:32,  8.42s/it, gpt_loss=0.3, loss_mean=0.272][A[A
+
+Train step of epoch 1:   2%|▏         | 109/6434 [15:33<14:47:32,  8.42s/it, gpt_loss=0.222, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 110/6434 [15:33<14:53:51,  8.48s/it, gpt_loss=0.222, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 110/6434 [15:42<14:53:51,  8.48s/it, gpt_loss=0.314, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 111/6434 [15:42<15:10:19,  8.64s/it, gpt_loss=0.314, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 111/6434 [15:49<15:10:19,  8.64s/it, gpt_loss=0.316, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 112/6434 [15:49<14:41:10,  8.36s/it, gpt_loss=0.316, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 112/6434 [15:58<14:41:10,  8.36s/it, gpt_loss=0.252, loss_mean=0.274][A[A
+
+Train step of epoch 1:   2%|▏         | 113/6434 [15:58<15:01:31,  8.56s/it, gpt_loss=0.252, loss_mean=0.274][A[A
+
+Train step of epoch 1:   2%|▏         | 113/6434 [16:06<15:01:31,  8.56s/it, gpt_loss=0.208, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 114/6434 [16:06<14:46:23,  8.42s/it, gpt_loss=0.208, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 114/6434 [16:16<14:46:23,  8.42s/it, gpt_loss=0.27, loss_mean=0.267] [A[A
+
+Train step of epoch 1:   2%|▏         | 115/6434 [16:16<15:26:01,  8.79s/it, gpt_loss=0.27, loss_mean=0.267][A[A
+[LID Router Debug] Step: 6550
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [3, 2, 1, 5, 2, 9, 2, 4, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   2%|▏         | 115/6434 [16:25<15:26:01,  8.79s/it, gpt_loss=0.355, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 116/6434 [16:25<15:29:30,  8.83s/it, gpt_loss=0.355, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 116/6434 [16:33<15:29:30,  8.83s/it, gpt_loss=0.2, loss_mean=0.268]  [A[A
+
+Train step of epoch 1:   2%|▏         | 117/6434 [16:33<15:10:01,  8.64s/it, gpt_loss=0.2, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 117/6434 [16:41<15:10:01,  8.64s/it, gpt_loss=0.316, loss_mean=0.273][A[A
+
+Train step of epoch 1:   2%|▏         | 118/6434 [16:41<14:47:06,  8.43s/it, gpt_loss=0.316, loss_mean=0.273][A[A
+
+Train step of epoch 1:   2%|▏         | 118/6434 [16:50<14:47:06,  8.43s/it, gpt_loss=0.288, loss_mean=0.275][A[A
+
+Train step of epoch 1:   2%|▏         | 119/6434 [16:50<15:04:51,  8.60s/it, gpt_loss=0.288, loss_mean=0.275][A[A
+
+Train step of epoch 1:   2%|▏         | 119/6434 [16:58<15:04:51,  8.60s/it, gpt_loss=0.344, loss_mean=0.282][A[A
+
+Train step of epoch 1:   2%|▏         | 120/6434 [16:58<14:30:52,  8.28s/it, gpt_loss=0.344, loss_mean=0.282][A[A
+
+Train step of epoch 1:   2%|▏         | 120/6434 [17:06<14:30:52,  8.28s/it, gpt_loss=0.264, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   2%|▏         | 121/6434 [17:06<14:27:37,  8.25s/it, gpt_loss=0.264, loss_mean=0.28][A[A
+
+Train step of epoch 1:   2%|▏         | 121/6434 [17:14<14:27:37,  8.25s/it, gpt_loss=0.283, loss_mean=0.28][A[A
+
+Train step of epoch 1:   2%|▏         | 122/6434 [17:14<14:35:46,  8.32s/it, gpt_loss=0.283, loss_mean=0.28][A[A
+
+Train step of epoch 1:   2%|▏         | 122/6434 [17:22<14:35:46,  8.32s/it, gpt_loss=0.301, loss_mean=0.282][A[A
+
+Train step of epoch 1:   2%|▏         | 123/6434 [17:22<14:22:51,  8.20s/it, gpt_loss=0.301, loss_mean=0.282][A[A
+
+Train step of epoch 1:   2%|▏         | 123/6434 [17:31<14:22:51,  8.20s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   2%|▏         | 124/6434 [17:31<14:33:55,  8.31s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   2%|▏         | 124/6434 [17:39<14:33:55,  8.31s/it, gpt_loss=0.321, loss_mean=0.289][A[A
+
+Train step of epoch 1:   2%|▏         | 125/6434 [17:39<14:18:20,  8.16s/it, gpt_loss=0.321, loss_mean=0.289][A[A
+[LID Router Debug] Step: 6560
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [9, 1, 1, 0, 5, 1, 2, 5, 5, 0]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:   2%|▏         | 125/6434 [17:46<14:18:20,  8.16s/it, gpt_loss=0.444, loss_mean=0.304][A[A
+
+Train step of epoch 1:   2%|▏         | 126/6434 [17:46<13:47:28,  7.87s/it, gpt_loss=0.444, loss_mean=0.304][A[A
+
+Train step of epoch 1:   2%|▏         | 126/6434 [17:54<13:47:28,  7.87s/it, gpt_loss=0.249, loss_mean=0.299][A[A
+
+Train step of epoch 1:   2%|▏         | 127/6434 [17:54<13:56:12,  7.96s/it, gpt_loss=0.249, loss_mean=0.299][A[A
+
+Train step of epoch 1:   2%|▏         | 127/6434 [18:02<13:56:12,  7.96s/it, gpt_loss=0.251, loss_mean=0.294][A[A
+
+Train step of epoch 1:   2%|▏         | 128/6434 [18:02<13:52:35,  7.92s/it, gpt_loss=0.251, loss_mean=0.294][A[A
+
+Train step of epoch 1:   2%|▏         | 128/6434 [18:10<13:52:35,  7.92s/it, gpt_loss=0.237, loss_mean=0.288][A[A
+
+Train step of epoch 1:   2%|▏         | 129/6434 [18:10<13:50:51,  7.91s/it, gpt_loss=0.237, loss_mean=0.288][A[A
+
+Train step of epoch 1:   2%|▏         | 129/6434 [18:17<13:50:51,  7.91s/it, gpt_loss=0.231, loss_mean=0.283][A[A
+
+Train step of epoch 1:   2%|▏         | 130/6434 [18:17<13:46:28,  7.87s/it, gpt_loss=0.231, loss_mean=0.283][A[A
+
+Train step of epoch 1:   2%|▏         | 130/6434 [18:25<13:46:28,  7.87s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 131/6434 [18:25<13:33:47,  7.75s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:   2%|▏         | 131/6434 [18:34<13:33:47,  7.75s/it, gpt_loss=0.23, loss_mean=0.272] [A[A
+
+Train step of epoch 1:   2%|▏         | 132/6434 [18:34<14:12:52,  8.12s/it, gpt_loss=0.23, loss_mean=0.272][A[A
+
+Train step of epoch 1:   2%|▏         | 132/6434 [18:43<14:12:52,  8.12s/it, gpt_loss=0.259, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 133/6434 [18:43<14:43:51,  8.42s/it, gpt_loss=0.259, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 133/6434 [18:52<14:43:51,  8.42s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 134/6434 [18:52<14:48:13,  8.46s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:   2%|▏         | 134/6434 [19:01<14:48:13,  8.46s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 135/6434 [19:01<15:15:44,  8.72s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+[LID Router Debug] Step: 6570
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [9, 3, 2, 2, 2, 9, 9, 6, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+
+Train step of epoch 1:   2%|▏         | 135/6434 [19:10<15:15:44,  8.72s/it, gpt_loss=0.266, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 136/6434 [19:10<15:24:55,  8.81s/it, gpt_loss=0.266, loss_mean=0.268][A[A
+
+Train step of epoch 1:   2%|▏         | 136/6434 [19:18<15:24:55,  8.81s/it, gpt_loss=0.212, loss_mean=0.262][A[A
+
+Train step of epoch 1:   2%|▏         | 137/6434 [19:18<14:54:06,  8.52s/it, gpt_loss=0.212, loss_mean=0.262][A[A
+
+Train step of epoch 1:   2%|▏         | 137/6434 [19:26<14:54:06,  8.52s/it, gpt_loss=0.222, loss_mean=0.258][A[A
+
+Train step of epoch 1:   2%|▏         | 138/6434 [19:26<14:32:51,  8.32s/it, gpt_loss=0.222, loss_mean=0.258][A[A
+
+Train step of epoch 1:   2%|▏         | 138/6434 [19:33<14:32:51,  8.32s/it, gpt_loss=0.269, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 139/6434 [19:33<14:17:42,  8.18s/it, gpt_loss=0.269, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 139/6434 [19:41<14:17:42,  8.18s/it, gpt_loss=0.271, loss_mean=0.26] [A[A
+
+Train step of epoch 1:   2%|▏         | 140/6434 [19:41<14:07:44,  8.08s/it, gpt_loss=0.271, loss_mean=0.26][A[A
+
+Train step of epoch 1:   2%|▏         | 140/6434 [19:49<14:07:44,  8.08s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 141/6434 [19:49<14:09:26,  8.10s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 141/6434 [19:58<14:09:26,  8.10s/it, gpt_loss=0.261, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 142/6434 [19:58<14:12:53,  8.13s/it, gpt_loss=0.261, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 142/6434 [20:07<14:12:53,  8.13s/it, gpt_loss=0.31, loss_mean=0.266] [A[A
+
+Train step of epoch 1:   2%|▏         | 143/6434 [20:07<14:36:44,  8.36s/it, gpt_loss=0.31, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 143/6434 [20:16<14:36:44,  8.36s/it, gpt_loss=0.215, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 144/6434 [20:16<14:57:57,  8.57s/it, gpt_loss=0.215, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 144/6434 [20:24<14:57:57,  8.57s/it, gpt_loss=0.244, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 145/6434 [20:24<14:52:59,  8.52s/it, gpt_loss=0.244, loss_mean=0.259][A[A
+[LID Router Debug] Step: 6580
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [1, 9, 2, 5, 1, 6, 6, 6, 2, 0]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9}
+
+
+Train step of epoch 1:   2%|▏         | 145/6434 [20:32<14:52:59,  8.52s/it, gpt_loss=0.331, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 146/6434 [20:32<14:30:32,  8.31s/it, gpt_loss=0.331, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 146/6434 [20:40<14:30:32,  8.31s/it, gpt_loss=0.252, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 147/6434 [20:40<14:34:20,  8.34s/it, gpt_loss=0.252, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 147/6434 [20:50<14:34:20,  8.34s/it, gpt_loss=0.244, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 148/6434 [20:50<15:20:28,  8.79s/it, gpt_loss=0.244, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 148/6434 [20:58<15:20:28,  8.79s/it, gpt_loss=0.217, loss_mean=0.258][A[A
+
+Train step of epoch 1:   2%|▏         | 149/6434 [20:58<15:05:37,  8.65s/it, gpt_loss=0.217, loss_mean=0.258][A[A
+
+Train step of epoch 1:   2%|▏         | 149/6434 [21:06<15:05:37,  8.65s/it, gpt_loss=0.334, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 150/6434 [21:06<14:22:44,  8.24s/it, gpt_loss=0.334, loss_mean=0.266][A[A
+
+Train step of epoch 1:   2%|▏         | 150/6434 [21:13<14:22:44,  8.24s/it, gpt_loss=0.257, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 151/6434 [21:13<13:46:01,  7.89s/it, gpt_loss=0.257, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 151/6434 [21:22<13:46:01,  7.89s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 152/6434 [21:22<14:25:59,  8.27s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 152/6434 [21:30<14:25:59,  8.27s/it, gpt_loss=0.242, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 153/6434 [21:30<14:33:10,  8.34s/it, gpt_loss=0.242, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 153/6434 [21:39<14:33:10,  8.34s/it, gpt_loss=0.304, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 154/6434 [21:39<14:31:17,  8.32s/it, gpt_loss=0.304, loss_mean=0.265][A[A
+
+Train step of epoch 1:   2%|▏         | 154/6434 [21:47<14:31:17,  8.32s/it, gpt_loss=0.281, loss_mean=0.267][A[A
+
+Train step of epoch 1:   2%|▏         | 155/6434 [21:47<14:43:34,  8.44s/it, gpt_loss=0.281, loss_mean=0.267][A[A
+[LID Router Debug] Step: 6590
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [1, 3, 1, 0, 1, 9, 1, 9, 5, 1]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+
+Train step of epoch 1:   2%|▏         | 155/6434 [21:56<14:43:34,  8.44s/it, gpt_loss=0.232, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 156/6434 [21:56<14:35:39,  8.37s/it, gpt_loss=0.232, loss_mean=0.263][A[A
+
+Train step of epoch 1:   2%|▏         | 156/6434 [22:04<14:35:39,  8.37s/it, gpt_loss=0.221, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 157/6434 [22:04<14:21:02,  8.23s/it, gpt_loss=0.221, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 157/6434 [22:12<14:21:02,  8.23s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:   2%|▏         | 158/6434 [22:12<14:27:00,  8.29s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:   2%|▏         | 158/6434 [22:20<14:27:00,  8.29s/it, gpt_loss=0.283, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 159/6434 [22:20<14:05:00,  8.08s/it, gpt_loss=0.283, loss_mean=0.259][A[A
+
+Train step of epoch 1:   2%|▏         | 159/6434 [22:27<14:05:00,  8.08s/it, gpt_loss=0.282, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 160/6434 [22:27<13:58:34,  8.02s/it, gpt_loss=0.282, loss_mean=0.261][A[A
+
+Train step of epoch 1:   2%|▏         | 160/6434 [22:37<13:58:34,  8.02s/it, gpt_loss=0.215, loss_mean=0.257][A[A
+
+Train step of epoch 1:   3%|▎         | 161/6434 [22:37<14:40:47,  8.42s/it, gpt_loss=0.215, loss_mean=0.257][A[A
+
+Train step of epoch 1:   3%|▎         | 161/6434 [22:45<14:40:47,  8.42s/it, gpt_loss=0.309, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 162/6434 [22:45<14:44:03,  8.46s/it, gpt_loss=0.309, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 162/6434 [22:54<14:44:03,  8.46s/it, gpt_loss=0.21, loss_mean=0.257] [A[A
+
+Train step of epoch 1:   3%|▎         | 163/6434 [22:54<14:55:31,  8.57s/it, gpt_loss=0.21, loss_mean=0.257][A[A
+
+Train step of epoch 1:   3%|▎         | 163/6434 [23:03<14:55:31,  8.57s/it, gpt_loss=0.306, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 164/6434 [23:03<15:16:21,  8.77s/it, gpt_loss=0.306, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 164/6434 [23:12<15:16:21,  8.77s/it, gpt_loss=0.201, loss_mean=0.256][A[A
+
+Train step of epoch 1:   3%|▎         | 165/6434 [23:12<14:56:58,  8.58s/it, gpt_loss=0.201, loss_mean=0.256][A[A
+[LID Router Debug] Step: 6600
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [5, 2, 3, 0, 0, 1, 5, 2, 3, 9]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+[2026-02-07 07:25:06,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=0, lr=[1.4947701503389426e-05, 1.4947701503389426e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 07:25:06,699] [INFO] [timer.py:260:stop] epoch=0/micro_step=6600/global_step=3300, RunningAvgSamplesPerSec=4.7464208828629815, CurrSamplesPerSec=4.930966114071202, MemAllocated=12.79GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:   3%|▎         | 165/6434 [23:20<14:56:58,  8.58s/it, gpt_loss=0.285, loss_mean=0.259][A[A
+
+Train step of epoch 1:   3%|▎         | 166/6434 [23:20<14:41:34,  8.44s/it, gpt_loss=0.285, loss_mean=0.259][A[A
+
+Train step of epoch 1:   3%|▎         | 166/6434 [23:28<14:41:34,  8.44s/it, gpt_loss=0.285, loss_mean=0.261][A[A
+
+Train step of epoch 1:   3%|▎         | 167/6434 [23:28<14:49:25,  8.52s/it, gpt_loss=0.285, loss_mean=0.261][A[A
+
+Train step of epoch 1:   3%|▎         | 167/6434 [23:37<14:49:25,  8.52s/it, gpt_loss=0.32, loss_mean=0.267] [A[A
+
+Train step of epoch 1:   3%|▎         | 168/6434 [23:37<15:01:49,  8.64s/it, gpt_loss=0.32, loss_mean=0.267][A[A
+
+Train step of epoch 1:   3%|▎         | 168/6434 [23:46<15:01:49,  8.64s/it, gpt_loss=0.201, loss_mean=0.261][A[A
+
+Train step of epoch 1:   3%|▎         | 169/6434 [23:46<15:04:10,  8.66s/it, gpt_loss=0.201, loss_mean=0.261][A[A
+
+Train step of epoch 1:   3%|▎         | 169/6434 [23:55<15:04:10,  8.66s/it, gpt_loss=0.32, loss_mean=0.266] [A[A
+
+Train step of epoch 1:   3%|▎         | 170/6434 [23:55<15:09:57,  8.72s/it, gpt_loss=0.32, loss_mean=0.266][A[A
+
+Train step of epoch 1:   3%|▎         | 170/6434 [24:03<15:09:57,  8.72s/it, gpt_loss=0.3, loss_mean=0.27]  [A[A
+
+Train step of epoch 1:   3%|▎         | 171/6434 [24:03<15:01:15,  8.63s/it, gpt_loss=0.3, loss_mean=0.27][A[A
+
+Train step of epoch 1:   3%|▎         | 171/6434 [24:12<15:01:15,  8.63s/it, gpt_loss=0.332, loss_mean=0.276][A[A
+
+Train step of epoch 1:   3%|▎         | 172/6434 [24:12<15:08:26,  8.70s/it, gpt_loss=0.332, loss_mean=0.276][A[A
+
+Train step of epoch 1:   3%|▎         | 172/6434 [24:20<15:08:26,  8.70s/it, gpt_loss=0.393, loss_mean=0.288][A[A
+
+Train step of epoch 1:   3%|▎         | 173/6434 [24:20<14:50:05,  8.53s/it, gpt_loss=0.393, loss_mean=0.288][A[A
+
+Train step of epoch 1:   3%|▎         | 173/6434 [24:29<14:50:05,  8.53s/it, gpt_loss=0.282, loss_mean=0.287][A[A
+
+Train step of epoch 1:   3%|▎         | 174/6434 [24:29<14:58:37,  8.61s/it, gpt_loss=0.282, loss_mean=0.287][A[A
+
+Train step of epoch 1:   3%|▎         | 174/6434 [24:37<14:58:37,  8.61s/it, gpt_loss=0.35, loss_mean=0.293] [A[A
+
+Train step of epoch 1:   3%|▎         | 175/6434 [24:37<14:22:30,  8.27s/it, gpt_loss=0.35, loss_mean=0.293][A[A
+[LID Router Debug] Step: 6610
+Batch Size: 10
+Audio Batch Size: 155
+LID Assignments: [9, 9, 3, 1, 2, 1, 2, 8, 3, 3]
+Active Experts in Batch: {1, 2, 3, 8, 9}
+
+
+Train step of epoch 1:   3%|▎         | 175/6434 [24:46<14:22:30,  8.27s/it, gpt_loss=0.401, loss_mean=0.304][A[A
+
+Train step of epoch 1:   3%|▎         | 176/6434 [24:46<15:09:53,  8.72s/it, gpt_loss=0.401, loss_mean=0.304][A[A
+
+Train step of epoch 1:   3%|▎         | 176/6434 [24:54<15:09:53,  8.72s/it, gpt_loss=0.307, loss_mean=0.304][A[A
+
+Train step of epoch 1:   3%|▎         | 177/6434 [24:54<14:41:17,  8.45s/it, gpt_loss=0.307, loss_mean=0.304][A[A
+
+Train step of epoch 1:   3%|▎         | 177/6434 [25:02<14:41:17,  8.45s/it, gpt_loss=0.206, loss_mean=0.295][A[A
+
+Train step of epoch 1:   3%|▎         | 178/6434 [25:02<14:34:42,  8.39s/it, gpt_loss=0.206, loss_mean=0.295][A[A
+
+Train step of epoch 1:   3%|▎         | 178/6434 [25:12<14:34:42,  8.39s/it, gpt_loss=0.247, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   3%|▎         | 179/6434 [25:12<14:58:41,  8.62s/it, gpt_loss=0.247, loss_mean=0.29][A[A
+
+Train step of epoch 1:   3%|▎         | 179/6434 [25:21<14:58:41,  8.62s/it, gpt_loss=0.296, loss_mean=0.29][A[A
+
+Train step of epoch 1:   3%|▎         | 180/6434 [25:21<15:17:33,  8.80s/it, gpt_loss=0.296, loss_mean=0.29][A[A
+
+Train step of epoch 1:   3%|▎         | 180/6434 [25:28<15:17:33,  8.80s/it, gpt_loss=0.214, loss_mean=0.283][A[A
+
+Train step of epoch 1:   3%|▎         | 181/6434 [25:28<14:44:14,  8.48s/it, gpt_loss=0.214, loss_mean=0.283][A[A
+
+Train step of epoch 1:   3%|▎         | 181/6434 [25:37<14:44:14,  8.48s/it, gpt_loss=0.283, loss_mean=0.283][A[A
+
+Train step of epoch 1:   3%|▎         | 182/6434 [25:37<14:56:08,  8.60s/it, gpt_loss=0.283, loss_mean=0.283][A[A
+
+Train step of epoch 1:   3%|▎         | 182/6434 [25:46<14:56:08,  8.60s/it, gpt_loss=0.236, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 183/6434 [25:46<15:06:53,  8.70s/it, gpt_loss=0.236, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 183/6434 [25:55<15:06:53,  8.70s/it, gpt_loss=0.229, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 184/6434 [25:55<15:20:15,  8.83s/it, gpt_loss=0.229, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 184/6434 [26:05<15:20:15,  8.83s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:   3%|▎         | 185/6434 [26:05<15:34:54,  8.98s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+[LID Router Debug] Step: 6620
+Batch Size: 10
+Audio Batch Size: 148
+LID Assignments: [0, 0, 3, 3, 9, 1, 1, 2, 3, 7]
+Active Experts in Batch: {0, 1, 2, 3, 7, 9}
+
+
+Train step of epoch 1:   3%|▎         | 185/6434 [26:14<15:34:54,  8.98s/it, gpt_loss=0.307, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 186/6434 [26:14<15:39:06,  9.02s/it, gpt_loss=0.307, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 186/6434 [26:23<15:39:06,  9.02s/it, gpt_loss=0.299, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   3%|▎         | 187/6434 [26:23<15:41:54,  9.05s/it, gpt_loss=0.299, loss_mean=0.28][A[A
+
+Train step of epoch 1:   3%|▎         | 187/6434 [26:32<15:41:54,  9.05s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 188/6434 [26:32<15:50:48,  9.13s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 188/6434 [26:41<15:50:48,  9.13s/it, gpt_loss=0.299, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   3%|▎         | 189/6434 [26:41<15:50:43,  9.13s/it, gpt_loss=0.299, loss_mean=0.28][A[A
+
+Train step of epoch 1:   3%|▎         | 189/6434 [26:49<15:50:43,  9.13s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 190/6434 [26:49<15:13:29,  8.78s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 190/6434 [26:57<15:13:29,  8.78s/it, gpt_loss=0.229, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 191/6434 [26:57<14:46:42,  8.52s/it, gpt_loss=0.229, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 191/6434 [27:05<14:46:42,  8.52s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+
+Train step of epoch 1:   3%|▎         | 192/6434 [27:05<14:18:43,  8.25s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+
+Train step of epoch 1:   3%|▎         | 192/6434 [27:13<14:18:43,  8.25s/it, gpt_loss=0.244, loss_mean=0.265][A[A
+
+Train step of epoch 1:   3%|▎         | 193/6434 [27:13<14:03:00,  8.10s/it, gpt_loss=0.244, loss_mean=0.265][A[A
+
+Train step of epoch 1:   3%|▎         | 193/6434 [27:21<14:03:00,  8.10s/it, gpt_loss=0.269, loss_mean=0.266][A[A
+
+Train step of epoch 1:   3%|▎         | 194/6434 [27:21<14:11:38,  8.19s/it, gpt_loss=0.269, loss_mean=0.266][A[A
+
+Train step of epoch 1:   3%|▎         | 194/6434 [27:30<14:11:38,  8.19s/it, gpt_loss=0.312, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   3%|▎         | 195/6434 [27:30<14:19:24,  8.26s/it, gpt_loss=0.312, loss_mean=0.27][A[A
+[LID Router Debug] Step: 6630
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [6, 3, 2, 1, 2, 4, 1, 2, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+
+Train step of epoch 1:   3%|▎         | 195/6434 [27:38<14:19:24,  8.26s/it, gpt_loss=0.31, loss_mean=0.274][A[A
+
+Train step of epoch 1:   3%|▎         | 196/6434 [27:38<14:28:07,  8.35s/it, gpt_loss=0.31, loss_mean=0.274][A[A
+
+Train step of epoch 1:   3%|▎         | 196/6434 [27:47<14:28:07,  8.35s/it, gpt_loss=0.42, loss_mean=0.289][A[A
+
+Train step of epoch 1:   3%|▎         | 197/6434 [27:47<14:34:56,  8.42s/it, gpt_loss=0.42, loss_mean=0.289][A[A
+
+Train step of epoch 1:   3%|▎         | 197/6434 [27:54<14:34:56,  8.42s/it, gpt_loss=0.25, loss_mean=0.285][A[A
+
+Train step of epoch 1:   3%|▎         | 198/6434 [27:54<14:15:40,  8.23s/it, gpt_loss=0.25, loss_mean=0.285][A[A
+
+Train step of epoch 1:   3%|▎         | 198/6434 [28:02<14:15:40,  8.23s/it, gpt_loss=0.229, loss_mean=0.279][A[A
+
+Train step of epoch 1:   3%|▎         | 199/6434 [28:02<14:06:35,  8.15s/it, gpt_loss=0.229, loss_mean=0.279][A[A
+
+Train step of epoch 1:   3%|▎         | 199/6434 [28:11<14:06:35,  8.15s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 200/6434 [28:11<14:16:12,  8.24s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 200/6434 [28:21<14:16:12,  8.24s/it, gpt_loss=0.228, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 201/6434 [28:21<15:07:43,  8.74s/it, gpt_loss=0.228, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 201/6434 [28:29<15:07:43,  8.74s/it, gpt_loss=0.243, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   3%|▎         | 202/6434 [28:29<14:57:30,  8.64s/it, gpt_loss=0.243, loss_mean=0.27][A[A
+
+Train step of epoch 1:   3%|▎         | 202/6434 [28:38<14:57:30,  8.64s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:   3%|▎         | 203/6434 [28:38<15:00:03,  8.67s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:   3%|▎         | 203/6434 [28:48<15:00:03,  8.67s/it, gpt_loss=0.26, loss_mean=0.267][A[A
+
+Train step of epoch 1:   3%|▎         | 204/6434 [28:48<15:38:27,  9.04s/it, gpt_loss=0.26, loss_mean=0.267][A[A
+
+Train step of epoch 1:   3%|▎         | 204/6434 [28:56<15:38:27,  9.04s/it, gpt_loss=0.246, loss_mean=0.265][A[A
+
+Train step of epoch 1:   3%|▎         | 205/6434 [28:56<15:26:48,  8.93s/it, gpt_loss=0.246, loss_mean=0.265][A[A
+[LID Router Debug] Step: 6640
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [2, 5, 5, 2, 2, 5, 0, 0, 3, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+
+Train step of epoch 1:   3%|▎         | 205/6434 [29:04<15:26:48,  8.93s/it, gpt_loss=0.27, loss_mean=0.266] [A[A
+
+Train step of epoch 1:   3%|▎         | 206/6434 [29:04<14:55:48,  8.63s/it, gpt_loss=0.27, loss_mean=0.266][A[A
+
+Train step of epoch 1:   3%|▎         | 206/6434 [29:12<14:55:48,  8.63s/it, gpt_loss=0.242, loss_mean=0.263][A[A
+
+Train step of epoch 1:   3%|▎         | 207/6434 [29:12<14:31:40,  8.40s/it, gpt_loss=0.242, loss_mean=0.263][A[A
+
+Train step of epoch 1:   3%|▎         | 207/6434 [29:20<14:31:40,  8.40s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:   3%|▎         | 208/6434 [29:20<14:24:07,  8.33s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:   3%|▎         | 208/6434 [29:29<14:24:07,  8.33s/it, gpt_loss=0.238, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 209/6434 [29:29<14:30:40,  8.39s/it, gpt_loss=0.238, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 209/6434 [29:37<14:30:40,  8.39s/it, gpt_loss=0.26, loss_mean=0.262] [A[A
+
+Train step of epoch 1:   3%|▎         | 210/6434 [29:37<14:19:21,  8.28s/it, gpt_loss=0.26, loss_mean=0.262][A[A
+
+Train step of epoch 1:   3%|▎         | 210/6434 [29:46<14:19:21,  8.28s/it, gpt_loss=0.227, loss_mean=0.259][A[A
+
+Train step of epoch 1:   3%|▎         | 211/6434 [29:46<14:25:52,  8.35s/it, gpt_loss=0.227, loss_mean=0.259][A[A
+
+Train step of epoch 1:   3%|▎         | 211/6434 [29:54<14:25:52,  8.35s/it, gpt_loss=0.44, loss_mean=0.277] [A[A
+
+Train step of epoch 1:   3%|▎         | 212/6434 [29:54<14:19:57,  8.29s/it, gpt_loss=0.44, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 212/6434 [30:02<14:19:57,  8.29s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:   3%|▎         | 213/6434 [30:02<14:35:44,  8.45s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:   3%|▎         | 213/6434 [30:11<14:35:44,  8.45s/it, gpt_loss=0.244, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 214/6434 [30:11<14:38:47,  8.48s/it, gpt_loss=0.244, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 214/6434 [30:20<14:38:47,  8.48s/it, gpt_loss=0.28, loss_mean=0.273] [A[A
+
+Train step of epoch 1:   3%|▎         | 215/6434 [30:20<14:55:22,  8.64s/it, gpt_loss=0.28, loss_mean=0.273][A[A
+[LID Router Debug] Step: 6650
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 4, 4, 5, 5, 5, 3, 2, 6, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:   3%|▎         | 215/6434 [30:29<14:55:22,  8.64s/it, gpt_loss=0.295, loss_mean=0.275][A[A
+
+Train step of epoch 1:   3%|▎         | 216/6434 [30:29<15:13:52,  8.82s/it, gpt_loss=0.295, loss_mean=0.275][A[A
+
+Train step of epoch 1:   3%|▎         | 216/6434 [30:37<15:13:52,  8.82s/it, gpt_loss=0.242, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 217/6434 [30:37<14:44:25,  8.54s/it, gpt_loss=0.242, loss_mean=0.272][A[A
+
+Train step of epoch 1:   3%|▎         | 217/6434 [30:46<14:44:25,  8.54s/it, gpt_loss=0.293, loss_mean=0.274][A[A
+
+Train step of epoch 1:   3%|▎         | 218/6434 [30:46<14:43:12,  8.53s/it, gpt_loss=0.293, loss_mean=0.274][A[A
+
+Train step of epoch 1:   3%|▎         | 218/6434 [30:54<14:43:12,  8.53s/it, gpt_loss=0.303, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 219/6434 [30:54<14:33:36,  8.43s/it, gpt_loss=0.303, loss_mean=0.277][A[A
+
+Train step of epoch 1:   3%|▎         | 219/6434 [31:03<14:33:36,  8.43s/it, gpt_loss=0.235, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 220/6434 [31:03<15:04:31,  8.73s/it, gpt_loss=0.235, loss_mean=0.273][A[A
+
+Train step of epoch 1:   3%|▎         | 220/6434 [31:12<15:04:31,  8.73s/it, gpt_loss=0.37, loss_mean=0.283] [A[A
+
+Train step of epoch 1:   3%|▎         | 221/6434 [31:12<14:53:41,  8.63s/it, gpt_loss=0.37, loss_mean=0.283][A[A
+
+Train step of epoch 1:   3%|▎         | 221/6434 [31:20<14:53:41,  8.63s/it, gpt_loss=0.266, loss_mean=0.281][A[A
+
+Train step of epoch 1:   3%|▎         | 222/6434 [31:20<14:42:15,  8.52s/it, gpt_loss=0.266, loss_mean=0.281][A[A
+
+Train step of epoch 1:   3%|▎         | 222/6434 [31:29<14:42:15,  8.52s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 223/6434 [31:29<14:44:27,  8.54s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 223/6434 [31:38<14:44:27,  8.54s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 224/6434 [31:38<15:11:16,  8.80s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:   3%|▎         | 224/6434 [31:47<15:11:16,  8.80s/it, gpt_loss=0.306, loss_mean=0.281][A[A
+
+Train step of epoch 1:   3%|▎         | 225/6434 [31:47<15:19:16,  8.88s/it, gpt_loss=0.306, loss_mean=0.281][A[A
+[LID Router Debug] Step: 6660
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [5, 3, 3, 2, 9, 3, 0, 4, 4, 0]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   3%|▎         | 225/6434 [31:55<15:19:16,  8.88s/it, gpt_loss=0.27, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:   4%|▎         | 226/6434 [31:55<15:04:08,  8.74s/it, gpt_loss=0.27, loss_mean=0.28][A[A
+
+Train step of epoch 1:   4%|▎         | 226/6434 [32:03<15:04:08,  8.74s/it, gpt_loss=0.238, loss_mean=0.276][A[A
+
+Train step of epoch 1:   4%|▎         | 227/6434 [32:03<14:33:49,  8.45s/it, gpt_loss=0.238, loss_mean=0.276][A[A
+
+Train step of epoch 1:   4%|▎         | 227/6434 [32:12<14:33:49,  8.45s/it, gpt_loss=0.258, loss_mean=0.274][A[A
+
+Train step of epoch 1:   4%|▎         | 228/6434 [32:12<14:30:37,  8.42s/it, gpt_loss=0.258, loss_mean=0.274][A[A
+
+Train step of epoch 1:   4%|▎         | 228/6434 [32:21<14:30:37,  8.42s/it, gpt_loss=0.243, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 229/6434 [32:21<15:14:34,  8.84s/it, gpt_loss=0.243, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 229/6434 [32:30<15:14:34,  8.84s/it, gpt_loss=0.269, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 230/6434 [32:30<14:57:54,  8.68s/it, gpt_loss=0.269, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 230/6434 [32:39<14:57:54,  8.68s/it, gpt_loss=0.263, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   4%|▎         | 231/6434 [32:39<15:05:07,  8.76s/it, gpt_loss=0.263, loss_mean=0.27][A[A
+
+Train step of epoch 1:   4%|▎         | 231/6434 [32:46<15:05:07,  8.76s/it, gpt_loss=0.235, loss_mean=0.266][A[A
+
+Train step of epoch 1:   4%|▎         | 232/6434 [32:46<14:36:43,  8.48s/it, gpt_loss=0.235, loss_mean=0.266][A[A
+
+Train step of epoch 1:   4%|▎         | 232/6434 [32:54<14:36:43,  8.48s/it, gpt_loss=0.296, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 233/6434 [32:54<14:10:07,  8.23s/it, gpt_loss=0.296, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 233/6434 [33:02<14:10:07,  8.23s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:   4%|▎         | 234/6434 [33:02<13:52:34,  8.06s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:   4%|▎         | 234/6434 [33:11<13:52:34,  8.06s/it, gpt_loss=0.299, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 235/6434 [33:11<14:25:10,  8.37s/it, gpt_loss=0.299, loss_mean=0.271][A[A
+[LID Router Debug] Step: 6670
+Batch Size: 10
+Audio Batch Size: 135
+LID Assignments: [2, 3, 3, 3, 5, 3, 9, 9, 5, 6]
+Active Experts in Batch: {2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:   4%|▎         | 235/6434 [33:20<14:25:10,  8.37s/it, gpt_loss=0.267, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   4%|▎         | 236/6434 [33:20<14:40:12,  8.52s/it, gpt_loss=0.267, loss_mean=0.27][A[A
+
+Train step of epoch 1:   4%|▎         | 236/6434 [33:29<14:40:12,  8.52s/it, gpt_loss=0.256, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 237/6434 [33:29<14:59:55,  8.71s/it, gpt_loss=0.256, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 237/6434 [33:37<14:59:55,  8.71s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 238/6434 [33:37<14:50:18,  8.62s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▎         | 238/6434 [33:45<14:50:18,  8.62s/it, gpt_loss=0.258, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 239/6434 [33:45<14:22:27,  8.35s/it, gpt_loss=0.258, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▎         | 239/6434 [33:53<14:22:27,  8.35s/it, gpt_loss=0.303, loss_mean=0.273][A[A
+
+Train step of epoch 1:   4%|▎         | 240/6434 [33:53<14:11:22,  8.25s/it, gpt_loss=0.303, loss_mean=0.273][A[A
+
+Train step of epoch 1:   4%|▎         | 240/6434 [34:01<14:11:22,  8.25s/it, gpt_loss=0.251, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   4%|▎         | 241/6434 [34:01<13:58:05,  8.12s/it, gpt_loss=0.251, loss_mean=0.27][A[A
+
+Train step of epoch 1:   4%|▎         | 241/6434 [34:10<13:58:05,  8.12s/it, gpt_loss=0.338, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 242/6434 [34:10<14:17:07,  8.31s/it, gpt_loss=0.338, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 242/6434 [34:18<14:17:07,  8.31s/it, gpt_loss=0.253, loss_mean=0.275][A[A
+
+Train step of epoch 1:   4%|▍         | 243/6434 [34:18<14:35:10,  8.48s/it, gpt_loss=0.253, loss_mean=0.275][A[A
+
+Train step of epoch 1:   4%|▍         | 243/6434 [34:27<14:35:10,  8.48s/it, gpt_loss=0.207, loss_mean=0.268][A[A
+
+Train step of epoch 1:   4%|▍         | 244/6434 [34:27<14:50:11,  8.63s/it, gpt_loss=0.207, loss_mean=0.268][A[A
+
+Train step of epoch 1:   4%|▍         | 244/6434 [34:36<14:50:11,  8.63s/it, gpt_loss=0.258, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 245/6434 [34:36<14:32:57,  8.46s/it, gpt_loss=0.258, loss_mean=0.267][A[A
+[LID Router Debug] Step: 6680
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [4, 4, 2, 6, 1, 0, 9, 5, 9, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   4%|▍         | 245/6434 [34:45<14:32:57,  8.46s/it, gpt_loss=0.229, loss_mean=0.263][A[A
+
+Train step of epoch 1:   4%|▍         | 246/6434 [34:45<15:05:39,  8.78s/it, gpt_loss=0.229, loss_mean=0.263][A[A
+
+Train step of epoch 1:   4%|▍         | 246/6434 [34:54<15:05:39,  8.78s/it, gpt_loss=0.24, loss_mean=0.261] [A[A
+
+Train step of epoch 1:   4%|▍         | 247/6434 [34:54<15:02:50,  8.76s/it, gpt_loss=0.24, loss_mean=0.261][A[A
+
+Train step of epoch 1:   4%|▍         | 247/6434 [35:03<15:02:50,  8.76s/it, gpt_loss=0.324, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 248/6434 [35:03<15:07:39,  8.80s/it, gpt_loss=0.324, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 248/6434 [35:12<15:07:39,  8.80s/it, gpt_loss=0.267, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 249/6434 [35:12<15:11:09,  8.84s/it, gpt_loss=0.267, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 249/6434 [35:19<15:11:09,  8.84s/it, gpt_loss=0.302, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▍         | 250/6434 [35:19<14:32:08,  8.46s/it, gpt_loss=0.302, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▍         | 250/6434 [35:28<14:32:08,  8.46s/it, gpt_loss=0.255, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▍         | 251/6434 [35:28<14:39:32,  8.54s/it, gpt_loss=0.255, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▍         | 251/6434 [35:36<14:39:32,  8.54s/it, gpt_loss=0.264, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▍         | 252/6434 [35:36<14:36:12,  8.50s/it, gpt_loss=0.264, loss_mean=0.269][A[A
+
+Train step of epoch 1:   4%|▍         | 252/6434 [35:46<14:36:12,  8.50s/it, gpt_loss=0.351, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 253/6434 [35:46<15:12:58,  8.86s/it, gpt_loss=0.351, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 253/6434 [35:53<15:12:58,  8.86s/it, gpt_loss=0.179, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 254/6434 [35:53<14:18:47,  8.34s/it, gpt_loss=0.179, loss_mean=0.267][A[A
+
+Train step of epoch 1:   4%|▍         | 254/6434 [36:01<14:18:47,  8.34s/it, gpt_loss=0.309, loss_mean=0.271][A[A
+
+Train step of epoch 1:   4%|▍         | 255/6434 [36:01<14:10:31,  8.26s/it, gpt_loss=0.309, loss_mean=0.271][A[A
+[LID Router Debug] Step: 6690
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [5, 0, 1, 1, 2, 4, 9, 9, 1, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:   4%|▍         | 255/6434 [36:09<14:10:31,  8.26s/it, gpt_loss=0.396, loss_mean=0.284][A[A
+
+Train step of epoch 1:   4%|▍         | 256/6434 [36:09<13:52:40,  8.09s/it, gpt_loss=0.396, loss_mean=0.284][A[A
+
+Train step of epoch 1:   4%|▍         | 256/6434 [36:18<13:52:40,  8.09s/it, gpt_loss=0.325, loss_mean=0.288][A[A
+
+Train step of epoch 1:   4%|▍         | 257/6434 [36:18<14:38:12,  8.53s/it, gpt_loss=0.325, loss_mean=0.288][A[A
+
+Train step of epoch 1:   4%|▍         | 257/6434 [36:28<14:38:12,  8.53s/it, gpt_loss=0.325, loss_mean=0.291][A[A
+
+Train step of epoch 1:   4%|▍         | 258/6434 [36:28<15:22:50,  8.97s/it, gpt_loss=0.325, loss_mean=0.291][A[A
+
+Train step of epoch 1:   4%|▍         | 258/6434 [36:38<15:22:50,  8.97s/it, gpt_loss=0.208, loss_mean=0.283][A[A
+
+Train step of epoch 1:   4%|▍         | 259/6434 [36:38<15:35:38,  9.09s/it, gpt_loss=0.208, loss_mean=0.283][A[A
+
+Train step of epoch 1:   4%|▍         | 259/6434 [36:46<15:35:38,  9.09s/it, gpt_loss=0.29, loss_mean=0.284] [A[A
+
+Train step of epoch 1:   4%|▍         | 260/6434 [36:46<15:22:26,  8.96s/it, gpt_loss=0.29, loss_mean=0.284][A[A
+
+Train step of epoch 1:   4%|▍         | 260/6434 [36:55<15:22:26,  8.96s/it, gpt_loss=0.228, loss_mean=0.278][A[A
+
+Train step of epoch 1:   4%|▍         | 261/6434 [36:55<15:16:46,  8.91s/it, gpt_loss=0.228, loss_mean=0.278][A[A
+
+Train step of epoch 1:   4%|▍         | 261/6434 [37:04<15:16:46,  8.91s/it, gpt_loss=0.359, loss_mean=0.286][A[A
+
+Train step of epoch 1:   4%|▍         | 262/6434 [37:04<15:11:07,  8.86s/it, gpt_loss=0.359, loss_mean=0.286][A[A
+
+Train step of epoch 1:   4%|▍         | 262/6434 [37:12<15:11:07,  8.86s/it, gpt_loss=0.243, loss_mean=0.282][A[A
+
+Train step of epoch 1:   4%|▍         | 263/6434 [37:12<14:51:38,  8.67s/it, gpt_loss=0.243, loss_mean=0.282][A[A
+
+Train step of epoch 1:   4%|▍         | 263/6434 [37:20<14:51:38,  8.67s/it, gpt_loss=0.235, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 264/6434 [37:20<14:35:11,  8.51s/it, gpt_loss=0.235, loss_mean=0.277][A[A
+
+Train step of epoch 1:   4%|▍         | 264/6434 [37:29<14:35:11,  8.51s/it, gpt_loss=0.317, loss_mean=0.281][A[A
+
+Train step of epoch 1:   4%|▍         | 265/6434 [37:29<14:35:41,  8.52s/it, gpt_loss=0.317, loss_mean=0.281][A[A
+[LID Router Debug] Step: 6700
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [0, 0, 4, 8, 9, 0, 3, 1, 2, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 8, 9}
+
+
+Train step of epoch 1:   4%|▍         | 265/6434 [37:38<14:35:41,  8.52s/it, gpt_loss=0.372, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   4%|▍         | 266/6434 [37:38<14:52:05,  8.68s/it, gpt_loss=0.372, loss_mean=0.29][A[A
+
+Train step of epoch 1:   4%|▍         | 266/6434 [37:47<14:52:05,  8.68s/it, gpt_loss=0.319, loss_mean=0.293][A[A
+
+Train step of epoch 1:   4%|▍         | 267/6434 [37:47<14:54:50,  8.71s/it, gpt_loss=0.319, loss_mean=0.293][A[A
+
+Train step of epoch 1:   4%|▍         | 267/6434 [37:55<14:54:50,  8.71s/it, gpt_loss=0.224, loss_mean=0.286][A[A
+
+Train step of epoch 1:   4%|▍         | 268/6434 [37:55<14:27:47,  8.44s/it, gpt_loss=0.224, loss_mean=0.286][A[A
+
+Train step of epoch 1:   4%|▍         | 268/6434 [38:03<14:27:47,  8.44s/it, gpt_loss=0.335, loss_mean=0.291][A[A
+
+Train step of epoch 1:   4%|▍         | 269/6434 [38:03<14:28:08,  8.45s/it, gpt_loss=0.335, loss_mean=0.291][A[A
+
+Train step of epoch 1:   4%|▍         | 269/6434 [38:12<14:28:08,  8.45s/it, gpt_loss=0.35, loss_mean=0.297] [A[A
+
+Train step of epoch 1:   4%|▍         | 270/6434 [38:12<14:38:17,  8.55s/it, gpt_loss=0.35, loss_mean=0.297][A[A
+
+Train step of epoch 1:   4%|▍         | 270/6434 [38:20<14:38:17,  8.55s/it, gpt_loss=0.268, loss_mean=0.294][A[A
+
+Train step of epoch 1:   4%|▍         | 271/6434 [38:20<14:17:44,  8.35s/it, gpt_loss=0.268, loss_mean=0.294][A[A
+
+Train step of epoch 1:   4%|▍         | 271/6434 [38:28<14:17:44,  8.35s/it, gpt_loss=0.299, loss_mean=0.295][A[A
+
+Train step of epoch 1:   4%|▍         | 272/6434 [38:28<14:09:19,  8.27s/it, gpt_loss=0.299, loss_mean=0.295][A[A
+
+Train step of epoch 1:   4%|▍         | 272/6434 [38:38<14:09:19,  8.27s/it, gpt_loss=0.253, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   4%|▍         | 273/6434 [38:38<14:59:54,  8.76s/it, gpt_loss=0.253, loss_mean=0.29][A[A
+
+Train step of epoch 1:   4%|▍         | 273/6434 [38:47<14:59:54,  8.76s/it, gpt_loss=0.386, loss_mean=0.3] [A[A
+
+Train step of epoch 1:   4%|▍         | 274/6434 [38:47<15:27:19,  9.03s/it, gpt_loss=0.386, loss_mean=0.3][A[A
+
+Train step of epoch 1:   4%|▍         | 274/6434 [38:55<15:27:19,  9.03s/it, gpt_loss=0.289, loss_mean=0.299][A[A
+
+Train step of epoch 1:   4%|▍         | 275/6434 [38:55<14:33:54,  8.51s/it, gpt_loss=0.289, loss_mean=0.299][A[A
+[LID Router Debug] Step: 6710
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [0, 3, 4, 9, 6, 2, 5, 1, 0, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   4%|▍         | 275/6434 [39:03<14:33:54,  8.51s/it, gpt_loss=0.282, loss_mean=0.297][A[A
+
+Train step of epoch 1:   4%|▍         | 276/6434 [39:03<14:22:57,  8.41s/it, gpt_loss=0.282, loss_mean=0.297][A[A
+
+Train step of epoch 1:   4%|▍         | 276/6434 [39:10<14:22:57,  8.41s/it, gpt_loss=0.217, loss_mean=0.289][A[A
+
+Train step of epoch 1:   4%|▍         | 277/6434 [39:10<13:51:58,  8.11s/it, gpt_loss=0.217, loss_mean=0.289][A[A
+
+Train step of epoch 1:   4%|▍         | 277/6434 [39:18<13:51:58,  8.11s/it, gpt_loss=0.252, loss_mean=0.285][A[A
+
+Train step of epoch 1:   4%|▍         | 278/6434 [39:18<13:28:46,  7.88s/it, gpt_loss=0.252, loss_mean=0.285][A[A
+
+Train step of epoch 1:   4%|▍         | 278/6434 [39:25<13:28:46,  7.88s/it, gpt_loss=0.285, loss_mean=0.285][A[A
+
+Train step of epoch 1:   4%|▍         | 279/6434 [39:25<13:27:58,  7.88s/it, gpt_loss=0.285, loss_mean=0.285][A[A
+
+Train step of epoch 1:   4%|▍         | 279/6434 [39:33<13:27:58,  7.88s/it, gpt_loss=0.28, loss_mean=0.285] [A[A
+
+Train step of epoch 1:   4%|▍         | 280/6434 [39:33<13:28:17,  7.88s/it, gpt_loss=0.28, loss_mean=0.285][A[A
+
+Train step of epoch 1:   4%|▍         | 280/6434 [39:42<13:28:17,  7.88s/it, gpt_loss=0.329, loss_mean=0.289][A[A
+
+Train step of epoch 1:   4%|▍         | 281/6434 [39:42<13:51:06,  8.10s/it, gpt_loss=0.329, loss_mean=0.289][A[A
+
+Train step of epoch 1:   4%|▍         | 281/6434 [39:50<13:51:06,  8.10s/it, gpt_loss=0.354, loss_mean=0.296][A[A
+
+Train step of epoch 1:   4%|▍         | 282/6434 [39:50<13:52:09,  8.12s/it, gpt_loss=0.354, loss_mean=0.296][A[A
+
+Train step of epoch 1:   4%|▍         | 282/6434 [39:59<13:52:09,  8.12s/it, gpt_loss=0.261, loss_mean=0.292][A[A
+
+Train step of epoch 1:   4%|▍         | 283/6434 [39:59<14:06:42,  8.26s/it, gpt_loss=0.261, loss_mean=0.292][A[A
+
+Train step of epoch 1:   4%|▍         | 283/6434 [40:08<14:06:42,  8.26s/it, gpt_loss=0.331, loss_mean=0.296][A[A
+
+Train step of epoch 1:   4%|▍         | 284/6434 [40:08<14:29:01,  8.48s/it, gpt_loss=0.331, loss_mean=0.296][A[A
+
+Train step of epoch 1:   4%|▍         | 284/6434 [40:16<14:29:01,  8.48s/it, gpt_loss=0.295, loss_mean=0.296][A[A
+
+Train step of epoch 1:   4%|▍         | 285/6434 [40:16<14:16:41,  8.36s/it, gpt_loss=0.295, loss_mean=0.296][A[A
+[LID Router Debug] Step: 6720
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [9, 9, 8, 1, 4, 4, 4, 2, 9, 2]
+Active Experts in Batch: {1, 2, 4, 8, 9}
+
+
+Train step of epoch 1:   4%|▍         | 285/6434 [40:23<14:16:41,  8.36s/it, gpt_loss=0.393, loss_mean=0.306][A[A
+
+Train step of epoch 1:   4%|▍         | 286/6434 [40:23<13:57:08,  8.17s/it, gpt_loss=0.393, loss_mean=0.306][A[A
+
+Train step of epoch 1:   4%|▍         | 286/6434 [40:35<13:57:08,  8.17s/it, gpt_loss=0.24, loss_mean=0.299] [A[A
+
+Train step of epoch 1:   4%|▍         | 287/6434 [40:35<15:33:22,  9.11s/it, gpt_loss=0.24, loss_mean=0.299][A[A
+
+Train step of epoch 1:   4%|▍         | 287/6434 [40:44<15:33:22,  9.11s/it, gpt_loss=0.278, loss_mean=0.297][A[A
+
+Train step of epoch 1:   4%|▍         | 288/6434 [40:44<15:29:58,  9.08s/it, gpt_loss=0.278, loss_mean=0.297][A[A
+
+Train step of epoch 1:   4%|▍         | 288/6434 [40:52<15:29:58,  9.08s/it, gpt_loss=0.313, loss_mean=0.299][A[A
+
+Train step of epoch 1:   4%|▍         | 289/6434 [40:52<15:08:57,  8.88s/it, gpt_loss=0.313, loss_mean=0.299][A[A
+
+Train step of epoch 1:   4%|▍         | 289/6434 [41:00<15:08:57,  8.88s/it, gpt_loss=0.191, loss_mean=0.288][A[A
+
+Train step of epoch 1:   5%|▍         | 290/6434 [41:00<14:46:39,  8.66s/it, gpt_loss=0.191, loss_mean=0.288][A[A
+
+Train step of epoch 1:   5%|▍         | 290/6434 [41:09<14:46:39,  8.66s/it, gpt_loss=0.271, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▍         | 291/6434 [41:09<14:44:42,  8.64s/it, gpt_loss=0.271, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▍         | 291/6434 [41:17<14:44:42,  8.64s/it, gpt_loss=0.327, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   5%|▍         | 292/6434 [41:17<14:28:04,  8.48s/it, gpt_loss=0.327, loss_mean=0.29][A[A
+
+Train step of epoch 1:   5%|▍         | 292/6434 [41:25<14:28:04,  8.48s/it, gpt_loss=0.317, loss_mean=0.293][A[A
+
+Train step of epoch 1:   5%|▍         | 293/6434 [41:25<13:56:03,  8.17s/it, gpt_loss=0.317, loss_mean=0.293][A[A
+
+Train step of epoch 1:   5%|▍         | 293/6434 [41:33<13:56:03,  8.17s/it, gpt_loss=0.324, loss_mean=0.296][A[A
+
+Train step of epoch 1:   5%|▍         | 294/6434 [41:33<14:16:13,  8.37s/it, gpt_loss=0.324, loss_mean=0.296][A[A
+
+Train step of epoch 1:   5%|▍         | 294/6434 [41:42<14:16:13,  8.37s/it, gpt_loss=0.293, loss_mean=0.296][A[A
+
+Train step of epoch 1:   5%|▍         | 295/6434 [41:42<14:15:46,  8.36s/it, gpt_loss=0.293, loss_mean=0.296][A[A
+[LID Router Debug] Step: 6730
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [5, 5, 3, 2, 1, 6, 3, 1, 6, 2]
+Active Experts in Batch: {1, 2, 3, 5, 6}
+
+
+Train step of epoch 1:   5%|▍         | 295/6434 [41:49<14:15:46,  8.36s/it, gpt_loss=0.305, loss_mean=0.297][A[A
+
+Train step of epoch 1:   5%|▍         | 296/6434 [41:49<13:56:45,  8.18s/it, gpt_loss=0.305, loss_mean=0.297][A[A
+
+Train step of epoch 1:   5%|▍         | 296/6434 [41:58<13:56:45,  8.18s/it, gpt_loss=0.203, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▍         | 297/6434 [41:58<13:57:33,  8.19s/it, gpt_loss=0.203, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▍         | 297/6434 [42:07<13:57:33,  8.19s/it, gpt_loss=0.224, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 298/6434 [42:07<14:24:52,  8.46s/it, gpt_loss=0.224, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 298/6434 [42:16<14:24:52,  8.46s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▍         | 299/6434 [42:16<14:42:55,  8.63s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▍         | 299/6434 [42:24<14:42:55,  8.63s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▍         | 300/6434 [42:24<14:37:10,  8.58s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▍         | 300/6434 [42:32<14:37:10,  8.58s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 301/6434 [42:32<14:14:16,  8.36s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 301/6434 [42:41<14:14:16,  8.36s/it, gpt_loss=0.267, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   5%|▍         | 302/6434 [42:41<14:19:17,  8.41s/it, gpt_loss=0.267, loss_mean=0.28][A[A
+
+Train step of epoch 1:   5%|▍         | 302/6434 [42:50<14:19:17,  8.41s/it, gpt_loss=0.325, loss_mean=0.284][A[A
+
+Train step of epoch 1:   5%|▍         | 303/6434 [42:50<14:36:09,  8.57s/it, gpt_loss=0.325, loss_mean=0.284][A[A
+
+Train step of epoch 1:   5%|▍         | 303/6434 [42:58<14:36:09,  8.57s/it, gpt_loss=0.307, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▍         | 304/6434 [42:58<14:34:06,  8.56s/it, gpt_loss=0.307, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▍         | 304/6434 [43:06<14:34:06,  8.56s/it, gpt_loss=0.302, loss_mean=0.288][A[A
+
+Train step of epoch 1:   5%|▍         | 305/6434 [43:06<14:03:31,  8.26s/it, gpt_loss=0.302, loss_mean=0.288][A[A
+[LID Router Debug] Step: 6740
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [6, 0, 1, 3, 1, 4, 9, 4, 9, 3]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+
+
+Train step of epoch 1:   5%|▍         | 305/6434 [43:14<14:03:31,  8.26s/it, gpt_loss=0.22, loss_mean=0.281] [A[A
+
+Train step of epoch 1:   5%|▍         | 306/6434 [43:14<14:05:48,  8.28s/it, gpt_loss=0.22, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 306/6434 [43:22<14:05:48,  8.28s/it, gpt_loss=0.332, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▍         | 307/6434 [43:22<13:58:47,  8.21s/it, gpt_loss=0.332, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▍         | 307/6434 [43:31<13:58:47,  8.21s/it, gpt_loss=0.23, loss_mean=0.281] [A[A
+
+Train step of epoch 1:   5%|▍         | 308/6434 [43:31<14:11:51,  8.34s/it, gpt_loss=0.23, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▍         | 308/6434 [43:40<14:11:51,  8.34s/it, gpt_loss=0.239, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▍         | 309/6434 [43:40<14:33:45,  8.56s/it, gpt_loss=0.239, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▍         | 309/6434 [43:48<14:33:45,  8.56s/it, gpt_loss=0.235, loss_mean=0.272][A[A
+
+Train step of epoch 1:   5%|▍         | 310/6434 [43:48<14:18:32,  8.41s/it, gpt_loss=0.235, loss_mean=0.272][A[A
+
+Train step of epoch 1:   5%|▍         | 310/6434 [43:57<14:18:32,  8.41s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:   5%|▍         | 311/6434 [43:57<14:41:58,  8.64s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:   5%|▍         | 311/6434 [44:06<14:41:58,  8.64s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 312/6434 [44:06<14:56:05,  8.78s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 312/6434 [44:15<14:56:05,  8.78s/it, gpt_loss=0.247, loss_mean=0.267][A[A
+
+Train step of epoch 1:   5%|▍         | 313/6434 [44:15<14:55:45,  8.78s/it, gpt_loss=0.247, loss_mean=0.267][A[A
+
+Train step of epoch 1:   5%|▍         | 313/6434 [44:22<14:55:45,  8.78s/it, gpt_loss=0.286, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 314/6434 [44:22<14:11:43,  8.35s/it, gpt_loss=0.286, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 314/6434 [44:31<14:11:43,  8.35s/it, gpt_loss=0.243, loss_mean=0.266][A[A
+
+Train step of epoch 1:   5%|▍         | 315/6434 [44:31<14:10:05,  8.34s/it, gpt_loss=0.243, loss_mean=0.266][A[A
+[LID Router Debug] Step: 6750
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [0, 1, 1, 9, 3, 2, 9, 6, 3, 6]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+
+Train step of epoch 1:   5%|▍         | 315/6434 [44:39<14:10:05,  8.34s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:   5%|▍         | 316/6434 [44:39<14:27:57,  8.51s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:   5%|▍         | 316/6434 [44:49<14:27:57,  8.51s/it, gpt_loss=0.259, loss_mean=0.264][A[A
+
+Train step of epoch 1:   5%|▍         | 317/6434 [44:49<14:44:41,  8.68s/it, gpt_loss=0.259, loss_mean=0.264][A[A
+
+Train step of epoch 1:   5%|▍         | 317/6434 [44:57<14:44:41,  8.68s/it, gpt_loss=0.312, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 318/6434 [44:57<14:45:40,  8.69s/it, gpt_loss=0.312, loss_mean=0.269][A[A
+
+Train step of epoch 1:   5%|▍         | 318/6434 [45:05<14:45:40,  8.69s/it, gpt_loss=0.28, loss_mean=0.27]  [A[A
+
+Train step of epoch 1:   5%|▍         | 319/6434 [45:05<14:24:57,  8.49s/it, gpt_loss=0.28, loss_mean=0.27][A[A
+
+Train step of epoch 1:   5%|▍         | 319/6434 [45:14<14:24:57,  8.49s/it, gpt_loss=0.312, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▍         | 320/6434 [45:14<14:18:17,  8.42s/it, gpt_loss=0.312, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▍         | 320/6434 [45:23<14:18:17,  8.42s/it, gpt_loss=0.285, loss_mean=0.275][A[A
+
+Train step of epoch 1:   5%|▍         | 321/6434 [45:23<15:02:08,  8.85s/it, gpt_loss=0.285, loss_mean=0.275][A[A
+
+Train step of epoch 1:   5%|▍         | 321/6434 [45:32<15:02:08,  8.85s/it, gpt_loss=0.351, loss_mean=0.283][A[A
+
+Train step of epoch 1:   5%|▌         | 322/6434 [45:32<14:50:05,  8.74s/it, gpt_loss=0.351, loss_mean=0.283][A[A
+
+Train step of epoch 1:   5%|▌         | 322/6434 [45:40<14:50:05,  8.74s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 323/6434 [45:40<14:39:18,  8.63s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 323/6434 [45:47<14:39:18,  8.63s/it, gpt_loss=0.214, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 324/6434 [45:47<13:45:43,  8.11s/it, gpt_loss=0.214, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 324/6434 [45:57<13:45:43,  8.11s/it, gpt_loss=0.381, loss_mean=0.285][A[A
+
+Train step of epoch 1:   5%|▌         | 325/6434 [45:57<14:27:27,  8.52s/it, gpt_loss=0.381, loss_mean=0.285][A[A
+[LID Router Debug] Step: 6760
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [6, 0, 0, 2, 3, 6, 2, 9, 3, 4]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:   5%|▌         | 325/6434 [46:05<14:27:27,  8.52s/it, gpt_loss=0.298, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▌         | 326/6434 [46:05<14:24:15,  8.49s/it, gpt_loss=0.298, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▌         | 326/6434 [46:14<14:24:15,  8.49s/it, gpt_loss=0.233, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 327/6434 [46:14<14:34:38,  8.59s/it, gpt_loss=0.233, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 327/6434 [46:23<14:34:38,  8.59s/it, gpt_loss=0.338, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▌         | 328/6434 [46:23<14:57:21,  8.82s/it, gpt_loss=0.338, loss_mean=0.287][A[A
+
+Train step of epoch 1:   5%|▌         | 328/6434 [46:31<14:57:21,  8.82s/it, gpt_loss=0.224, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   5%|▌         | 329/6434 [46:31<14:32:00,  8.57s/it, gpt_loss=0.224, loss_mean=0.28][A[A
+
+Train step of epoch 1:   5%|▌         | 329/6434 [46:40<14:32:00,  8.57s/it, gpt_loss=0.3, loss_mean=0.282] [A[A
+
+Train step of epoch 1:   5%|▌         | 330/6434 [46:40<14:36:22,  8.61s/it, gpt_loss=0.3, loss_mean=0.282][A[A
+
+Train step of epoch 1:   5%|▌         | 330/6434 [46:48<14:36:22,  8.61s/it, gpt_loss=0.283, loss_mean=0.282][A[A
+
+Train step of epoch 1:   5%|▌         | 331/6434 [46:48<14:15:50,  8.41s/it, gpt_loss=0.283, loss_mean=0.282][A[A
+
+Train step of epoch 1:   5%|▌         | 331/6434 [46:56<14:15:50,  8.41s/it, gpt_loss=0.221, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▌         | 332/6434 [46:56<14:04:29,  8.30s/it, gpt_loss=0.221, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▌         | 332/6434 [47:05<14:04:29,  8.30s/it, gpt_loss=0.257, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 333/6434 [47:05<14:18:25,  8.44s/it, gpt_loss=0.257, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 333/6434 [47:14<14:18:25,  8.44s/it, gpt_loss=0.267, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 334/6434 [47:14<14:50:43,  8.76s/it, gpt_loss=0.267, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 334/6434 [47:22<14:50:43,  8.76s/it, gpt_loss=0.31, loss_mean=0.277] [A[A
+
+Train step of epoch 1:   5%|▌         | 335/6434 [47:22<14:07:51,  8.34s/it, gpt_loss=0.31, loss_mean=0.277][A[A
+[LID Router Debug] Step: 6770
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [1, 5, 7, 4, 9, 4, 0, 4, 4, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 7, 9}
+
+
+Train step of epoch 1:   5%|▌         | 335/6434 [47:31<14:07:51,  8.34s/it, gpt_loss=0.212, loss_mean=0.271][A[A
+
+Train step of epoch 1:   5%|▌         | 336/6434 [47:31<14:38:08,  8.64s/it, gpt_loss=0.212, loss_mean=0.271][A[A
+
+Train step of epoch 1:   5%|▌         | 336/6434 [47:40<14:38:08,  8.64s/it, gpt_loss=0.4, loss_mean=0.284]  [A[A
+
+Train step of epoch 1:   5%|▌         | 337/6434 [47:40<14:53:25,  8.79s/it, gpt_loss=0.4, loss_mean=0.284][A[A
+
+Train step of epoch 1:   5%|▌         | 337/6434 [47:48<14:53:25,  8.79s/it, gpt_loss=0.308, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▌         | 338/6434 [47:48<14:27:45,  8.54s/it, gpt_loss=0.308, loss_mean=0.286][A[A
+
+Train step of epoch 1:   5%|▌         | 338/6434 [47:55<14:27:45,  8.54s/it, gpt_loss=0.278, loss_mean=0.285][A[A
+
+Train step of epoch 1:   5%|▌         | 339/6434 [47:55<13:53:03,  8.20s/it, gpt_loss=0.278, loss_mean=0.285][A[A
+
+Train step of epoch 1:   5%|▌         | 339/6434 [48:05<13:53:03,  8.20s/it, gpt_loss=0.224, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▌         | 340/6434 [48:05<14:31:01,  8.58s/it, gpt_loss=0.224, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▌         | 340/6434 [48:14<14:31:01,  8.58s/it, gpt_loss=0.255, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 341/6434 [48:14<14:36:16,  8.63s/it, gpt_loss=0.255, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 341/6434 [48:21<14:36:16,  8.63s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 342/6434 [48:21<14:06:59,  8.34s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 342/6434 [48:29<14:06:59,  8.34s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▌         | 343/6434 [48:29<13:56:00,  8.24s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:   5%|▌         | 343/6434 [48:38<13:56:00,  8.24s/it, gpt_loss=0.324, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 344/6434 [48:38<14:03:58,  8.32s/it, gpt_loss=0.324, loss_mean=0.281][A[A
+
+Train step of epoch 1:   5%|▌         | 344/6434 [48:46<14:03:58,  8.32s/it, gpt_loss=0.325, loss_mean=0.285][A[A
+
+Train step of epoch 1:   5%|▌         | 345/6434 [48:46<14:05:44,  8.33s/it, gpt_loss=0.325, loss_mean=0.285][A[A
+[LID Router Debug] Step: 6780
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [5, 4, 9, 2, 0, 2, 0, 6, 9, 2]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   5%|▌         | 345/6434 [48:54<14:05:44,  8.33s/it, gpt_loss=0.2, loss_mean=0.277]  [A[A
+
+Train step of epoch 1:   5%|▌         | 346/6434 [48:54<13:54:09,  8.22s/it, gpt_loss=0.2, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 346/6434 [49:03<13:54:09,  8.22s/it, gpt_loss=0.278, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 347/6434 [49:03<14:15:42,  8.43s/it, gpt_loss=0.278, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 347/6434 [49:10<14:15:42,  8.43s/it, gpt_loss=0.296, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▌         | 348/6434 [49:10<13:44:49,  8.13s/it, gpt_loss=0.296, loss_mean=0.279][A[A
+
+Train step of epoch 1:   5%|▌         | 348/6434 [49:18<13:44:49,  8.13s/it, gpt_loss=0.292, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   5%|▌         | 349/6434 [49:18<13:24:06,  7.93s/it, gpt_loss=0.292, loss_mean=0.28][A[A
+
+Train step of epoch 1:   5%|▌         | 349/6434 [49:26<13:24:06,  7.93s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 350/6434 [49:26<13:28:03,  7.97s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:   5%|▌         | 350/6434 [49:35<13:28:03,  7.97s/it, gpt_loss=0.244, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 351/6434 [49:35<14:07:27,  8.36s/it, gpt_loss=0.244, loss_mean=0.274][A[A
+
+Train step of epoch 1:   5%|▌         | 351/6434 [49:43<14:07:27,  8.36s/it, gpt_loss=0.317, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▌         | 352/6434 [49:43<13:58:12,  8.27s/it, gpt_loss=0.317, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▌         | 352/6434 [49:51<13:58:12,  8.27s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▌         | 353/6434 [49:51<13:55:31,  8.24s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:   5%|▌         | 353/6434 [50:01<13:55:31,  8.24s/it, gpt_loss=0.322, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 354/6434 [50:01<14:23:11,  8.52s/it, gpt_loss=0.322, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 354/6434 [50:09<14:23:11,  8.52s/it, gpt_loss=0.215, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 355/6434 [50:09<14:30:08,  8.59s/it, gpt_loss=0.215, loss_mean=0.276][A[A
+[LID Router Debug] Step: 6790
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [0, 2, 9, 3, 1, 2, 0, 10, 9, 6]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9, 10}
+
+
+Train step of epoch 1:   6%|▌         | 355/6434 [50:18<14:30:08,  8.59s/it, gpt_loss=0.233, loss_mean=0.271][A[A
+
+Train step of epoch 1:   6%|▌         | 356/6434 [50:18<14:24:29,  8.53s/it, gpt_loss=0.233, loss_mean=0.271][A[A
+
+Train step of epoch 1:   6%|▌         | 356/6434 [50:26<14:24:29,  8.53s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 357/6434 [50:26<14:24:18,  8.53s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 357/6434 [50:36<14:24:18,  8.53s/it, gpt_loss=0.267, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 358/6434 [50:36<14:54:58,  8.84s/it, gpt_loss=0.267, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 358/6434 [50:44<14:54:58,  8.84s/it, gpt_loss=0.237, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▌         | 359/6434 [50:44<14:19:51,  8.49s/it, gpt_loss=0.237, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▌         | 359/6434 [50:52<14:19:51,  8.49s/it, gpt_loss=0.355, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▌         | 360/6434 [50:52<14:16:08,  8.46s/it, gpt_loss=0.355, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▌         | 360/6434 [51:01<14:16:08,  8.46s/it, gpt_loss=0.269, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 361/6434 [51:01<14:32:50,  8.62s/it, gpt_loss=0.269, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 361/6434 [51:09<14:32:50,  8.62s/it, gpt_loss=0.375, loss_mean=0.286][A[A
+
+Train step of epoch 1:   6%|▌         | 362/6434 [51:09<14:14:23,  8.44s/it, gpt_loss=0.375, loss_mean=0.286][A[A
+
+Train step of epoch 1:   6%|▌         | 362/6434 [51:16<14:14:23,  8.44s/it, gpt_loss=0.252, loss_mean=0.283][A[A
+
+Train step of epoch 1:   6%|▌         | 363/6434 [51:16<13:43:57,  8.14s/it, gpt_loss=0.252, loss_mean=0.283][A[A
+
+Train step of epoch 1:   6%|▌         | 363/6434 [51:24<13:43:57,  8.14s/it, gpt_loss=0.222, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▌         | 364/6434 [51:24<13:36:17,  8.07s/it, gpt_loss=0.222, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▌         | 364/6434 [51:33<13:36:17,  8.07s/it, gpt_loss=0.275, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 365/6434 [51:33<13:49:31,  8.20s/it, gpt_loss=0.275, loss_mean=0.276][A[A
+[LID Router Debug] Step: 6800
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 4, 1, 6, 5, 9, 10, 9, 9, 2]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9, 10}
+[2026-02-07 07:53:28,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=0, lr=[1.4659321717935227e-05, 1.4659321717935227e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 07:53:28,272] [INFO] [timer.py:260:stop] epoch=0/micro_step=6800/global_step=3400, RunningAvgSamplesPerSec=4.745355815365404, CurrSamplesPerSec=4.741311609586601, MemAllocated=12.69GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:   6%|▌         | 365/6434 [51:41<13:49:31,  8.20s/it, gpt_loss=0.393, loss_mean=0.288][A[A
+
+Train step of epoch 1:   6%|▌         | 366/6434 [51:41<13:55:27,  8.26s/it, gpt_loss=0.393, loss_mean=0.288][A[A
+
+Train step of epoch 1:   6%|▌         | 366/6434 [51:49<13:55:27,  8.26s/it, gpt_loss=0.254, loss_mean=0.285][A[A
+
+Train step of epoch 1:   6%|▌         | 367/6434 [51:49<13:41:11,  8.12s/it, gpt_loss=0.254, loss_mean=0.285][A[A
+
+Train step of epoch 1:   6%|▌         | 367/6434 [51:57<13:41:11,  8.12s/it, gpt_loss=0.262, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 368/6434 [51:57<13:34:21,  8.05s/it, gpt_loss=0.262, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 368/6434 [52:05<13:34:21,  8.05s/it, gpt_loss=0.277, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 369/6434 [52:05<13:25:19,  7.97s/it, gpt_loss=0.277, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 369/6434 [52:12<13:25:19,  7.97s/it, gpt_loss=0.307, loss_mean=0.284][A[A
+
+Train step of epoch 1:   6%|▌         | 370/6434 [52:12<12:54:00,  7.66s/it, gpt_loss=0.307, loss_mean=0.284][A[A
+
+Train step of epoch 1:   6%|▌         | 370/6434 [52:20<12:54:00,  7.66s/it, gpt_loss=0.228, loss_mean=0.279][A[A
+
+Train step of epoch 1:   6%|▌         | 371/6434 [52:20<13:04:13,  7.76s/it, gpt_loss=0.228, loss_mean=0.279][A[A
+
+Train step of epoch 1:   6%|▌         | 371/6434 [52:28<13:04:13,  7.76s/it, gpt_loss=0.306, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▌         | 372/6434 [52:28<13:37:47,  8.09s/it, gpt_loss=0.306, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▌         | 372/6434 [52:37<13:37:47,  8.09s/it, gpt_loss=0.269, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   6%|▌         | 373/6434 [52:37<13:59:16,  8.31s/it, gpt_loss=0.269, loss_mean=0.28][A[A
+
+Train step of epoch 1:   6%|▌         | 373/6434 [52:45<13:59:16,  8.31s/it, gpt_loss=0.274, loss_mean=0.28][A[A
+
+Train step of epoch 1:   6%|▌         | 374/6434 [52:45<13:56:27,  8.28s/it, gpt_loss=0.274, loss_mean=0.28][A[A
+
+Train step of epoch 1:   6%|▌         | 374/6434 [52:54<13:56:27,  8.28s/it, gpt_loss=0.289, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▌         | 375/6434 [52:54<14:01:28,  8.33s/it, gpt_loss=0.289, loss_mean=0.281][A[A
+[LID Router Debug] Step: 6810
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [3, 5, 1, 0, 4, 9, 2, 9, 5, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   6%|▌         | 375/6434 [53:03<14:01:28,  8.33s/it, gpt_loss=0.36, loss_mean=0.288] [A[A
+
+Train step of epoch 1:   6%|▌         | 376/6434 [53:03<14:16:36,  8.48s/it, gpt_loss=0.36, loss_mean=0.288][A[A
+
+Train step of epoch 1:   6%|▌         | 376/6434 [53:10<14:16:36,  8.48s/it, gpt_loss=0.296, loss_mean=0.289][A[A
+
+Train step of epoch 1:   6%|▌         | 377/6434 [53:10<13:52:55,  8.25s/it, gpt_loss=0.296, loss_mean=0.289][A[A
+
+Train step of epoch 1:   6%|▌         | 377/6434 [53:18<13:52:55,  8.25s/it, gpt_loss=0.307, loss_mean=0.291][A[A
+
+Train step of epoch 1:   6%|▌         | 378/6434 [53:18<13:45:02,  8.17s/it, gpt_loss=0.307, loss_mean=0.291][A[A
+
+Train step of epoch 1:   6%|▌         | 378/6434 [53:27<13:45:02,  8.17s/it, gpt_loss=0.322, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 379/6434 [53:27<13:41:08,  8.14s/it, gpt_loss=0.322, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 379/6434 [53:35<13:41:08,  8.14s/it, gpt_loss=0.294, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 380/6434 [53:35<13:51:39,  8.24s/it, gpt_loss=0.294, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 380/6434 [53:44<13:51:39,  8.24s/it, gpt_loss=0.291, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 381/6434 [53:44<14:05:47,  8.38s/it, gpt_loss=0.291, loss_mean=0.294][A[A
+
+Train step of epoch 1:   6%|▌         | 381/6434 [53:53<14:05:47,  8.38s/it, gpt_loss=0.261, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   6%|▌         | 382/6434 [53:53<14:21:04,  8.54s/it, gpt_loss=0.261, loss_mean=0.29][A[A
+
+Train step of epoch 1:   6%|▌         | 382/6434 [54:01<14:21:04,  8.54s/it, gpt_loss=0.199, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▌         | 383/6434 [54:01<14:30:30,  8.63s/it, gpt_loss=0.199, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▌         | 383/6434 [54:10<14:30:30,  8.63s/it, gpt_loss=0.284, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 384/6434 [54:10<14:24:26,  8.57s/it, gpt_loss=0.284, loss_mean=0.282][A[A
+
+Train step of epoch 1:   6%|▌         | 384/6434 [54:18<14:24:26,  8.57s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+
+Train step of epoch 1:   6%|▌         | 385/6434 [54:18<14:02:31,  8.36s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+[LID Router Debug] Step: 6820
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [3, 2, 2, 1, 10, 4, 4, 6, 4, 6]
+Active Experts in Batch: {1, 2, 3, 4, 6, 10}
+
+
+Train step of epoch 1:   6%|▌         | 385/6434 [54:26<14:02:31,  8.36s/it, gpt_loss=0.219, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 386/6434 [54:26<14:13:24,  8.47s/it, gpt_loss=0.219, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 386/6434 [54:35<14:13:24,  8.47s/it, gpt_loss=0.273, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 387/6434 [54:35<14:09:13,  8.43s/it, gpt_loss=0.273, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 387/6434 [54:44<14:09:13,  8.43s/it, gpt_loss=0.32, loss_mean=0.278] [A[A
+
+Train step of epoch 1:   6%|▌         | 388/6434 [54:44<14:32:10,  8.66s/it, gpt_loss=0.32, loss_mean=0.278][A[A
+
+Train step of epoch 1:   6%|▌         | 388/6434 [54:53<14:32:10,  8.66s/it, gpt_loss=0.235, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 389/6434 [54:53<14:37:01,  8.70s/it, gpt_loss=0.235, loss_mean=0.273][A[A
+
+Train step of epoch 1:   6%|▌         | 389/6434 [55:01<14:37:01,  8.70s/it, gpt_loss=0.303, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 390/6434 [55:01<14:10:27,  8.44s/it, gpt_loss=0.303, loss_mean=0.276][A[A
+
+Train step of epoch 1:   6%|▌         | 390/6434 [55:09<14:10:27,  8.44s/it, gpt_loss=0.291, loss_mean=0.278][A[A
+
+Train step of epoch 1:   6%|▌         | 391/6434 [55:09<14:01:41,  8.36s/it, gpt_loss=0.291, loss_mean=0.278][A[A
+
+Train step of epoch 1:   6%|▌         | 391/6434 [55:17<14:01:41,  8.36s/it, gpt_loss=0.284, loss_mean=0.278][A[A
+
+Train step of epoch 1:   6%|▌         | 392/6434 [55:17<14:06:51,  8.41s/it, gpt_loss=0.284, loss_mean=0.278][A[A
+
+Train step of epoch 1:   6%|▌         | 392/6434 [55:25<14:06:51,  8.41s/it, gpt_loss=0.193, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   6%|▌         | 393/6434 [55:25<13:32:29,  8.07s/it, gpt_loss=0.193, loss_mean=0.27][A[A
+
+Train step of epoch 1:   6%|▌         | 393/6434 [55:33<13:32:29,  8.07s/it, gpt_loss=0.295, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 394/6434 [55:33<13:41:31,  8.16s/it, gpt_loss=0.295, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▌         | 394/6434 [55:40<13:41:31,  8.16s/it, gpt_loss=0.238, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▌         | 395/6434 [55:40<13:17:19,  7.92s/it, gpt_loss=0.238, loss_mean=0.269][A[A
+[LID Router Debug] Step: 6830
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [4, 0, 1, 3, 9, 2, 2, 4, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:   6%|▌         | 395/6434 [55:48<13:17:19,  7.92s/it, gpt_loss=0.267, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▌         | 396/6434 [55:48<13:03:00,  7.78s/it, gpt_loss=0.267, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▌         | 396/6434 [55:55<13:03:00,  7.78s/it, gpt_loss=0.224, loss_mean=0.264][A[A
+
+Train step of epoch 1:   6%|▌         | 397/6434 [55:55<12:58:46,  7.74s/it, gpt_loss=0.224, loss_mean=0.264][A[A
+
+Train step of epoch 1:   6%|▌         | 397/6434 [56:03<12:58:46,  7.74s/it, gpt_loss=0.324, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   6%|▌         | 398/6434 [56:03<12:55:42,  7.71s/it, gpt_loss=0.324, loss_mean=0.27][A[A
+
+Train step of epoch 1:   6%|▌         | 398/6434 [56:12<12:55:42,  7.71s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▌         | 399/6434 [56:12<13:28:05,  8.03s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▌         | 399/6434 [56:20<13:28:05,  8.03s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:   6%|▌         | 400/6434 [56:20<13:32:08,  8.08s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:   6%|▌         | 400/6434 [56:29<13:32:08,  8.08s/it, gpt_loss=0.246, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▌         | 401/6434 [56:29<13:44:36,  8.20s/it, gpt_loss=0.246, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▌         | 401/6434 [56:37<13:44:36,  8.20s/it, gpt_loss=0.282, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   6%|▌         | 402/6434 [56:37<13:42:35,  8.18s/it, gpt_loss=0.282, loss_mean=0.27][A[A
+
+Train step of epoch 1:   6%|▌         | 402/6434 [56:46<13:42:35,  8.18s/it, gpt_loss=0.342, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▋         | 403/6434 [56:46<14:01:05,  8.37s/it, gpt_loss=0.342, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▋         | 403/6434 [56:53<14:01:05,  8.37s/it, gpt_loss=0.294, loss_mean=0.279][A[A
+
+Train step of epoch 1:   6%|▋         | 404/6434 [56:53<13:30:47,  8.07s/it, gpt_loss=0.294, loss_mean=0.279][A[A
+
+Train step of epoch 1:   6%|▋         | 404/6434 [57:01<13:30:47,  8.07s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+
+Train step of epoch 1:   6%|▋         | 405/6434 [57:01<13:39:44,  8.16s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+[LID Router Debug] Step: 6840
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [4, 4, 4, 4, 2, 9, 4, 3, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:   6%|▋         | 405/6434 [57:09<13:39:44,  8.16s/it, gpt_loss=0.393, loss_mean=0.292][A[A
+
+Train step of epoch 1:   6%|▋         | 406/6434 [57:09<13:14:13,  7.91s/it, gpt_loss=0.393, loss_mean=0.292][A[A
+
+Train step of epoch 1:   6%|▋         | 406/6434 [57:17<13:14:13,  7.91s/it, gpt_loss=0.273, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   6%|▋         | 407/6434 [57:17<13:33:42,  8.10s/it, gpt_loss=0.273, loss_mean=0.29][A[A
+
+Train step of epoch 1:   6%|▋         | 407/6434 [57:24<13:33:42,  8.10s/it, gpt_loss=0.23, loss_mean=0.284][A[A
+
+Train step of epoch 1:   6%|▋         | 408/6434 [57:24<12:59:17,  7.76s/it, gpt_loss=0.23, loss_mean=0.284][A[A
+
+Train step of epoch 1:   6%|▋         | 408/6434 [57:33<12:59:17,  7.76s/it, gpt_loss=0.239, loss_mean=0.28][A[A
+
+Train step of epoch 1:   6%|▋         | 409/6434 [57:33<13:36:20,  8.13s/it, gpt_loss=0.239, loss_mean=0.28][A[A
+
+Train step of epoch 1:   6%|▋         | 409/6434 [57:41<13:36:20,  8.13s/it, gpt_loss=0.207, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▋         | 410/6434 [57:41<13:34:32,  8.11s/it, gpt_loss=0.207, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▋         | 410/6434 [57:51<13:34:32,  8.11s/it, gpt_loss=0.244, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   6%|▋         | 411/6434 [57:51<14:15:58,  8.53s/it, gpt_loss=0.244, loss_mean=0.27][A[A
+
+Train step of epoch 1:   6%|▋         | 411/6434 [57:59<14:15:58,  8.53s/it, gpt_loss=0.346, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▋         | 412/6434 [57:59<14:24:32,  8.61s/it, gpt_loss=0.346, loss_mean=0.277][A[A
+
+Train step of epoch 1:   6%|▋         | 412/6434 [58:08<14:24:32,  8.61s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   6%|▋         | 413/6434 [58:08<14:26:47,  8.64s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   6%|▋         | 413/6434 [58:16<14:26:47,  8.64s/it, gpt_loss=0.221, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▋         | 414/6434 [58:16<14:17:53,  8.55s/it, gpt_loss=0.221, loss_mean=0.269][A[A
+
+Train step of epoch 1:   6%|▋         | 414/6434 [58:26<14:17:53,  8.55s/it, gpt_loss=0.24, loss_mean=0.266] [A[A
+
+Train step of epoch 1:   6%|▋         | 415/6434 [58:26<14:35:09,  8.72s/it, gpt_loss=0.24, loss_mean=0.266][A[A
+[LID Router Debug] Step: 6850
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [9, 4, 1, 0, 9, 0, 0, 0, 9, 3]
+Active Experts in Batch: {0, 1, 3, 4, 9}
+
+
+Train step of epoch 1:   6%|▋         | 415/6434 [58:33<14:35:09,  8.72s/it, gpt_loss=0.33, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▋         | 416/6434 [58:33<14:06:36,  8.44s/it, gpt_loss=0.33, loss_mean=0.272][A[A
+
+Train step of epoch 1:   6%|▋         | 416/6434 [58:41<14:06:36,  8.44s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▋         | 417/6434 [58:41<13:47:17,  8.25s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:   6%|▋         | 417/6434 [58:50<13:47:17,  8.25s/it, gpt_loss=0.243, loss_mean=0.265][A[A
+
+Train step of epoch 1:   6%|▋         | 418/6434 [58:50<14:01:44,  8.40s/it, gpt_loss=0.243, loss_mean=0.265][A[A
+
+Train step of epoch 1:   6%|▋         | 418/6434 [58:58<14:01:44,  8.40s/it, gpt_loss=0.208, loss_mean=0.26] [A[A
+
+Train step of epoch 1:   7%|▋         | 419/6434 [58:58<13:56:54,  8.35s/it, gpt_loss=0.208, loss_mean=0.26][A[A
+
+Train step of epoch 1:   7%|▋         | 419/6434 [59:07<13:56:54,  8.35s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+
+Train step of epoch 1:   7%|▋         | 420/6434 [59:07<14:05:37,  8.44s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+
+Train step of epoch 1:   7%|▋         | 420/6434 [59:15<14:05:37,  8.44s/it, gpt_loss=0.278, loss_mean=0.263][A[A
+
+Train step of epoch 1:   7%|▋         | 421/6434 [59:15<14:00:38,  8.39s/it, gpt_loss=0.278, loss_mean=0.263][A[A
+
+Train step of epoch 1:   7%|▋         | 421/6434 [59:23<14:00:38,  8.39s/it, gpt_loss=0.253, loss_mean=0.262][A[A
+
+Train step of epoch 1:   7%|▋         | 422/6434 [59:23<13:35:02,  8.13s/it, gpt_loss=0.253, loss_mean=0.262][A[A
+
+Train step of epoch 1:   7%|▋         | 422/6434 [59:30<13:35:02,  8.13s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+
+Train step of epoch 1:   7%|▋         | 423/6434 [59:30<13:01:53,  7.80s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+
+Train step of epoch 1:   7%|▋         | 423/6434 [59:38<13:01:53,  7.80s/it, gpt_loss=0.313, loss_mean=0.269][A[A
+
+Train step of epoch 1:   7%|▋         | 424/6434 [59:38<13:31:24,  8.10s/it, gpt_loss=0.313, loss_mean=0.269][A[A
+
+Train step of epoch 1:   7%|▋         | 424/6434 [59:47<13:31:24,  8.10s/it, gpt_loss=0.217, loss_mean=0.264][A[A
+
+Train step of epoch 1:   7%|▋         | 425/6434 [59:47<13:56:59,  8.36s/it, gpt_loss=0.217, loss_mean=0.264][A[A
+[LID Router Debug] Step: 6860
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [4, 0, 5, 4, 3, 0, 5, 3, 4, 3]
+Active Experts in Batch: {0, 3, 4, 5}
+
+
+Train step of epoch 1:   7%|▋         | 425/6434 [59:56<13:56:59,  8.36s/it, gpt_loss=0.296, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 426/6434 [59:56<13:52:42,  8.32s/it, gpt_loss=0.296, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 426/6434 [1:00:05<13:52:42,  8.32s/it, gpt_loss=0.289, loss_mean=0.269][A[A
+
+Train step of epoch 1:   7%|▋         | 427/6434 [1:00:05<14:31:48,  8.71s/it, gpt_loss=0.289, loss_mean=0.269][A[A
+
+Train step of epoch 1:   7%|▋         | 427/6434 [1:00:14<14:31:48,  8.71s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 428/6434 [1:00:14<14:24:00,  8.63s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 428/6434 [1:00:22<14:24:00,  8.63s/it, gpt_loss=0.311, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 429/6434 [1:00:22<14:11:29,  8.51s/it, gpt_loss=0.311, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 429/6434 [1:00:31<14:11:29,  8.51s/it, gpt_loss=0.274, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 430/6434 [1:00:31<14:16:12,  8.56s/it, gpt_loss=0.274, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 430/6434 [1:00:38<14:16:12,  8.56s/it, gpt_loss=0.282, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 431/6434 [1:00:38<13:45:35,  8.25s/it, gpt_loss=0.282, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 431/6434 [1:00:46<13:45:35,  8.25s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 432/6434 [1:00:46<13:37:49,  8.18s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 432/6434 [1:00:55<13:37:49,  8.18s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 433/6434 [1:00:55<13:53:54,  8.34s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 433/6434 [1:01:04<13:53:54,  8.34s/it, gpt_loss=0.22, loss_mean=0.266] [A[A
+
+Train step of epoch 1:   7%|▋         | 434/6434 [1:01:04<14:19:13,  8.59s/it, gpt_loss=0.22, loss_mean=0.266][A[A
+
+Train step of epoch 1:   7%|▋         | 434/6434 [1:01:12<14:19:13,  8.59s/it, gpt_loss=0.412, loss_mean=0.281][A[A
+
+Train step of epoch 1:   7%|▋         | 435/6434 [1:01:12<14:14:21,  8.55s/it, gpt_loss=0.412, loss_mean=0.281][A[A
+[LID Router Debug] Step: 6870
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [4, 3, 9, 9, 5, 7, 2, 3, 4, 3]
+Active Experts in Batch: {2, 3, 4, 5, 7, 9}
+
+
+Train step of epoch 1:   7%|▋         | 435/6434 [1:01:21<14:14:21,  8.55s/it, gpt_loss=0.3, loss_mean=0.283]  [A[A
+
+Train step of epoch 1:   7%|▋         | 436/6434 [1:01:21<14:10:04,  8.50s/it, gpt_loss=0.3, loss_mean=0.283][A[A
+
+Train step of epoch 1:   7%|▋         | 436/6434 [1:01:28<14:10:04,  8.50s/it, gpt_loss=0.273, loss_mean=0.282][A[A
+
+Train step of epoch 1:   7%|▋         | 437/6434 [1:01:28<13:39:33,  8.20s/it, gpt_loss=0.273, loss_mean=0.282][A[A
+
+Train step of epoch 1:   7%|▋         | 437/6434 [1:01:37<13:39:33,  8.20s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 438/6434 [1:01:37<13:39:45,  8.20s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 438/6434 [1:01:46<13:39:45,  8.20s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 439/6434 [1:01:46<14:20:03,  8.61s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 439/6434 [1:01:54<14:20:03,  8.61s/it, gpt_loss=0.212, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 440/6434 [1:01:54<13:59:19,  8.40s/it, gpt_loss=0.212, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 440/6434 [1:02:03<13:59:19,  8.40s/it, gpt_loss=0.3, loss_mean=0.274]  [A[A
+
+Train step of epoch 1:   7%|▋         | 441/6434 [1:02:03<14:20:36,  8.62s/it, gpt_loss=0.3, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 441/6434 [1:02:11<14:20:36,  8.62s/it, gpt_loss=0.261, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 442/6434 [1:02:11<14:04:10,  8.45s/it, gpt_loss=0.261, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 442/6434 [1:02:19<14:04:10,  8.45s/it, gpt_loss=0.319, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 443/6434 [1:02:19<13:42:06,  8.23s/it, gpt_loss=0.319, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 443/6434 [1:02:28<13:42:06,  8.23s/it, gpt_loss=0.262, loss_mean=0.276][A[A
+
+Train step of epoch 1:   7%|▋         | 444/6434 [1:02:28<14:06:27,  8.48s/it, gpt_loss=0.262, loss_mean=0.276][A[A
+
+Train step of epoch 1:   7%|▋         | 444/6434 [1:02:36<14:06:27,  8.48s/it, gpt_loss=0.271, loss_mean=0.276][A[A
+
+Train step of epoch 1:   7%|▋         | 445/6434 [1:02:36<13:54:11,  8.36s/it, gpt_loss=0.271, loss_mean=0.276][A[A
+[LID Router Debug] Step: 6880
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [9, 2, 1, 0, 4, 1, 2, 2, 0, 1]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+
+
+Train step of epoch 1:   7%|▋         | 445/6434 [1:02:45<13:54:11,  8.36s/it, gpt_loss=0.264, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 446/6434 [1:02:45<13:57:45,  8.39s/it, gpt_loss=0.264, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 446/6434 [1:02:53<13:57:45,  8.39s/it, gpt_loss=0.246, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 447/6434 [1:02:53<14:06:07,  8.48s/it, gpt_loss=0.246, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 447/6434 [1:03:01<14:06:07,  8.48s/it, gpt_loss=0.303, loss_mean=0.275][A[A
+
+Train step of epoch 1:   7%|▋         | 448/6434 [1:03:01<13:52:51,  8.35s/it, gpt_loss=0.303, loss_mean=0.275][A[A
+
+Train step of epoch 1:   7%|▋         | 448/6434 [1:03:10<13:52:51,  8.35s/it, gpt_loss=0.257, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 449/6434 [1:03:10<14:08:55,  8.51s/it, gpt_loss=0.257, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 449/6434 [1:03:19<14:08:55,  8.51s/it, gpt_loss=0.255, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 450/6434 [1:03:19<14:04:57,  8.47s/it, gpt_loss=0.255, loss_mean=0.271][A[A
+
+Train step of epoch 1:   7%|▋         | 450/6434 [1:03:26<14:04:57,  8.47s/it, gpt_loss=0.336, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 451/6434 [1:03:26<13:37:38,  8.20s/it, gpt_loss=0.336, loss_mean=0.278][A[A
+
+Train step of epoch 1:   7%|▋         | 451/6434 [1:03:35<13:37:38,  8.20s/it, gpt_loss=0.217, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 452/6434 [1:03:35<13:48:33,  8.31s/it, gpt_loss=0.217, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 452/6434 [1:03:43<13:48:33,  8.31s/it, gpt_loss=0.228, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 453/6434 [1:03:43<13:39:08,  8.22s/it, gpt_loss=0.228, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 453/6434 [1:03:51<13:39:08,  8.22s/it, gpt_loss=0.334, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 454/6434 [1:03:51<13:49:55,  8.33s/it, gpt_loss=0.334, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 454/6434 [1:03:59<13:49:55,  8.33s/it, gpt_loss=0.31, loss_mean=0.278] [A[A
+
+Train step of epoch 1:   7%|▋         | 455/6434 [1:03:59<13:42:55,  8.26s/it, gpt_loss=0.31, loss_mean=0.278][A[A
+[LID Router Debug] Step: 6890
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [4, 2, 2, 5, 1, 9, 6, 4, 2, 9]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   7%|▋         | 455/6434 [1:04:08<13:42:55,  8.26s/it, gpt_loss=0.387, loss_mean=0.288][A[A
+
+Train step of epoch 1:   7%|▋         | 456/6434 [1:04:08<13:44:14,  8.27s/it, gpt_loss=0.387, loss_mean=0.288][A[A
+
+Train step of epoch 1:   7%|▋         | 456/6434 [1:04:15<13:44:14,  8.27s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:   7%|▋         | 457/6434 [1:04:15<13:25:59,  8.09s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:   7%|▋         | 457/6434 [1:04:24<13:25:59,  8.09s/it, gpt_loss=0.248, loss_mean=0.285][A[A
+
+Train step of epoch 1:   7%|▋         | 458/6434 [1:04:24<13:46:50,  8.30s/it, gpt_loss=0.248, loss_mean=0.285][A[A
+
+Train step of epoch 1:   7%|▋         | 458/6434 [1:04:32<13:46:50,  8.30s/it, gpt_loss=0.225, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 459/6434 [1:04:32<13:20:45,  8.04s/it, gpt_loss=0.225, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 459/6434 [1:04:41<13:20:45,  8.04s/it, gpt_loss=0.261, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 460/6434 [1:04:41<13:47:07,  8.31s/it, gpt_loss=0.261, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 460/6434 [1:04:49<13:47:07,  8.31s/it, gpt_loss=0.276, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 461/6434 [1:04:49<14:02:44,  8.47s/it, gpt_loss=0.276, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 461/6434 [1:04:59<14:02:44,  8.47s/it, gpt_loss=0.353, loss_mean=0.285][A[A
+
+Train step of epoch 1:   7%|▋         | 462/6434 [1:04:59<14:36:32,  8.81s/it, gpt_loss=0.353, loss_mean=0.285][A[A
+
+Train step of epoch 1:   7%|▋         | 462/6434 [1:05:07<14:36:32,  8.81s/it, gpt_loss=0.266, loss_mean=0.283][A[A
+
+Train step of epoch 1:   7%|▋         | 463/6434 [1:05:07<14:06:24,  8.51s/it, gpt_loss=0.266, loss_mean=0.283][A[A
+
+Train step of epoch 1:   7%|▋         | 463/6434 [1:05:14<14:06:24,  8.51s/it, gpt_loss=0.242, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 464/6434 [1:05:14<13:36:39,  8.21s/it, gpt_loss=0.242, loss_mean=0.279][A[A
+
+Train step of epoch 1:   7%|▋         | 464/6434 [1:05:22<13:36:39,  8.21s/it, gpt_loss=0.239, loss_mean=0.275][A[A
+
+Train step of epoch 1:   7%|▋         | 465/6434 [1:05:22<13:11:37,  7.96s/it, gpt_loss=0.239, loss_mean=0.275][A[A
+[LID Router Debug] Step: 6900
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [0, 9, 2, 5, 1, 9, 0, 1, 4, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:   7%|▋         | 465/6434 [1:05:30<13:11:37,  7.96s/it, gpt_loss=0.282, loss_mean=0.275][A[A
+
+Train step of epoch 1:   7%|▋         | 466/6434 [1:05:30<13:24:05,  8.08s/it, gpt_loss=0.282, loss_mean=0.275][A[A
+
+Train step of epoch 1:   7%|▋         | 466/6434 [1:05:39<13:24:05,  8.08s/it, gpt_loss=0.256, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 467/6434 [1:05:39<13:38:26,  8.23s/it, gpt_loss=0.256, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 467/6434 [1:05:47<13:38:26,  8.23s/it, gpt_loss=0.266, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 468/6434 [1:05:47<13:29:53,  8.15s/it, gpt_loss=0.266, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 468/6434 [1:05:56<13:29:53,  8.15s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 469/6434 [1:05:56<14:07:58,  8.53s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:   7%|▋         | 469/6434 [1:06:04<14:07:58,  8.53s/it, gpt_loss=0.294, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 470/6434 [1:06:04<13:57:09,  8.42s/it, gpt_loss=0.294, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 470/6434 [1:06:12<13:57:09,  8.42s/it, gpt_loss=0.286, loss_mean=0.276][A[A
+
+Train step of epoch 1:   7%|▋         | 471/6434 [1:06:12<13:38:37,  8.24s/it, gpt_loss=0.286, loss_mean=0.276][A[A
+
+Train step of epoch 1:   7%|▋         | 471/6434 [1:06:20<13:38:37,  8.24s/it, gpt_loss=0.251, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 472/6434 [1:06:20<13:27:07,  8.12s/it, gpt_loss=0.251, loss_mean=0.273][A[A
+
+Train step of epoch 1:   7%|▋         | 472/6434 [1:06:28<13:27:07,  8.12s/it, gpt_loss=0.281, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 473/6434 [1:06:28<13:40:56,  8.26s/it, gpt_loss=0.281, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 473/6434 [1:06:36<13:40:56,  8.26s/it, gpt_loss=0.275, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 474/6434 [1:06:36<13:31:33,  8.17s/it, gpt_loss=0.275, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 474/6434 [1:06:45<13:31:33,  8.17s/it, gpt_loss=0.229, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   7%|▋         | 475/6434 [1:06:45<13:34:05,  8.20s/it, gpt_loss=0.229, loss_mean=0.27][A[A
+[LID Router Debug] Step: 6910
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [2, 2, 3, 4, 2, 6, 3, 2, 3, 4]
+Active Experts in Batch: {2, 3, 4, 6}
+
+
+Train step of epoch 1:   7%|▋         | 475/6434 [1:06:54<13:34:05,  8.20s/it, gpt_loss=0.253, loss_mean=0.268][A[A
+
+Train step of epoch 1:   7%|▋         | 476/6434 [1:06:54<14:07:03,  8.53s/it, gpt_loss=0.253, loss_mean=0.268][A[A
+
+Train step of epoch 1:   7%|▋         | 476/6434 [1:07:02<14:07:03,  8.53s/it, gpt_loss=0.238, loss_mean=0.265][A[A
+
+Train step of epoch 1:   7%|▋         | 477/6434 [1:07:02<13:47:55,  8.34s/it, gpt_loss=0.238, loss_mean=0.265][A[A
+
+Train step of epoch 1:   7%|▋         | 477/6434 [1:07:11<13:47:55,  8.34s/it, gpt_loss=0.285, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 478/6434 [1:07:11<14:04:24,  8.51s/it, gpt_loss=0.285, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 478/6434 [1:07:18<14:04:24,  8.51s/it, gpt_loss=0.268, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 479/6434 [1:07:18<13:42:48,  8.29s/it, gpt_loss=0.268, loss_mean=0.267][A[A
+
+Train step of epoch 1:   7%|▋         | 479/6434 [1:07:26<13:42:48,  8.29s/it, gpt_loss=0.336, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 480/6434 [1:07:26<13:29:02,  8.15s/it, gpt_loss=0.336, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 480/6434 [1:07:35<13:29:02,  8.15s/it, gpt_loss=0.271, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 481/6434 [1:07:35<13:40:53,  8.27s/it, gpt_loss=0.271, loss_mean=0.274][A[A
+
+Train step of epoch 1:   7%|▋         | 481/6434 [1:07:44<13:40:53,  8.27s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 482/6434 [1:07:44<14:17:39,  8.65s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:   7%|▋         | 482/6434 [1:07:53<14:17:39,  8.65s/it, gpt_loss=0.291, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 483/6434 [1:07:53<14:06:47,  8.54s/it, gpt_loss=0.291, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 483/6434 [1:08:00<14:06:47,  8.54s/it, gpt_loss=0.219, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 484/6434 [1:08:00<13:36:38,  8.23s/it, gpt_loss=0.219, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 484/6434 [1:08:08<13:36:38,  8.23s/it, gpt_loss=0.299, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 485/6434 [1:08:08<13:30:09,  8.17s/it, gpt_loss=0.299, loss_mean=0.275][A[A
+[LID Router Debug] Step: 6920
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [1, 5, 4, 4, 4, 9, 1, 5, 3, 9]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   8%|▊         | 485/6434 [1:08:16<13:30:09,  8.17s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 486/6434 [1:08:16<13:17:06,  8.04s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 486/6434 [1:08:25<13:17:06,  8.04s/it, gpt_loss=0.213, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 487/6434 [1:08:25<13:57:01,  8.44s/it, gpt_loss=0.213, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 487/6434 [1:08:35<13:57:01,  8.44s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 488/6434 [1:08:35<14:23:40,  8.72s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 488/6434 [1:08:42<14:23:40,  8.72s/it, gpt_loss=0.287, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 489/6434 [1:08:42<13:50:17,  8.38s/it, gpt_loss=0.287, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 489/6434 [1:08:52<13:50:17,  8.38s/it, gpt_loss=0.257, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 490/6434 [1:08:52<14:28:24,  8.77s/it, gpt_loss=0.257, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 490/6434 [1:09:00<14:28:24,  8.77s/it, gpt_loss=0.314, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 491/6434 [1:09:00<14:10:20,  8.59s/it, gpt_loss=0.314, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 491/6434 [1:09:08<14:10:20,  8.59s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 492/6434 [1:09:08<13:44:17,  8.32s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 492/6434 [1:09:18<13:44:17,  8.32s/it, gpt_loss=0.307, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 493/6434 [1:09:18<14:31:58,  8.81s/it, gpt_loss=0.307, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 493/6434 [1:09:26<14:31:58,  8.81s/it, gpt_loss=0.286, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 494/6434 [1:09:26<14:13:03,  8.62s/it, gpt_loss=0.286, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 494/6434 [1:09:36<14:13:03,  8.62s/it, gpt_loss=0.24, loss_mean=0.277] [A[A
+
+Train step of epoch 1:   8%|▊         | 495/6434 [1:09:36<14:51:11,  9.00s/it, gpt_loss=0.24, loss_mean=0.277][A[A
+[LID Router Debug] Step: 6930
+Batch Size: 10
+Audio Batch Size: 138
+LID Assignments: [4, 5, 9, 0, 9, 3, 3, 1, 3, 0]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   8%|▊         | 495/6434 [1:09:45<14:51:11,  9.00s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 496/6434 [1:09:45<14:50:17,  9.00s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 496/6434 [1:09:53<14:50:17,  9.00s/it, gpt_loss=0.274, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 497/6434 [1:09:53<14:31:21,  8.81s/it, gpt_loss=0.274, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 497/6434 [1:10:01<14:31:21,  8.81s/it, gpt_loss=0.333, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 498/6434 [1:10:01<14:13:18,  8.63s/it, gpt_loss=0.333, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 498/6434 [1:10:09<14:13:18,  8.63s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 499/6434 [1:10:09<13:49:58,  8.39s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:   8%|▊         | 499/6434 [1:10:18<13:49:58,  8.39s/it, gpt_loss=0.306, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 500/6434 [1:10:18<14:11:05,  8.61s/it, gpt_loss=0.306, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 500/6434 [1:10:26<14:11:05,  8.61s/it, gpt_loss=0.31, loss_mean=0.286] [A[A
+
+Train step of epoch 1:   8%|▊         | 501/6434 [1:10:26<13:54:58,  8.44s/it, gpt_loss=0.31, loss_mean=0.286][A[A
+
+Train step of epoch 1:   8%|▊         | 501/6434 [1:10:35<13:54:58,  8.44s/it, gpt_loss=0.263, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 502/6434 [1:10:35<13:59:16,  8.49s/it, gpt_loss=0.263, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 502/6434 [1:10:44<13:59:16,  8.49s/it, gpt_loss=0.232, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 503/6434 [1:10:44<14:03:18,  8.53s/it, gpt_loss=0.232, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 503/6434 [1:10:51<14:03:18,  8.53s/it, gpt_loss=0.279, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 504/6434 [1:10:51<13:22:41,  8.12s/it, gpt_loss=0.279, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 504/6434 [1:11:00<13:22:41,  8.12s/it, gpt_loss=0.266, loss_mean=0.277][A[A
+
+Train step of epoch 1:   8%|▊         | 505/6434 [1:11:00<13:53:29,  8.43s/it, gpt_loss=0.266, loss_mean=0.277][A[A
+[LID Router Debug] Step: 6940
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [9, 2, 9, 4, 2, 3, 2, 9, 5, 3]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   8%|▊         | 505/6434 [1:11:09<13:53:29,  8.43s/it, gpt_loss=0.295, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 506/6434 [1:11:09<14:08:13,  8.59s/it, gpt_loss=0.295, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 506/6434 [1:11:17<14:08:13,  8.59s/it, gpt_loss=0.25, loss_mean=0.276] [A[A
+
+Train step of epoch 1:   8%|▊         | 507/6434 [1:11:17<13:51:39,  8.42s/it, gpt_loss=0.25, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 507/6434 [1:11:26<13:51:39,  8.42s/it, gpt_loss=0.27, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 508/6434 [1:11:26<14:14:16,  8.65s/it, gpt_loss=0.27, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 508/6434 [1:11:34<14:14:16,  8.65s/it, gpt_loss=0.248, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 509/6434 [1:11:34<13:49:13,  8.40s/it, gpt_loss=0.248, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 509/6434 [1:11:42<13:49:13,  8.40s/it, gpt_loss=0.34, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:   8%|▊         | 510/6434 [1:11:42<13:26:22,  8.17s/it, gpt_loss=0.34, loss_mean=0.28][A[A
+
+Train step of epoch 1:   8%|▊         | 510/6434 [1:11:50<13:26:22,  8.17s/it, gpt_loss=0.272, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 511/6434 [1:11:50<13:46:02,  8.37s/it, gpt_loss=0.272, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 511/6434 [1:11:59<13:46:02,  8.37s/it, gpt_loss=0.323, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 512/6434 [1:11:59<13:42:06,  8.33s/it, gpt_loss=0.323, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 512/6434 [1:12:08<13:42:06,  8.33s/it, gpt_loss=0.366, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 513/6434 [1:12:08<14:00:38,  8.52s/it, gpt_loss=0.366, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 513/6434 [1:12:17<14:00:38,  8.52s/it, gpt_loss=0.292, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 514/6434 [1:12:17<14:27:42,  8.79s/it, gpt_loss=0.292, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 514/6434 [1:12:24<14:27:42,  8.79s/it, gpt_loss=0.238, loss_mean=0.286][A[A
+
+Train step of epoch 1:   8%|▊         | 515/6434 [1:12:24<13:37:15,  8.28s/it, gpt_loss=0.238, loss_mean=0.286][A[A
+[LID Router Debug] Step: 6950
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [0, 4, 6, 0, 3, 3, 2, 9, 2, 3]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:   8%|▊         | 515/6434 [1:12:33<13:37:15,  8.28s/it, gpt_loss=0.245, loss_mean=0.282][A[A
+
+Train step of epoch 1:   8%|▊         | 516/6434 [1:12:33<13:42:15,  8.34s/it, gpt_loss=0.245, loss_mean=0.282][A[A
+
+Train step of epoch 1:   8%|▊         | 516/6434 [1:12:40<13:42:15,  8.34s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   8%|▊         | 517/6434 [1:12:40<13:20:11,  8.11s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:   8%|▊         | 517/6434 [1:12:49<13:20:11,  8.11s/it, gpt_loss=0.206, loss_mean=0.277][A[A
+
+Train step of epoch 1:   8%|▊         | 518/6434 [1:12:49<13:33:09,  8.25s/it, gpt_loss=0.206, loss_mean=0.277][A[A
+
+Train step of epoch 1:   8%|▊         | 518/6434 [1:12:57<13:33:09,  8.25s/it, gpt_loss=0.256, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 519/6434 [1:12:57<13:41:42,  8.34s/it, gpt_loss=0.256, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 519/6434 [1:13:05<13:41:42,  8.34s/it, gpt_loss=0.306, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 520/6434 [1:13:05<13:20:54,  8.13s/it, gpt_loss=0.306, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 520/6434 [1:13:14<13:20:54,  8.13s/it, gpt_loss=0.226, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 521/6434 [1:13:14<13:51:09,  8.43s/it, gpt_loss=0.226, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 521/6434 [1:13:22<13:51:09,  8.43s/it, gpt_loss=0.274, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 522/6434 [1:13:22<13:36:29,  8.29s/it, gpt_loss=0.274, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 522/6434 [1:13:31<13:36:29,  8.29s/it, gpt_loss=0.313, loss_mean=0.277][A[A
+
+Train step of epoch 1:   8%|▊         | 523/6434 [1:13:31<13:49:46,  8.42s/it, gpt_loss=0.313, loss_mean=0.277][A[A
+
+Train step of epoch 1:   8%|▊         | 523/6434 [1:13:39<13:49:46,  8.42s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 524/6434 [1:13:39<13:46:42,  8.39s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 524/6434 [1:13:47<13:46:42,  8.39s/it, gpt_loss=0.273, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 525/6434 [1:13:47<13:37:48,  8.30s/it, gpt_loss=0.273, loss_mean=0.274][A[A
+[LID Router Debug] Step: 6960
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 4, 5, 3, 3, 1, 5, 1, 0, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+
+
+Train step of epoch 1:   8%|▊         | 525/6434 [1:13:55<13:37:48,  8.30s/it, gpt_loss=0.247, loss_mean=0.271][A[A
+
+Train step of epoch 1:   8%|▊         | 526/6434 [1:13:55<13:35:13,  8.28s/it, gpt_loss=0.247, loss_mean=0.271][A[A
+
+Train step of epoch 1:   8%|▊         | 526/6434 [1:14:03<13:35:13,  8.28s/it, gpt_loss=0.275, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 527/6434 [1:14:03<13:22:01,  8.15s/it, gpt_loss=0.275, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 527/6434 [1:14:12<13:22:01,  8.15s/it, gpt_loss=0.302, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 528/6434 [1:14:12<13:35:32,  8.29s/it, gpt_loss=0.302, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 528/6434 [1:14:19<13:35:32,  8.29s/it, gpt_loss=0.25, loss_mean=0.272] [A[A
+
+Train step of epoch 1:   8%|▊         | 529/6434 [1:14:19<13:12:24,  8.05s/it, gpt_loss=0.25, loss_mean=0.272][A[A
+
+Train step of epoch 1:   8%|▊         | 529/6434 [1:14:27<13:12:24,  8.05s/it, gpt_loss=0.265, loss_mean=0.271][A[A
+
+Train step of epoch 1:   8%|▊         | 530/6434 [1:14:27<13:15:02,  8.08s/it, gpt_loss=0.265, loss_mean=0.271][A[A
+
+Train step of epoch 1:   8%|▊         | 530/6434 [1:14:36<13:15:02,  8.08s/it, gpt_loss=0.353, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   8%|▊         | 531/6434 [1:14:36<13:26:23,  8.20s/it, gpt_loss=0.353, loss_mean=0.28][A[A
+
+Train step of epoch 1:   8%|▊         | 531/6434 [1:14:45<13:26:23,  8.20s/it, gpt_loss=0.232, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 532/6434 [1:14:45<13:55:10,  8.49s/it, gpt_loss=0.232, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 532/6434 [1:14:53<13:55:10,  8.49s/it, gpt_loss=0.308, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 533/6434 [1:14:53<13:41:06,  8.35s/it, gpt_loss=0.308, loss_mean=0.278][A[A
+
+Train step of epoch 1:   8%|▊         | 533/6434 [1:15:01<13:41:06,  8.35s/it, gpt_loss=0.245, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 534/6434 [1:15:01<13:21:47,  8.15s/it, gpt_loss=0.245, loss_mean=0.275][A[A
+
+Train step of epoch 1:   8%|▊         | 534/6434 [1:15:09<13:21:47,  8.15s/it, gpt_loss=0.347, loss_mean=0.282][A[A
+
+Train step of epoch 1:   8%|▊         | 535/6434 [1:15:09<13:17:38,  8.11s/it, gpt_loss=0.347, loss_mean=0.282][A[A
+[LID Router Debug] Step: 6970
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [9, 6, 1, 0, 4, 8, 9, 0, 9, 5]
+Active Experts in Batch: {0, 1, 4, 5, 6, 8, 9}
+
+
+Train step of epoch 1:   8%|▊         | 535/6434 [1:15:18<13:17:38,  8.11s/it, gpt_loss=0.375, loss_mean=0.291][A[A
+
+Train step of epoch 1:   8%|▊         | 536/6434 [1:15:18<14:00:28,  8.55s/it, gpt_loss=0.375, loss_mean=0.291][A[A
+
+Train step of epoch 1:   8%|▊         | 536/6434 [1:15:27<14:00:28,  8.55s/it, gpt_loss=0.302, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 537/6434 [1:15:27<13:58:33,  8.53s/it, gpt_loss=0.302, loss_mean=0.292][A[A
+
+Train step of epoch 1:   8%|▊         | 537/6434 [1:15:37<13:58:33,  8.53s/it, gpt_loss=0.275, loss_mean=0.291][A[A
+
+Train step of epoch 1:   8%|▊         | 538/6434 [1:15:37<14:41:40,  8.97s/it, gpt_loss=0.275, loss_mean=0.291][A[A
+
+Train step of epoch 1:   8%|▊         | 538/6434 [1:15:46<14:41:40,  8.97s/it, gpt_loss=0.247, loss_mean=0.286][A[A
+
+Train step of epoch 1:   8%|▊         | 539/6434 [1:15:46<14:46:34,  9.02s/it, gpt_loss=0.247, loss_mean=0.286][A[A
+
+Train step of epoch 1:   8%|▊         | 539/6434 [1:15:55<14:46:34,  9.02s/it, gpt_loss=0.307, loss_mean=0.288][A[A
+
+Train step of epoch 1:   8%|▊         | 540/6434 [1:15:55<14:57:11,  9.13s/it, gpt_loss=0.307, loss_mean=0.288][A[A
+
+Train step of epoch 1:   8%|▊         | 540/6434 [1:16:05<14:57:11,  9.13s/it, gpt_loss=0.248, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 541/6434 [1:16:05<15:06:12,  9.23s/it, gpt_loss=0.248, loss_mean=0.284][A[A
+
+Train step of epoch 1:   8%|▊         | 541/6434 [1:16:13<15:06:12,  9.23s/it, gpt_loss=0.231, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 542/6434 [1:16:13<14:38:05,  8.94s/it, gpt_loss=0.231, loss_mean=0.279][A[A
+
+Train step of epoch 1:   8%|▊         | 542/6434 [1:16:21<14:38:05,  8.94s/it, gpt_loss=0.314, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 543/6434 [1:16:21<13:52:20,  8.48s/it, gpt_loss=0.314, loss_mean=0.283][A[A
+
+Train step of epoch 1:   8%|▊         | 543/6434 [1:16:29<13:52:20,  8.48s/it, gpt_loss=0.195, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 544/6434 [1:16:29<13:40:18,  8.36s/it, gpt_loss=0.195, loss_mean=0.274][A[A
+
+Train step of epoch 1:   8%|▊         | 544/6434 [1:16:36<13:40:18,  8.36s/it, gpt_loss=0.269, loss_mean=0.273][A[A
+
+Train step of epoch 1:   8%|▊         | 545/6434 [1:16:36<13:12:18,  8.07s/it, gpt_loss=0.269, loss_mean=0.273][A[A
+[LID Router Debug] Step: 6980
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [0, 0, 0, 3, 4, 4, 3, 1, 10, 9]
+Active Experts in Batch: {0, 1, 3, 4, 9, 10}
+
+
+Train step of epoch 1:   8%|▊         | 545/6434 [1:16:44<13:12:18,  8.07s/it, gpt_loss=0.297, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 546/6434 [1:16:44<13:04:51,  8.00s/it, gpt_loss=0.297, loss_mean=0.276][A[A
+
+Train step of epoch 1:   8%|▊         | 546/6434 [1:16:52<13:04:51,  8.00s/it, gpt_loss=0.315, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   9%|▊         | 547/6434 [1:16:52<13:21:10,  8.17s/it, gpt_loss=0.315, loss_mean=0.28][A[A
+
+Train step of epoch 1:   9%|▊         | 547/6434 [1:17:01<13:21:10,  8.17s/it, gpt_loss=0.221, loss_mean=0.274][A[A
+
+Train step of epoch 1:   9%|▊         | 548/6434 [1:17:01<13:24:23,  8.20s/it, gpt_loss=0.221, loss_mean=0.274][A[A
+
+Train step of epoch 1:   9%|▊         | 548/6434 [1:17:08<13:24:23,  8.20s/it, gpt_loss=0.391, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▊         | 549/6434 [1:17:08<13:11:36,  8.07s/it, gpt_loss=0.391, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▊         | 549/6434 [1:17:17<13:11:36,  8.07s/it, gpt_loss=0.339, loss_mean=0.291][A[A
+
+Train step of epoch 1:   9%|▊         | 550/6434 [1:17:17<13:31:50,  8.28s/it, gpt_loss=0.339, loss_mean=0.291][A[A
+
+Train step of epoch 1:   9%|▊         | 550/6434 [1:17:26<13:31:50,  8.28s/it, gpt_loss=0.219, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▊         | 551/6434 [1:17:26<13:36:32,  8.33s/it, gpt_loss=0.219, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▊         | 551/6434 [1:17:34<13:36:32,  8.33s/it, gpt_loss=0.231, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▊         | 552/6434 [1:17:34<13:27:39,  8.24s/it, gpt_loss=0.231, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▊         | 552/6434 [1:17:44<13:27:39,  8.24s/it, gpt_loss=0.315, loss_mean=0.282][A[A
+
+Train step of epoch 1:   9%|▊         | 553/6434 [1:17:44<14:38:56,  8.97s/it, gpt_loss=0.315, loss_mean=0.282][A[A
+
+Train step of epoch 1:   9%|▊         | 553/6434 [1:17:54<14:38:56,  8.97s/it, gpt_loss=0.385, loss_mean=0.292][A[A
+
+Train step of epoch 1:   9%|▊         | 554/6434 [1:17:54<14:54:14,  9.12s/it, gpt_loss=0.385, loss_mean=0.292][A[A
+
+Train step of epoch 1:   9%|▊         | 554/6434 [1:18:02<14:54:14,  9.12s/it, gpt_loss=0.272, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   9%|▊         | 555/6434 [1:18:02<14:28:03,  8.86s/it, gpt_loss=0.272, loss_mean=0.29][A[A
+[LID Router Debug] Step: 6990
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [5, 2, 0, 9, 0, 2, 1, 1, 4, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:   9%|▊         | 555/6434 [1:18:10<14:28:03,  8.86s/it, gpt_loss=0.23, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▊         | 556/6434 [1:18:10<14:08:05,  8.66s/it, gpt_loss=0.23, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▊         | 556/6434 [1:18:19<14:08:05,  8.66s/it, gpt_loss=0.235, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▊         | 557/6434 [1:18:19<14:12:24,  8.70s/it, gpt_loss=0.235, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▊         | 557/6434 [1:18:28<14:12:24,  8.70s/it, gpt_loss=0.266, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▊         | 558/6434 [1:18:28<14:28:21,  8.87s/it, gpt_loss=0.266, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▊         | 558/6434 [1:18:38<14:28:21,  8.87s/it, gpt_loss=0.271, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▊         | 559/6434 [1:18:38<14:38:57,  8.98s/it, gpt_loss=0.271, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▊         | 559/6434 [1:18:46<14:38:57,  8.98s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   9%|▊         | 560/6434 [1:18:46<14:13:22,  8.72s/it, gpt_loss=0.246, loss_mean=0.274][A[A
+
+Train step of epoch 1:   9%|▊         | 560/6434 [1:18:54<14:13:22,  8.72s/it, gpt_loss=0.291, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▊         | 561/6434 [1:18:54<14:12:26,  8.71s/it, gpt_loss=0.291, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▊         | 561/6434 [1:19:02<14:12:26,  8.71s/it, gpt_loss=0.207, loss_mean=0.269][A[A
+
+Train step of epoch 1:   9%|▊         | 562/6434 [1:19:02<13:35:41,  8.33s/it, gpt_loss=0.207, loss_mean=0.269][A[A
+
+Train step of epoch 1:   9%|▊         | 562/6434 [1:19:09<13:35:41,  8.33s/it, gpt_loss=0.239, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 563/6434 [1:19:09<13:11:55,  8.09s/it, gpt_loss=0.239, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 563/6434 [1:19:17<13:11:55,  8.09s/it, gpt_loss=0.366, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 564/6434 [1:19:17<12:58:38,  7.96s/it, gpt_loss=0.366, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 564/6434 [1:19:25<12:58:38,  7.96s/it, gpt_loss=0.287, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▉         | 565/6434 [1:19:25<12:52:56,  7.90s/it, gpt_loss=0.287, loss_mean=0.277][A[A
+[LID Router Debug] Step: 7000
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [5, 2, 9, 1, 3, 0, 0, 5, 5, 0]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+[2026-02-07 08:21:20,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=0, lr=[1.4365904458246237e-05, 1.4365904458246237e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 08:21:20,559] [INFO] [timer.py:260:stop] epoch=0/micro_step=7000/global_step=3500, RunningAvgSamplesPerSec=4.746697179253986, CurrSamplesPerSec=4.872603193731433, MemAllocated=12.64GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:   9%|▉         | 565/6434 [1:19:33<12:52:56,  7.90s/it, gpt_loss=0.205, loss_mean=0.27] [A[A[2026-02-07 08:21:20,563] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is about to be saved!
+/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1898: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+  warnings.warn(
+/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1898: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+  warnings.warn(
+/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1898: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+  warnings.warn(
+/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1898: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
+  warnings.warn(
+[2026-02-07 08:21:22,791] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt
+[2026-02-07 08:21:22,791] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt...
+[2026-02-07 08:21:29,652] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/mp_rank_00_model_states.pt.
+[2026-02-07 08:21:29,656] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2026-02-07 08:21:29,656] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt...
+[2026-02-07 08:21:29,656] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt...
+[2026-02-07 08:21:29,656] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt...
+[2026-02-07 08:21:30,634] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.
+[2026-02-07 08:21:30,634] [INFO] [engine.py:3487:_save_zero_checkpoint] zero checkpoint saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
+[2026-02-07 08:21:30,634] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step7000 is ready now!
+[2026-02-07 08:21:30,641] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.
+[2026-02-07 08:21:30,641] [INFO] [engine.py:3487:_save_zero_checkpoint] zero checkpoint saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
+[2026-02-07 08:21:30,641] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step7000 is ready now!
+[2026-02-07 08:21:30,687] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.
+[2026-02-07 08:21:30,711] [INFO] [engine.py:3487:_save_zero_checkpoint] zero checkpoint saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
+[2026-02-07 08:21:30,711] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step7000 is ready now!
+[2026-02-07 08:21:30,747] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2026-02-07 08:21:30,749] [INFO] [engine.py:3487:_save_zero_checkpoint] zero checkpoint saved /fs/nlp/common_intern/meiyuxiang/assets/multilingual/qwen3-1.7b-whisper-0205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/ckpts/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2026-02-07 08:21:30,749] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step7000 is ready now!
+
+
+Train step of epoch 1:   9%|▉         | 566/6434 [1:19:44<18:14:34, 11.19s/it, gpt_loss=0.205, loss_mean=0.27][A[A
+
+Train step of epoch 1:   9%|▉         | 566/6434 [1:19:52<18:14:34, 11.19s/it, gpt_loss=0.274, loss_mean=0.27][A[A
+
+Train step of epoch 1:   9%|▉         | 567/6434 [1:19:52<16:38:51, 10.22s/it, gpt_loss=0.274, loss_mean=0.27][A[A
+
+Train step of epoch 1:   9%|▉         | 567/6434 [1:19:59<16:38:51, 10.22s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:   9%|▉         | 568/6434 [1:19:59<15:04:28,  9.25s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:   9%|▉         | 568/6434 [1:20:07<15:04:28,  9.25s/it, gpt_loss=0.283, loss_mean=0.272][A[A
+
+Train step of epoch 1:   9%|▉         | 569/6434 [1:20:07<14:51:54,  9.12s/it, gpt_loss=0.283, loss_mean=0.272][A[A
+
+Train step of epoch 1:   9%|▉         | 569/6434 [1:20:16<14:51:54,  9.12s/it, gpt_loss=0.335, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▉         | 570/6434 [1:20:16<14:26:49,  8.87s/it, gpt_loss=0.335, loss_mean=0.279][A[A
+
+Train step of epoch 1:   9%|▉         | 570/6434 [1:20:26<14:26:49,  8.87s/it, gpt_loss=0.257, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 571/6434 [1:20:26<14:54:39,  9.16s/it, gpt_loss=0.257, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 571/6434 [1:20:34<14:54:39,  9.16s/it, gpt_loss=0.313, loss_mean=0.28] [A[A
+
+Train step of epoch 1:   9%|▉         | 572/6434 [1:20:34<14:42:05,  9.03s/it, gpt_loss=0.313, loss_mean=0.28][A[A
+
+Train step of epoch 1:   9%|▉         | 572/6434 [1:20:42<14:42:05,  9.03s/it, gpt_loss=0.286, loss_mean=0.281][A[A
+
+Train step of epoch 1:   9%|▉         | 573/6434 [1:20:42<13:54:26,  8.54s/it, gpt_loss=0.286, loss_mean=0.281][A[A
+
+Train step of epoch 1:   9%|▉         | 573/6434 [1:20:50<13:54:26,  8.54s/it, gpt_loss=0.295, loss_mean=0.282][A[A
+
+Train step of epoch 1:   9%|▉         | 574/6434 [1:20:50<13:43:27,  8.43s/it, gpt_loss=0.295, loss_mean=0.282][A[A
+
+Train step of epoch 1:   9%|▉         | 574/6434 [1:20:59<13:43:27,  8.43s/it, gpt_loss=0.241, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 575/6434 [1:20:59<14:15:33,  8.76s/it, gpt_loss=0.241, loss_mean=0.278][A[A
+[LID Router Debug] Step: 7010
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [5, 0, 4, 5, 9, 3, 1, 1, 9, 6]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:   9%|▉         | 575/6434 [1:21:08<14:15:33,  8.76s/it, gpt_loss=0.254, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 576/6434 [1:21:08<14:14:36,  8.75s/it, gpt_loss=0.254, loss_mean=0.276][A[A
+
+Train step of epoch 1:   9%|▉         | 576/6434 [1:21:16<14:14:36,  8.75s/it, gpt_loss=0.25, loss_mean=0.273] [A[A
+
+Train step of epoch 1:   9%|▉         | 577/6434 [1:21:16<13:40:48,  8.41s/it, gpt_loss=0.25, loss_mean=0.273][A[A
+
+Train step of epoch 1:   9%|▉         | 577/6434 [1:21:24<13:40:48,  8.41s/it, gpt_loss=0.294, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 578/6434 [1:21:24<13:42:39,  8.43s/it, gpt_loss=0.294, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 578/6434 [1:21:34<13:42:39,  8.43s/it, gpt_loss=0.272, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 579/6434 [1:21:34<14:08:22,  8.69s/it, gpt_loss=0.272, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 579/6434 [1:21:43<14:08:22,  8.69s/it, gpt_loss=0.307, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 580/6434 [1:21:43<14:29:05,  8.91s/it, gpt_loss=0.307, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 580/6434 [1:21:52<14:29:05,  8.91s/it, gpt_loss=0.274, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 581/6434 [1:21:52<14:20:16,  8.82s/it, gpt_loss=0.274, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 581/6434 [1:22:01<14:20:16,  8.82s/it, gpt_loss=0.236, loss_mean=0.273][A[A
+
+Train step of epoch 1:   9%|▉         | 582/6434 [1:22:01<14:31:40,  8.94s/it, gpt_loss=0.236, loss_mean=0.273][A[A
+
+Train step of epoch 1:   9%|▉         | 582/6434 [1:22:10<14:31:40,  8.94s/it, gpt_loss=0.246, loss_mean=0.271][A[A
+
+Train step of epoch 1:   9%|▉         | 583/6434 [1:22:10<14:42:06,  9.05s/it, gpt_loss=0.246, loss_mean=0.271][A[A
+
+Train step of epoch 1:   9%|▉         | 583/6434 [1:22:18<14:42:06,  9.05s/it, gpt_loss=0.217, loss_mean=0.265][A[A
+
+Train step of epoch 1:   9%|▉         | 584/6434 [1:22:18<14:21:35,  8.84s/it, gpt_loss=0.217, loss_mean=0.265][A[A
+
+Train step of epoch 1:   9%|▉         | 584/6434 [1:22:28<14:21:35,  8.84s/it, gpt_loss=0.211, loss_mean=0.26] [A[A
+
+Train step of epoch 1:   9%|▉         | 585/6434 [1:22:28<14:29:33,  8.92s/it, gpt_loss=0.211, loss_mean=0.26][A[A
+[LID Router Debug] Step: 7020
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [9, 5, 4, 4, 3, 3, 1, 3, 2, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:   9%|▉         | 585/6434 [1:22:37<14:29:33,  8.92s/it, gpt_loss=0.322, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 586/6434 [1:22:37<14:34:15,  8.97s/it, gpt_loss=0.322, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 586/6434 [1:22:47<14:34:15,  8.97s/it, gpt_loss=0.264, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 587/6434 [1:22:47<15:21:41,  9.46s/it, gpt_loss=0.264, loss_mean=0.266][A[A
+
+Train step of epoch 1:   9%|▉         | 587/6434 [1:22:55<15:21:41,  9.46s/it, gpt_loss=0.311, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   9%|▉         | 588/6434 [1:22:55<14:25:54,  8.89s/it, gpt_loss=0.311, loss_mean=0.27][A[A
+
+Train step of epoch 1:   9%|▉         | 588/6434 [1:23:03<14:25:54,  8.89s/it, gpt_loss=0.338, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▉         | 589/6434 [1:23:03<13:54:18,  8.56s/it, gpt_loss=0.338, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▉         | 589/6434 [1:23:11<13:54:18,  8.56s/it, gpt_loss=0.28, loss_mean=0.277] [A[A
+
+Train step of epoch 1:   9%|▉         | 590/6434 [1:23:11<14:02:25,  8.65s/it, gpt_loss=0.28, loss_mean=0.277][A[A
+
+Train step of epoch 1:   9%|▉         | 590/6434 [1:23:21<14:02:25,  8.65s/it, gpt_loss=0.381, loss_mean=0.288][A[A
+
+Train step of epoch 1:   9%|▉         | 591/6434 [1:23:21<14:36:10,  9.00s/it, gpt_loss=0.381, loss_mean=0.288][A[A
+
+Train step of epoch 1:   9%|▉         | 591/6434 [1:23:30<14:36:10,  9.00s/it, gpt_loss=0.247, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▉         | 592/6434 [1:23:30<14:20:38,  8.84s/it, gpt_loss=0.247, loss_mean=0.284][A[A
+
+Train step of epoch 1:   9%|▉         | 592/6434 [1:23:37<14:20:38,  8.84s/it, gpt_loss=0.307, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▉         | 593/6434 [1:23:37<13:46:57,  8.49s/it, gpt_loss=0.307, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▉         | 593/6434 [1:23:46<13:46:57,  8.49s/it, gpt_loss=0.274, loss_mean=0.285][A[A
+
+Train step of epoch 1:   9%|▉         | 594/6434 [1:23:46<13:41:34,  8.44s/it, gpt_loss=0.274, loss_mean=0.285][A[A
+
+Train step of epoch 1:   9%|▉         | 594/6434 [1:23:54<13:41:34,  8.44s/it, gpt_loss=0.268, loss_mean=0.283][A[A
+
+Train step of epoch 1:   9%|▉         | 595/6434 [1:23:54<13:50:48,  8.54s/it, gpt_loss=0.268, loss_mean=0.283][A[A
+[LID Router Debug] Step: 7030
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [5, 9, 1, 3, 0, 3, 9, 0, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:   9%|▉         | 595/6434 [1:24:02<13:50:48,  8.54s/it, gpt_loss=0.327, loss_mean=0.287][A[A
+
+Train step of epoch 1:   9%|▉         | 596/6434 [1:24:02<13:14:31,  8.17s/it, gpt_loss=0.327, loss_mean=0.287][A[A
+
+Train step of epoch 1:   9%|▉         | 596/6434 [1:24:11<13:14:31,  8.17s/it, gpt_loss=0.344, loss_mean=0.293][A[A
+
+Train step of epoch 1:   9%|▉         | 597/6434 [1:24:11<13:54:02,  8.57s/it, gpt_loss=0.344, loss_mean=0.293][A[A
+
+Train step of epoch 1:   9%|▉         | 597/6434 [1:24:19<13:54:02,  8.57s/it, gpt_loss=0.222, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▉         | 598/6434 [1:24:19<13:17:36,  8.20s/it, gpt_loss=0.222, loss_mean=0.286][A[A
+
+Train step of epoch 1:   9%|▉         | 598/6434 [1:24:28<13:17:36,  8.20s/it, gpt_loss=0.388, loss_mean=0.296][A[A
+
+Train step of epoch 1:   9%|▉         | 599/6434 [1:24:28<13:42:55,  8.46s/it, gpt_loss=0.388, loss_mean=0.296][A[A
+
+Train step of epoch 1:   9%|▉         | 599/6434 [1:24:37<13:42:55,  8.46s/it, gpt_loss=0.264, loss_mean=0.293][A[A
+
+Train step of epoch 1:   9%|▉         | 600/6434 [1:24:37<14:09:01,  8.73s/it, gpt_loss=0.264, loss_mean=0.293][A[A
+
+Train step of epoch 1:   9%|▉         | 600/6434 [1:24:45<14:09:01,  8.73s/it, gpt_loss=0.258, loss_mean=0.29] [A[A
+
+Train step of epoch 1:   9%|▉         | 601/6434 [1:24:45<13:56:34,  8.61s/it, gpt_loss=0.258, loss_mean=0.29][A[A
+
+Train step of epoch 1:   9%|▉         | 601/6434 [1:24:54<13:56:34,  8.61s/it, gpt_loss=0.266, loss_mean=0.287][A[A
+
+Train step of epoch 1:   9%|▉         | 602/6434 [1:24:54<14:06:43,  8.71s/it, gpt_loss=0.266, loss_mean=0.287][A[A
+
+Train step of epoch 1:   9%|▉         | 602/6434 [1:25:02<14:06:43,  8.71s/it, gpt_loss=0.31, loss_mean=0.289] [A[A
+
+Train step of epoch 1:   9%|▉         | 603/6434 [1:25:02<13:43:55,  8.48s/it, gpt_loss=0.31, loss_mean=0.289][A[A
+
+Train step of epoch 1:   9%|▉         | 603/6434 [1:25:11<13:43:55,  8.48s/it, gpt_loss=0.207, loss_mean=0.281][A[A
+
+Train step of epoch 1:   9%|▉         | 604/6434 [1:25:11<13:53:08,  8.57s/it, gpt_loss=0.207, loss_mean=0.281][A[A
+
+Train step of epoch 1:   9%|▉         | 604/6434 [1:25:19<13:53:08,  8.57s/it, gpt_loss=0.251, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 605/6434 [1:25:19<13:37:22,  8.41s/it, gpt_loss=0.251, loss_mean=0.278][A[A
+[LID Router Debug] Step: 7040
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 3, 0, 2, 1, 1, 2, 5, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:   9%|▉         | 605/6434 [1:25:27<13:37:22,  8.41s/it, gpt_loss=0.275, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 606/6434 [1:25:27<13:13:37,  8.17s/it, gpt_loss=0.275, loss_mean=0.278][A[A
+
+Train step of epoch 1:   9%|▉         | 606/6434 [1:25:35<13:13:37,  8.17s/it, gpt_loss=0.332, loss_mean=0.283][A[A
+
+Train step of epoch 1:   9%|▉         | 607/6434 [1:25:35<13:17:27,  8.21s/it, gpt_loss=0.332, loss_mean=0.283][A[A
+
+Train step of epoch 1:   9%|▉         | 607/6434 [1:25:44<13:17:27,  8.21s/it, gpt_loss=0.198, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 608/6434 [1:25:44<13:37:01,  8.41s/it, gpt_loss=0.198, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 608/6434 [1:25:52<13:37:01,  8.41s/it, gpt_loss=0.278, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 609/6434 [1:25:52<13:14:12,  8.18s/it, gpt_loss=0.278, loss_mean=0.275][A[A
+
+Train step of epoch 1:   9%|▉         | 609/6434 [1:26:00<13:14:12,  8.18s/it, gpt_loss=0.223, loss_mean=0.27] [A[A
+
+Train step of epoch 1:   9%|▉         | 610/6434 [1:26:00<13:34:00,  8.39s/it, gpt_loss=0.223, loss_mean=0.27][A[A
+
+Train step of epoch 1:   9%|▉         | 610/6434 [1:26:09<13:34:00,  8.39s/it, gpt_loss=0.294, loss_mean=0.272][A[A
+
+Train step of epoch 1:   9%|▉         | 611/6434 [1:26:09<13:28:03,  8.33s/it, gpt_loss=0.294, loss_mean=0.272][A[A
+
+Train step of epoch 1:   9%|▉         | 611/6434 [1:26:16<13:28:03,  8.33s/it, gpt_loss=0.263, loss_mean=0.271][A[A
+
+Train step of epoch 1:  10%|▉         | 612/6434 [1:26:16<13:14:53,  8.19s/it, gpt_loss=0.263, loss_mean=0.271][A[A
+
+Train step of epoch 1:  10%|▉         | 612/6434 [1:26:24<13:14:53,  8.19s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  10%|▉         | 613/6434 [1:26:24<13:08:18,  8.13s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  10%|▉         | 613/6434 [1:26:34<13:08:18,  8.13s/it, gpt_loss=0.258, loss_mean=0.267][A[A
+
+Train step of epoch 1:  10%|▉         | 614/6434 [1:26:34<13:39:50,  8.45s/it, gpt_loss=0.258, loss_mean=0.267][A[A
+
+Train step of epoch 1:  10%|▉         | 614/6434 [1:26:42<13:39:50,  8.45s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+
+Train step of epoch 1:  10%|▉         | 615/6434 [1:26:42<13:47:57,  8.54s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7050
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [0, 5, 9, 5, 3, 9, 2, 3, 3, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  10%|▉         | 615/6434 [1:26:51<13:47:57,  8.54s/it, gpt_loss=0.346, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|▉         | 616/6434 [1:26:51<13:43:02,  8.49s/it, gpt_loss=0.346, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|▉         | 616/6434 [1:27:00<13:43:02,  8.49s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 617/6434 [1:27:00<14:15:45,  8.83s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 617/6434 [1:27:10<14:15:45,  8.83s/it, gpt_loss=0.312, loss_mean=0.279][A[A
+
+Train step of epoch 1:  10%|▉         | 618/6434 [1:27:10<14:25:29,  8.93s/it, gpt_loss=0.312, loss_mean=0.279][A[A
+
+Train step of epoch 1:  10%|▉         | 618/6434 [1:27:17<14:25:29,  8.93s/it, gpt_loss=0.244, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 619/6434 [1:27:17<13:34:48,  8.41s/it, gpt_loss=0.244, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 619/6434 [1:27:25<13:34:48,  8.41s/it, gpt_loss=0.29, loss_mean=0.277] [A[A
+
+Train step of epoch 1:  10%|▉         | 620/6434 [1:27:25<13:22:38,  8.28s/it, gpt_loss=0.29, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 620/6434 [1:27:33<13:22:38,  8.28s/it, gpt_loss=0.28, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 621/6434 [1:27:33<13:25:02,  8.31s/it, gpt_loss=0.28, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 621/6434 [1:27:42<13:25:02,  8.31s/it, gpt_loss=0.317, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 622/6434 [1:27:42<13:50:51,  8.58s/it, gpt_loss=0.317, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 622/6434 [1:27:51<13:50:51,  8.58s/it, gpt_loss=0.275, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  10%|▉         | 623/6434 [1:27:51<13:43:57,  8.51s/it, gpt_loss=0.275, loss_mean=0.28][A[A
+
+Train step of epoch 1:  10%|▉         | 623/6434 [1:28:00<13:43:57,  8.51s/it, gpt_loss=0.281, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 624/6434 [1:28:00<14:00:50,  8.68s/it, gpt_loss=0.281, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 624/6434 [1:28:07<14:00:50,  8.68s/it, gpt_loss=0.27, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  10%|▉         | 625/6434 [1:28:07<13:30:23,  8.37s/it, gpt_loss=0.27, loss_mean=0.279][A[A
+[LID Router Debug] Step: 7060
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [11, 2, 5, 3, 1, 9, 1, 4, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9, 11}
+
+
+Train step of epoch 1:  10%|▉         | 625/6434 [1:28:17<13:30:23,  8.37s/it, gpt_loss=0.251, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 626/6434 [1:28:17<14:02:56,  8.71s/it, gpt_loss=0.251, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 626/6434 [1:28:25<14:02:56,  8.71s/it, gpt_loss=0.365, loss_mean=0.285][A[A
+
+Train step of epoch 1:  10%|▉         | 627/6434 [1:28:25<13:52:25,  8.60s/it, gpt_loss=0.365, loss_mean=0.285][A[A
+
+Train step of epoch 1:  10%|▉         | 627/6434 [1:28:34<13:52:25,  8.60s/it, gpt_loss=0.232, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  10%|▉         | 628/6434 [1:28:34<13:58:46,  8.67s/it, gpt_loss=0.232, loss_mean=0.28][A[A
+
+Train step of epoch 1:  10%|▉         | 628/6434 [1:28:43<13:58:46,  8.67s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 629/6434 [1:28:43<14:04:40,  8.73s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 629/6434 [1:28:52<14:04:40,  8.73s/it, gpt_loss=0.31, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:  10%|▉         | 630/6434 [1:28:52<14:16:47,  8.86s/it, gpt_loss=0.31, loss_mean=0.28][A[A
+
+Train step of epoch 1:  10%|▉         | 630/6434 [1:29:00<14:16:47,  8.86s/it, gpt_loss=0.233, loss_mean=0.276][A[A
+
+Train step of epoch 1:  10%|▉         | 631/6434 [1:29:00<13:44:27,  8.52s/it, gpt_loss=0.233, loss_mean=0.276][A[A
+
+Train step of epoch 1:  10%|▉         | 631/6434 [1:29:09<13:44:27,  8.52s/it, gpt_loss=0.335, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 632/6434 [1:29:09<13:55:02,  8.64s/it, gpt_loss=0.335, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|▉         | 632/6434 [1:29:17<13:55:02,  8.64s/it, gpt_loss=0.217, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 633/6434 [1:29:17<13:40:59,  8.49s/it, gpt_loss=0.217, loss_mean=0.275][A[A
+
+Train step of epoch 1:  10%|▉         | 633/6434 [1:29:25<13:40:59,  8.49s/it, gpt_loss=0.216, loss_mean=0.269][A[A
+
+Train step of epoch 1:  10%|▉         | 634/6434 [1:29:25<13:26:24,  8.34s/it, gpt_loss=0.216, loss_mean=0.269][A[A
+
+Train step of epoch 1:  10%|▉         | 634/6434 [1:29:34<13:26:24,  8.34s/it, gpt_loss=0.221, loss_mean=0.264][A[A
+
+Train step of epoch 1:  10%|▉         | 635/6434 [1:29:34<13:41:36,  8.50s/it, gpt_loss=0.221, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7070
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [4, 5, 4, 0, 1, 9, 1, 0, 5, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  10%|▉         | 635/6434 [1:29:42<13:41:36,  8.50s/it, gpt_loss=0.25, loss_mean=0.263] [A[A
+
+Train step of epoch 1:  10%|▉         | 636/6434 [1:29:42<13:24:18,  8.32s/it, gpt_loss=0.25, loss_mean=0.263][A[A
+
+Train step of epoch 1:  10%|▉         | 636/6434 [1:29:51<13:24:18,  8.32s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+
+Train step of epoch 1:  10%|▉         | 637/6434 [1:29:51<14:05:28,  8.75s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+
+Train step of epoch 1:  10%|▉         | 637/6434 [1:30:00<14:05:28,  8.75s/it, gpt_loss=0.285, loss_mean=0.264][A[A
+
+Train step of epoch 1:  10%|▉         | 638/6434 [1:30:00<13:55:42,  8.65s/it, gpt_loss=0.285, loss_mean=0.264][A[A
+
+Train step of epoch 1:  10%|▉         | 638/6434 [1:30:07<13:55:42,  8.65s/it, gpt_loss=0.406, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|▉         | 639/6434 [1:30:07<13:23:33,  8.32s/it, gpt_loss=0.406, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|▉         | 639/6434 [1:30:15<13:23:33,  8.32s/it, gpt_loss=0.268, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 640/6434 [1:30:15<12:56:08,  8.04s/it, gpt_loss=0.268, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|▉         | 640/6434 [1:30:24<12:56:08,  8.04s/it, gpt_loss=0.312, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  10%|▉         | 641/6434 [1:30:24<13:34:59,  8.44s/it, gpt_loss=0.312, loss_mean=0.28][A[A
+
+Train step of epoch 1:  10%|▉         | 641/6434 [1:30:32<13:34:59,  8.44s/it, gpt_loss=0.256, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|▉         | 642/6434 [1:30:32<13:07:28,  8.16s/it, gpt_loss=0.256, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|▉         | 642/6434 [1:30:40<13:07:28,  8.16s/it, gpt_loss=0.35, loss_mean=0.285] [A[A
+
+Train step of epoch 1:  10%|▉         | 643/6434 [1:30:40<13:19:36,  8.28s/it, gpt_loss=0.35, loss_mean=0.285][A[A
+
+Train step of epoch 1:  10%|▉         | 643/6434 [1:30:49<13:19:36,  8.28s/it, gpt_loss=0.258, loss_mean=0.282][A[A
+
+Train step of epoch 1:  10%|█         | 644/6434 [1:30:49<13:26:02,  8.35s/it, gpt_loss=0.258, loss_mean=0.282][A[A
+
+Train step of epoch 1:  10%|█         | 644/6434 [1:30:56<13:26:02,  8.35s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:  10%|█         | 645/6434 [1:30:56<13:10:04,  8.19s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+[LID Router Debug] Step: 7080
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [3, 2, 4, 5, 6, 4, 5, 1, 1, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  10%|█         | 645/6434 [1:31:06<13:10:04,  8.19s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 646/6434 [1:31:06<14:00:11,  8.71s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 646/6434 [1:31:16<14:00:11,  8.71s/it, gpt_loss=0.27, loss_mean=0.277] [A[A
+
+Train step of epoch 1:  10%|█         | 647/6434 [1:31:16<14:19:55,  8.92s/it, gpt_loss=0.27, loss_mean=0.277][A[A
+
+Train step of epoch 1:  10%|█         | 647/6434 [1:31:24<14:19:55,  8.92s/it, gpt_loss=0.249, loss_mean=0.274][A[A
+
+Train step of epoch 1:  10%|█         | 648/6434 [1:31:24<14:05:09,  8.76s/it, gpt_loss=0.249, loss_mean=0.274][A[A
+
+Train step of epoch 1:  10%|█         | 648/6434 [1:31:33<14:05:09,  8.76s/it, gpt_loss=0.319, loss_mean=0.279][A[A
+
+Train step of epoch 1:  10%|█         | 649/6434 [1:31:33<13:56:36,  8.68s/it, gpt_loss=0.319, loss_mean=0.279][A[A
+
+Train step of epoch 1:  10%|█         | 649/6434 [1:31:42<13:56:36,  8.68s/it, gpt_loss=0.337, loss_mean=0.284][A[A
+
+Train step of epoch 1:  10%|█         | 650/6434 [1:31:42<14:11:28,  8.83s/it, gpt_loss=0.337, loss_mean=0.284][A[A
+
+Train step of epoch 1:  10%|█         | 650/6434 [1:31:51<14:11:28,  8.83s/it, gpt_loss=0.276, loss_mean=0.284][A[A
+
+Train step of epoch 1:  10%|█         | 651/6434 [1:31:51<14:20:06,  8.92s/it, gpt_loss=0.276, loss_mean=0.284][A[A
+
+Train step of epoch 1:  10%|█         | 651/6434 [1:32:01<14:20:06,  8.92s/it, gpt_loss=0.226, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 652/6434 [1:32:01<14:43:35,  9.17s/it, gpt_loss=0.226, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 652/6434 [1:32:09<14:43:35,  9.17s/it, gpt_loss=0.225, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|█         | 653/6434 [1:32:09<14:14:52,  8.87s/it, gpt_loss=0.225, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|█         | 653/6434 [1:32:17<14:14:52,  8.87s/it, gpt_loss=0.21, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  10%|█         | 654/6434 [1:32:17<13:56:55,  8.69s/it, gpt_loss=0.21, loss_mean=0.266][A[A
+
+Train step of epoch 1:  10%|█         | 654/6434 [1:32:26<13:56:55,  8.69s/it, gpt_loss=0.257, loss_mean=0.265][A[A
+
+Train step of epoch 1:  10%|█         | 655/6434 [1:32:26<14:08:52,  8.81s/it, gpt_loss=0.257, loss_mean=0.265][A[A
+[LID Router Debug] Step: 7090
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [4, 3, 2, 0, 2, 5, 5, 3, 1, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  10%|█         | 655/6434 [1:32:34<14:08:52,  8.81s/it, gpt_loss=0.266, loss_mean=0.265][A[A
+
+Train step of epoch 1:  10%|█         | 656/6434 [1:32:34<13:31:29,  8.43s/it, gpt_loss=0.266, loss_mean=0.265][A[A
+
+Train step of epoch 1:  10%|█         | 656/6434 [1:32:41<13:31:29,  8.43s/it, gpt_loss=0.237, loss_mean=0.263][A[A
+
+Train step of epoch 1:  10%|█         | 657/6434 [1:32:41<13:01:10,  8.11s/it, gpt_loss=0.237, loss_mean=0.263][A[A
+
+Train step of epoch 1:  10%|█         | 657/6434 [1:32:50<13:01:10,  8.11s/it, gpt_loss=0.224, loss_mean=0.259][A[A
+
+Train step of epoch 1:  10%|█         | 658/6434 [1:32:50<13:07:14,  8.18s/it, gpt_loss=0.224, loss_mean=0.259][A[A
+
+Train step of epoch 1:  10%|█         | 658/6434 [1:32:56<13:07:14,  8.18s/it, gpt_loss=0.364, loss_mean=0.269][A[A
+
+Train step of epoch 1:  10%|█         | 659/6434 [1:32:56<12:22:52,  7.72s/it, gpt_loss=0.364, loss_mean=0.269][A[A
+
+Train step of epoch 1:  10%|█         | 659/6434 [1:33:06<12:22:52,  7.72s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:  10%|█         | 660/6434 [1:33:06<13:12:53,  8.24s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:  10%|█         | 660/6434 [1:33:15<13:12:53,  8.24s/it, gpt_loss=0.264, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  10%|█         | 661/6434 [1:33:15<13:33:59,  8.46s/it, gpt_loss=0.264, loss_mean=0.27][A[A
+
+Train step of epoch 1:  10%|█         | 661/6434 [1:33:23<13:33:59,  8.46s/it, gpt_loss=0.268, loss_mean=0.27][A[A
+
+Train step of epoch 1:  10%|█         | 662/6434 [1:33:23<13:22:22,  8.34s/it, gpt_loss=0.268, loss_mean=0.27][A[A
+
+Train step of epoch 1:  10%|█         | 662/6434 [1:33:30<13:22:22,  8.34s/it, gpt_loss=0.249, loss_mean=0.268][A[A
+
+Train step of epoch 1:  10%|█         | 663/6434 [1:33:30<12:59:35,  8.11s/it, gpt_loss=0.249, loss_mean=0.268][A[A
+
+Train step of epoch 1:  10%|█         | 663/6434 [1:33:38<12:59:35,  8.11s/it, gpt_loss=0.251, loss_mean=0.266][A[A
+
+Train step of epoch 1:  10%|█         | 664/6434 [1:33:38<12:42:33,  7.93s/it, gpt_loss=0.251, loss_mean=0.266][A[A
+
+Train step of epoch 1:  10%|█         | 664/6434 [1:33:47<12:42:33,  7.93s/it, gpt_loss=0.322, loss_mean=0.272][A[A
+
+Train step of epoch 1:  10%|█         | 665/6434 [1:33:47<13:28:49,  8.41s/it, gpt_loss=0.322, loss_mean=0.272][A[A
+[LID Router Debug] Step: 7100
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [2, 7, 2, 0, 9, 1, 1, 2, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 7, 9}
+
+
+Train step of epoch 1:  10%|█         | 665/6434 [1:33:56<13:28:49,  8.41s/it, gpt_loss=0.256, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  10%|█         | 666/6434 [1:33:56<13:24:44,  8.37s/it, gpt_loss=0.256, loss_mean=0.27][A[A
+
+Train step of epoch 1:  10%|█         | 666/6434 [1:34:04<13:24:44,  8.37s/it, gpt_loss=0.302, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|█         | 667/6434 [1:34:04<13:27:26,  8.40s/it, gpt_loss=0.302, loss_mean=0.273][A[A
+
+Train step of epoch 1:  10%|█         | 667/6434 [1:34:12<13:27:26,  8.40s/it, gpt_loss=0.424, loss_mean=0.288][A[A
+
+Train step of epoch 1:  10%|█         | 668/6434 [1:34:12<13:25:53,  8.39s/it, gpt_loss=0.424, loss_mean=0.288][A[A
+
+Train step of epoch 1:  10%|█         | 668/6434 [1:34:21<13:25:53,  8.39s/it, gpt_loss=0.275, loss_mean=0.287][A[A
+
+Train step of epoch 1:  10%|█         | 669/6434 [1:34:21<13:30:17,  8.43s/it, gpt_loss=0.275, loss_mean=0.287][A[A
+
+Train step of epoch 1:  10%|█         | 669/6434 [1:34:29<13:30:17,  8.43s/it, gpt_loss=0.227, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|█         | 670/6434 [1:34:29<13:08:26,  8.21s/it, gpt_loss=0.227, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|█         | 670/6434 [1:34:37<13:08:26,  8.21s/it, gpt_loss=0.335, loss_mean=0.286][A[A
+
+Train step of epoch 1:  10%|█         | 671/6434 [1:34:37<13:08:38,  8.21s/it, gpt_loss=0.335, loss_mean=0.286][A[A
+
+Train step of epoch 1:  10%|█         | 671/6434 [1:34:45<13:08:38,  8.21s/it, gpt_loss=0.226, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  10%|█         | 672/6434 [1:34:45<12:59:51,  8.12s/it, gpt_loss=0.226, loss_mean=0.28][A[A
+
+Train step of epoch 1:  10%|█         | 672/6434 [1:34:54<12:59:51,  8.12s/it, gpt_loss=0.297, loss_mean=0.282][A[A
+
+Train step of epoch 1:  10%|█         | 673/6434 [1:34:54<13:25:39,  8.39s/it, gpt_loss=0.297, loss_mean=0.282][A[A
+
+Train step of epoch 1:  10%|█         | 673/6434 [1:35:03<13:25:39,  8.39s/it, gpt_loss=0.238, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 674/6434 [1:35:03<13:51:23,  8.66s/it, gpt_loss=0.238, loss_mean=0.278][A[A
+
+Train step of epoch 1:  10%|█         | 674/6434 [1:35:11<13:51:23,  8.66s/it, gpt_loss=0.313, loss_mean=0.281][A[A
+
+Train step of epoch 1:  10%|█         | 675/6434 [1:35:11<13:22:09,  8.36s/it, gpt_loss=0.313, loss_mean=0.281][A[A
+[LID Router Debug] Step: 7110
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [2, 3, 4, 0, 1, 1, 3, 6, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  10%|█         | 675/6434 [1:35:19<13:22:09,  8.36s/it, gpt_loss=0.251, loss_mean=0.278][A[A
+
+Train step of epoch 1:  11%|█         | 676/6434 [1:35:19<13:17:45,  8.31s/it, gpt_loss=0.251, loss_mean=0.278][A[A
+
+Train step of epoch 1:  11%|█         | 676/6434 [1:35:27<13:17:45,  8.31s/it, gpt_loss=0.325, loss_mean=0.283][A[A
+
+Train step of epoch 1:  11%|█         | 677/6434 [1:35:27<13:15:49,  8.29s/it, gpt_loss=0.325, loss_mean=0.283][A[A
+
+Train step of epoch 1:  11%|█         | 677/6434 [1:35:36<13:15:49,  8.29s/it, gpt_loss=0.338, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 678/6434 [1:35:36<13:29:42,  8.44s/it, gpt_loss=0.338, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 678/6434 [1:35:43<13:29:42,  8.44s/it, gpt_loss=0.286, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 679/6434 [1:35:43<12:51:14,  8.04s/it, gpt_loss=0.286, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 679/6434 [1:35:52<12:51:14,  8.04s/it, gpt_loss=0.281, loss_mean=0.287][A[A
+
+Train step of epoch 1:  11%|█         | 680/6434 [1:35:52<13:20:40,  8.35s/it, gpt_loss=0.281, loss_mean=0.287][A[A
+
+Train step of epoch 1:  11%|█         | 680/6434 [1:36:01<13:20:40,  8.35s/it, gpt_loss=0.226, loss_mean=0.281][A[A
+
+Train step of epoch 1:  11%|█         | 681/6434 [1:36:01<13:45:18,  8.61s/it, gpt_loss=0.226, loss_mean=0.281][A[A
+
+Train step of epoch 1:  11%|█         | 681/6434 [1:36:10<13:45:18,  8.61s/it, gpt_loss=0.266, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  11%|█         | 682/6434 [1:36:10<13:36:36,  8.52s/it, gpt_loss=0.266, loss_mean=0.28][A[A
+
+Train step of epoch 1:  11%|█         | 682/6434 [1:36:18<13:36:36,  8.52s/it, gpt_loss=0.183, loss_mean=0.27][A[A
+
+Train step of epoch 1:  11%|█         | 683/6434 [1:36:18<13:26:17,  8.41s/it, gpt_loss=0.183, loss_mean=0.27][A[A
+
+Train step of epoch 1:  11%|█         | 683/6434 [1:36:26<13:26:17,  8.41s/it, gpt_loss=0.242, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 684/6434 [1:36:26<13:17:49,  8.33s/it, gpt_loss=0.242, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 684/6434 [1:36:35<13:17:49,  8.33s/it, gpt_loss=0.258, loss_mean=0.266][A[A
+
+Train step of epoch 1:  11%|█         | 685/6434 [1:36:35<13:33:59,  8.50s/it, gpt_loss=0.258, loss_mean=0.266][A[A
+[LID Router Debug] Step: 7120
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [4, 4, 2, 3, 1, 3, 5, 1, 4, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  11%|█         | 685/6434 [1:36:45<13:33:59,  8.50s/it, gpt_loss=0.245, loss_mean=0.264][A[A
+
+Train step of epoch 1:  11%|█         | 686/6434 [1:36:45<14:08:10,  8.85s/it, gpt_loss=0.245, loss_mean=0.264][A[A
+
+Train step of epoch 1:  11%|█         | 686/6434 [1:36:52<14:08:10,  8.85s/it, gpt_loss=0.289, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 687/6434 [1:36:52<13:28:33,  8.44s/it, gpt_loss=0.289, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 687/6434 [1:37:01<13:28:33,  8.44s/it, gpt_loss=0.284, loss_mean=0.268][A[A
+
+Train step of epoch 1:  11%|█         | 688/6434 [1:37:01<13:41:30,  8.58s/it, gpt_loss=0.284, loss_mean=0.268][A[A
+
+Train step of epoch 1:  11%|█         | 688/6434 [1:37:09<13:41:30,  8.58s/it, gpt_loss=0.252, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 689/6434 [1:37:09<13:23:23,  8.39s/it, gpt_loss=0.252, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 689/6434 [1:37:16<13:23:23,  8.39s/it, gpt_loss=0.312, loss_mean=0.271][A[A
+
+Train step of epoch 1:  11%|█         | 690/6434 [1:37:16<12:53:57,  8.08s/it, gpt_loss=0.312, loss_mean=0.271][A[A
+
+Train step of epoch 1:  11%|█         | 690/6434 [1:37:24<12:53:57,  8.08s/it, gpt_loss=0.297, loss_mean=0.274][A[A
+
+Train step of epoch 1:  11%|█         | 691/6434 [1:37:24<12:34:53,  7.89s/it, gpt_loss=0.297, loss_mean=0.274][A[A
+
+Train step of epoch 1:  11%|█         | 691/6434 [1:37:32<12:34:53,  7.89s/it, gpt_loss=0.338, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  11%|█         | 692/6434 [1:37:32<12:53:55,  8.09s/it, gpt_loss=0.338, loss_mean=0.28][A[A
+
+Train step of epoch 1:  11%|█         | 692/6434 [1:37:40<12:53:55,  8.09s/it, gpt_loss=0.363, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 693/6434 [1:37:40<12:44:06,  7.99s/it, gpt_loss=0.363, loss_mean=0.288][A[A
+
+Train step of epoch 1:  11%|█         | 693/6434 [1:37:49<12:44:06,  7.99s/it, gpt_loss=0.194, loss_mean=0.279][A[A
+
+Train step of epoch 1:  11%|█         | 694/6434 [1:37:49<13:16:04,  8.32s/it, gpt_loss=0.194, loss_mean=0.279][A[A
+
+Train step of epoch 1:  11%|█         | 694/6434 [1:37:58<13:16:04,  8.32s/it, gpt_loss=0.252, loss_mean=0.276][A[A
+
+Train step of epoch 1:  11%|█         | 695/6434 [1:37:58<13:31:18,  8.48s/it, gpt_loss=0.252, loss_mean=0.276][A[A
+[LID Router Debug] Step: 7130
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [4, 5, 3, 1, 1, 2, 0, 6, 0, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  11%|█         | 695/6434 [1:38:06<13:31:18,  8.48s/it, gpt_loss=0.337, loss_mean=0.282][A[A
+
+Train step of epoch 1:  11%|█         | 696/6434 [1:38:06<13:30:57,  8.48s/it, gpt_loss=0.337, loss_mean=0.282][A[A
+
+Train step of epoch 1:  11%|█         | 696/6434 [1:38:15<13:30:57,  8.48s/it, gpt_loss=0.281, loss_mean=0.282][A[A
+
+Train step of epoch 1:  11%|█         | 697/6434 [1:38:15<13:22:18,  8.39s/it, gpt_loss=0.281, loss_mean=0.282][A[A
+
+Train step of epoch 1:  11%|█         | 697/6434 [1:38:23<13:22:18,  8.39s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+
+Train step of epoch 1:  11%|█         | 698/6434 [1:38:23<13:31:54,  8.49s/it, gpt_loss=0.254, loss_mean=0.279][A[A
+
+Train step of epoch 1:  11%|█         | 698/6434 [1:38:31<13:31:54,  8.49s/it, gpt_loss=0.208, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 699/6434 [1:38:31<13:15:17,  8.32s/it, gpt_loss=0.208, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 699/6434 [1:38:39<13:15:17,  8.32s/it, gpt_loss=0.242, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 700/6434 [1:38:39<12:47:25,  8.03s/it, gpt_loss=0.242, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 700/6434 [1:38:48<12:47:25,  8.03s/it, gpt_loss=0.269, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 701/6434 [1:38:48<13:16:11,  8.33s/it, gpt_loss=0.269, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 701/6434 [1:38:55<13:16:11,  8.33s/it, gpt_loss=0.298, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 702/6434 [1:38:55<12:45:09,  8.01s/it, gpt_loss=0.298, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 702/6434 [1:39:02<12:45:09,  8.01s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 703/6434 [1:39:02<12:29:35,  7.85s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  11%|█         | 703/6434 [1:39:10<12:29:35,  7.85s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 704/6434 [1:39:10<12:29:18,  7.85s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+
+Train step of epoch 1:  11%|█         | 704/6434 [1:39:18<12:29:18,  7.85s/it, gpt_loss=0.206, loss_mean=0.263][A[A
+
+Train step of epoch 1:  11%|█         | 705/6434 [1:39:18<12:27:54,  7.83s/it, gpt_loss=0.206, loss_mean=0.263][A[A
+[LID Router Debug] Step: 7140
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [4, 6, 5, 1, 9, 5, 3, 9, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  11%|█         | 705/6434 [1:39:27<12:27:54,  7.83s/it, gpt_loss=0.218, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 706/6434 [1:39:27<13:12:58,  8.31s/it, gpt_loss=0.218, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 706/6434 [1:39:35<13:12:58,  8.31s/it, gpt_loss=0.253, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█         | 707/6434 [1:39:35<13:04:08,  8.22s/it, gpt_loss=0.253, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█         | 707/6434 [1:39:44<13:04:08,  8.22s/it, gpt_loss=0.241, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 708/6434 [1:39:44<13:21:10,  8.40s/it, gpt_loss=0.241, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 708/6434 [1:39:53<13:21:10,  8.40s/it, gpt_loss=0.299, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█         | 709/6434 [1:39:53<13:28:10,  8.47s/it, gpt_loss=0.299, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█         | 709/6434 [1:40:03<13:28:10,  8.47s/it, gpt_loss=0.271, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 710/6434 [1:40:03<14:21:05,  9.03s/it, gpt_loss=0.271, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 710/6434 [1:40:12<14:21:05,  9.03s/it, gpt_loss=0.311, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 711/6434 [1:40:12<14:05:29,  8.86s/it, gpt_loss=0.311, loss_mean=0.267][A[A
+
+Train step of epoch 1:  11%|█         | 711/6434 [1:40:19<14:05:29,  8.86s/it, gpt_loss=0.203, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  11%|█         | 712/6434 [1:40:19<13:30:18,  8.50s/it, gpt_loss=0.203, loss_mean=0.26][A[A
+
+Train step of epoch 1:  11%|█         | 712/6434 [1:40:28<13:30:18,  8.50s/it, gpt_loss=0.242, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█         | 713/6434 [1:40:28<13:29:29,  8.49s/it, gpt_loss=0.242, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█         | 713/6434 [1:40:37<13:29:29,  8.49s/it, gpt_loss=0.203, loss_mean=0.253][A[A
+
+Train step of epoch 1:  11%|█         | 714/6434 [1:40:37<13:56:42,  8.78s/it, gpt_loss=0.203, loss_mean=0.253][A[A
+
+Train step of epoch 1:  11%|█         | 714/6434 [1:40:45<13:56:42,  8.78s/it, gpt_loss=0.278, loss_mean=0.255][A[A
+
+Train step of epoch 1:  11%|█         | 715/6434 [1:40:45<13:25:18,  8.45s/it, gpt_loss=0.278, loss_mean=0.255][A[A
+[LID Router Debug] Step: 7150
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [9, 5, 1, 5, 3, 9, 4, 5, 2, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  11%|█         | 715/6434 [1:40:53<13:25:18,  8.45s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 716/6434 [1:40:53<13:28:06,  8.48s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 716/6434 [1:41:01<13:28:06,  8.48s/it, gpt_loss=0.293, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 717/6434 [1:41:01<12:52:47,  8.11s/it, gpt_loss=0.293, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 717/6434 [1:41:09<12:52:47,  8.11s/it, gpt_loss=0.261, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 718/6434 [1:41:09<13:07:54,  8.27s/it, gpt_loss=0.261, loss_mean=0.262][A[A
+
+Train step of epoch 1:  11%|█         | 718/6434 [1:41:19<13:07:54,  8.27s/it, gpt_loss=0.275, loss_mean=0.263][A[A
+
+Train step of epoch 1:  11%|█         | 719/6434 [1:41:19<13:33:37,  8.54s/it, gpt_loss=0.275, loss_mean=0.263][A[A
+
+Train step of epoch 1:  11%|█         | 719/6434 [1:41:27<13:33:37,  8.54s/it, gpt_loss=0.219, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 720/6434 [1:41:27<13:37:44,  8.59s/it, gpt_loss=0.219, loss_mean=0.259][A[A
+
+Train step of epoch 1:  11%|█         | 720/6434 [1:41:36<13:37:44,  8.59s/it, gpt_loss=0.228, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 721/6434 [1:41:36<13:46:03,  8.68s/it, gpt_loss=0.228, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 721/6434 [1:41:44<13:46:03,  8.68s/it, gpt_loss=0.249, loss_mean=0.255][A[A
+
+Train step of epoch 1:  11%|█         | 722/6434 [1:41:44<13:28:21,  8.49s/it, gpt_loss=0.249, loss_mean=0.255][A[A
+
+Train step of epoch 1:  11%|█         | 722/6434 [1:41:53<13:28:21,  8.49s/it, gpt_loss=0.261, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 723/6434 [1:41:53<13:39:20,  8.61s/it, gpt_loss=0.261, loss_mean=0.256][A[A
+
+Train step of epoch 1:  11%|█         | 723/6434 [1:42:03<13:39:20,  8.61s/it, gpt_loss=0.268, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 724/6434 [1:42:03<14:26:14,  9.10s/it, gpt_loss=0.268, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 724/6434 [1:42:11<14:26:14,  9.10s/it, gpt_loss=0.261, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 725/6434 [1:42:11<13:53:50,  8.76s/it, gpt_loss=0.261, loss_mean=0.257][A[A
+[LID Router Debug] Step: 7160
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [1, 6, 4, 0, 0, 9, 1, 1, 0, 1]
+Active Experts in Batch: {0, 1, 4, 6, 9}
+
+
+Train step of epoch 1:  11%|█▏        | 725/6434 [1:42:19<13:53:50,  8.76s/it, gpt_loss=0.195, loss_mean=0.251][A[A
+
+Train step of epoch 1:  11%|█▏        | 726/6434 [1:42:19<13:18:32,  8.39s/it, gpt_loss=0.195, loss_mean=0.251][A[A
+
+Train step of epoch 1:  11%|█▏        | 726/6434 [1:42:29<13:18:32,  8.39s/it, gpt_loss=0.252, loss_mean=0.251][A[A
+
+Train step of epoch 1:  11%|█▏        | 727/6434 [1:42:29<14:05:01,  8.88s/it, gpt_loss=0.252, loss_mean=0.251][A[A
+
+Train step of epoch 1:  11%|█▏        | 727/6434 [1:42:39<14:05:01,  8.88s/it, gpt_loss=0.314, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 728/6434 [1:42:39<14:30:51,  9.16s/it, gpt_loss=0.314, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 728/6434 [1:42:47<14:30:51,  9.16s/it, gpt_loss=0.253, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 729/6434 [1:42:47<14:09:46,  8.94s/it, gpt_loss=0.253, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 729/6434 [1:42:56<14:09:46,  8.94s/it, gpt_loss=0.262, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 730/6434 [1:42:56<14:02:24,  8.86s/it, gpt_loss=0.262, loss_mean=0.257][A[A
+
+Train step of epoch 1:  11%|█▏        | 730/6434 [1:43:04<14:02:24,  8.86s/it, gpt_loss=0.261, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█▏        | 731/6434 [1:43:04<13:53:50,  8.77s/it, gpt_loss=0.261, loss_mean=0.258][A[A
+
+Train step of epoch 1:  11%|█▏        | 731/6434 [1:43:12<13:53:50,  8.77s/it, gpt_loss=0.286, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█▏        | 732/6434 [1:43:12<13:22:49,  8.45s/it, gpt_loss=0.286, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█▏        | 732/6434 [1:43:20<13:22:49,  8.45s/it, gpt_loss=0.262, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█▏        | 733/6434 [1:43:20<13:21:19,  8.43s/it, gpt_loss=0.262, loss_mean=0.261][A[A
+
+Train step of epoch 1:  11%|█▏        | 733/6434 [1:43:29<13:21:19,  8.43s/it, gpt_loss=0.312, loss_mean=0.266][A[A
+
+Train step of epoch 1:  11%|█▏        | 734/6434 [1:43:29<13:34:45,  8.58s/it, gpt_loss=0.312, loss_mean=0.266][A[A
+
+Train step of epoch 1:  11%|█▏        | 734/6434 [1:43:37<13:34:45,  8.58s/it, gpt_loss=0.361, loss_mean=0.275][A[A
+
+Train step of epoch 1:  11%|█▏        | 735/6434 [1:43:37<13:04:39,  8.26s/it, gpt_loss=0.361, loss_mean=0.275][A[A
+[LID Router Debug] Step: 7170
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [1, 4, 0, 1, 3, 1, 5, 4, 9, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  11%|█▏        | 735/6434 [1:43:45<13:04:39,  8.26s/it, gpt_loss=0.248, loss_mean=0.273][A[A
+
+Train step of epoch 1:  11%|█▏        | 736/6434 [1:43:45<12:51:39,  8.13s/it, gpt_loss=0.248, loss_mean=0.273][A[A
+
+Train step of epoch 1:  11%|█▏        | 736/6434 [1:43:54<12:51:39,  8.13s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:  11%|█▏        | 737/6434 [1:43:54<13:23:14,  8.46s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:  11%|█▏        | 737/6434 [1:44:04<13:23:14,  8.46s/it, gpt_loss=0.341, loss_mean=0.278][A[A
+
+Train step of epoch 1:  11%|█▏        | 738/6434 [1:44:04<14:03:27,  8.88s/it, gpt_loss=0.341, loss_mean=0.278][A[A
+
+Train step of epoch 1:  11%|█▏        | 738/6434 [1:44:12<14:03:27,  8.88s/it, gpt_loss=0.314, loss_mean=0.281][A[A
+
+Train step of epoch 1:  11%|█▏        | 739/6434 [1:44:12<13:55:36,  8.80s/it, gpt_loss=0.314, loss_mean=0.281][A[A
+
+Train step of epoch 1:  11%|█▏        | 739/6434 [1:44:21<13:55:36,  8.80s/it, gpt_loss=0.312, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 740/6434 [1:44:21<13:44:43,  8.69s/it, gpt_loss=0.312, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 740/6434 [1:44:29<13:44:43,  8.69s/it, gpt_loss=0.265, loss_mean=0.282][A[A
+
+Train step of epoch 1:  12%|█▏        | 741/6434 [1:44:29<13:29:02,  8.53s/it, gpt_loss=0.265, loss_mean=0.282][A[A
+
+Train step of epoch 1:  12%|█▏        | 741/6434 [1:44:36<13:29:02,  8.53s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:  12%|█▏        | 742/6434 [1:44:36<12:59:25,  8.22s/it, gpt_loss=0.265, loss_mean=0.281][A[A
+
+Train step of epoch 1:  12%|█▏        | 742/6434 [1:44:46<12:59:25,  8.22s/it, gpt_loss=0.328, loss_mean=0.285][A[A
+
+Train step of epoch 1:  12%|█▏        | 743/6434 [1:44:46<13:45:52,  8.71s/it, gpt_loss=0.328, loss_mean=0.285][A[A
+
+Train step of epoch 1:  12%|█▏        | 743/6434 [1:44:54<13:45:52,  8.71s/it, gpt_loss=0.247, loss_mean=0.282][A[A
+
+Train step of epoch 1:  12%|█▏        | 744/6434 [1:44:54<13:17:03,  8.40s/it, gpt_loss=0.247, loss_mean=0.282][A[A
+
+Train step of epoch 1:  12%|█▏        | 744/6434 [1:45:02<13:17:03,  8.40s/it, gpt_loss=0.308, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 745/6434 [1:45:02<12:56:42,  8.19s/it, gpt_loss=0.308, loss_mean=0.284][A[A
+[LID Router Debug] Step: 7180
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [4, 0, 5, 1, 2, 5, 3, 0, 4, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  12%|█▏        | 745/6434 [1:45:09<12:56:42,  8.19s/it, gpt_loss=0.243, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  12%|█▏        | 746/6434 [1:45:09<12:27:07,  7.88s/it, gpt_loss=0.243, loss_mean=0.28][A[A
+
+Train step of epoch 1:  12%|█▏        | 746/6434 [1:45:17<12:27:07,  7.88s/it, gpt_loss=0.316, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 747/6434 [1:45:17<12:23:00,  7.84s/it, gpt_loss=0.316, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 747/6434 [1:45:25<12:23:00,  7.84s/it, gpt_loss=0.385, loss_mean=0.294][A[A
+
+Train step of epoch 1:  12%|█▏        | 748/6434 [1:45:25<12:45:18,  8.08s/it, gpt_loss=0.385, loss_mean=0.294][A[A
+
+Train step of epoch 1:  12%|█▏        | 748/6434 [1:45:33<12:45:18,  8.08s/it, gpt_loss=0.199, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 749/6434 [1:45:33<12:34:45,  7.97s/it, gpt_loss=0.199, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 749/6434 [1:45:41<12:34:45,  7.97s/it, gpt_loss=0.242, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  12%|█▏        | 750/6434 [1:45:41<12:43:55,  8.06s/it, gpt_loss=0.242, loss_mean=0.28][A[A
+
+Train step of epoch 1:  12%|█▏        | 750/6434 [1:45:50<12:43:55,  8.06s/it, gpt_loss=0.202, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 751/6434 [1:45:50<13:00:03,  8.24s/it, gpt_loss=0.202, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 751/6434 [1:45:59<13:00:03,  8.24s/it, gpt_loss=0.245, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  12%|█▏        | 752/6434 [1:45:59<13:38:28,  8.64s/it, gpt_loss=0.245, loss_mean=0.27][A[A
+
+Train step of epoch 1:  12%|█▏        | 752/6434 [1:46:07<13:38:28,  8.64s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 753/6434 [1:46:07<13:17:04,  8.42s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 753/6434 [1:46:15<13:17:04,  8.42s/it, gpt_loss=0.338, loss_mean=0.275][A[A
+
+Train step of epoch 1:  12%|█▏        | 754/6434 [1:46:15<12:55:38,  8.19s/it, gpt_loss=0.338, loss_mean=0.275][A[A
+
+Train step of epoch 1:  12%|█▏        | 754/6434 [1:46:24<12:55:38,  8.19s/it, gpt_loss=0.222, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  12%|█▏        | 755/6434 [1:46:24<13:04:31,  8.29s/it, gpt_loss=0.222, loss_mean=0.27][A[A
+[LID Router Debug] Step: 7190
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [9, 0, 3, 1, 9, 2, 0, 5, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  12%|█▏        | 755/6434 [1:46:31<13:04:31,  8.29s/it, gpt_loss=0.28, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 756/6434 [1:46:31<12:52:15,  8.16s/it, gpt_loss=0.28, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 756/6434 [1:46:39<12:52:15,  8.16s/it, gpt_loss=0.354, loss_mean=0.279][A[A
+
+Train step of epoch 1:  12%|█▏        | 757/6434 [1:46:39<12:39:04,  8.02s/it, gpt_loss=0.354, loss_mean=0.279][A[A
+
+Train step of epoch 1:  12%|█▏        | 757/6434 [1:46:48<12:39:04,  8.02s/it, gpt_loss=0.255, loss_mean=0.277][A[A
+
+Train step of epoch 1:  12%|█▏        | 758/6434 [1:46:48<13:04:25,  8.29s/it, gpt_loss=0.255, loss_mean=0.277][A[A
+
+Train step of epoch 1:  12%|█▏        | 758/6434 [1:46:58<13:04:25,  8.29s/it, gpt_loss=0.343, loss_mean=0.283][A[A
+
+Train step of epoch 1:  12%|█▏        | 759/6434 [1:46:58<13:39:09,  8.66s/it, gpt_loss=0.343, loss_mean=0.283][A[A
+
+Train step of epoch 1:  12%|█▏        | 759/6434 [1:47:06<13:39:09,  8.66s/it, gpt_loss=0.336, loss_mean=0.288][A[A
+
+Train step of epoch 1:  12%|█▏        | 760/6434 [1:47:06<13:40:34,  8.68s/it, gpt_loss=0.336, loss_mean=0.288][A[A
+
+Train step of epoch 1:  12%|█▏        | 760/6434 [1:47:15<13:40:34,  8.68s/it, gpt_loss=0.289, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 761/6434 [1:47:15<13:41:41,  8.69s/it, gpt_loss=0.289, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 761/6434 [1:47:23<13:41:41,  8.69s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 762/6434 [1:47:23<13:13:48,  8.40s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 762/6434 [1:47:31<13:13:48,  8.40s/it, gpt_loss=0.282, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 763/6434 [1:47:31<13:03:57,  8.29s/it, gpt_loss=0.282, loss_mean=0.289][A[A
+
+Train step of epoch 1:  12%|█▏        | 763/6434 [1:47:40<13:03:57,  8.29s/it, gpt_loss=0.24, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  12%|█▏        | 764/6434 [1:47:40<13:22:40,  8.49s/it, gpt_loss=0.24, loss_mean=0.284][A[A
+
+Train step of epoch 1:  12%|█▏        | 764/6434 [1:47:47<13:22:40,  8.49s/it, gpt_loss=0.304, loss_mean=0.286][A[A
+
+Train step of epoch 1:  12%|█▏        | 765/6434 [1:47:47<13:01:19,  8.27s/it, gpt_loss=0.304, loss_mean=0.286][A[A
+[LID Router Debug] Step: 7200
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [2, 2, 4, 4, 9, 2, 2, 4, 10, 2]
+Active Experts in Batch: {9, 2, 10, 4}
+[2026-02-07 08:49:42,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=0, lr=[1.406776695545077e-05, 1.406776695545077e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 08:49:42,932] [INFO] [timer.py:260:stop] epoch=0/micro_step=7200/global_step=3600, RunningAvgSamplesPerSec=4.746426655169056, CurrSamplesPerSec=4.951074207256965, MemAllocated=12.91GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  12%|█▏        | 765/6434 [1:47:56<13:01:19,  8.27s/it, gpt_loss=0.229, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  12%|█▏        | 766/6434 [1:47:56<13:06:06,  8.32s/it, gpt_loss=0.229, loss_mean=0.28][A[A
+
+Train step of epoch 1:  12%|█▏        | 766/6434 [1:48:04<13:06:06,  8.32s/it, gpt_loss=0.277, loss_mean=0.28][A[A
+
+Train step of epoch 1:  12%|█▏        | 767/6434 [1:48:04<13:03:41,  8.30s/it, gpt_loss=0.277, loss_mean=0.28][A[A
+
+Train step of epoch 1:  12%|█▏        | 767/6434 [1:48:14<13:03:41,  8.30s/it, gpt_loss=0.39, loss_mean=0.291][A[A
+
+Train step of epoch 1:  12%|█▏        | 768/6434 [1:48:14<13:57:39,  8.87s/it, gpt_loss=0.39, loss_mean=0.291][A[A
+
+Train step of epoch 1:  12%|█▏        | 768/6434 [1:48:22<13:57:39,  8.87s/it, gpt_loss=0.299, loss_mean=0.292][A[A
+
+Train step of epoch 1:  12%|█▏        | 769/6434 [1:48:22<13:10:59,  8.38s/it, gpt_loss=0.299, loss_mean=0.292][A[A
+
+Train step of epoch 1:  12%|█▏        | 769/6434 [1:48:30<13:10:59,  8.38s/it, gpt_loss=0.228, loss_mean=0.285][A[A
+
+Train step of epoch 1:  12%|█▏        | 770/6434 [1:48:30<13:12:21,  8.39s/it, gpt_loss=0.228, loss_mean=0.285][A[A
+
+Train step of epoch 1:  12%|█▏        | 770/6434 [1:48:39<13:12:21,  8.39s/it, gpt_loss=0.219, loss_mean=0.279][A[A
+
+Train step of epoch 1:  12%|█▏        | 771/6434 [1:48:39<13:24:15,  8.52s/it, gpt_loss=0.219, loss_mean=0.279][A[A
+
+Train step of epoch 1:  12%|█▏        | 771/6434 [1:48:47<13:24:15,  8.52s/it, gpt_loss=0.216, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 772/6434 [1:48:47<13:18:45,  8.46s/it, gpt_loss=0.216, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 772/6434 [1:48:56<13:18:45,  8.46s/it, gpt_loss=0.225, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 773/6434 [1:48:56<13:27:26,  8.56s/it, gpt_loss=0.225, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 773/6434 [1:49:05<13:27:26,  8.56s/it, gpt_loss=0.198, loss_mean=0.261][A[A
+
+Train step of epoch 1:  12%|█▏        | 774/6434 [1:49:05<13:31:17,  8.60s/it, gpt_loss=0.198, loss_mean=0.261][A[A
+
+Train step of epoch 1:  12%|█▏        | 774/6434 [1:49:13<13:31:17,  8.60s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+
+Train step of epoch 1:  12%|█▏        | 775/6434 [1:49:13<13:14:46,  8.43s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+[LID Router Debug] Step: 7210
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 9, 0, 9, 1, 5, 3, 2, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  12%|█▏        | 775/6434 [1:49:21<13:14:46,  8.43s/it, gpt_loss=0.301, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 776/6434 [1:49:21<13:20:57,  8.49s/it, gpt_loss=0.301, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 776/6434 [1:49:30<13:20:57,  8.49s/it, gpt_loss=0.254, loss_mean=0.264][A[A
+
+Train step of epoch 1:  12%|█▏        | 777/6434 [1:49:30<13:15:15,  8.43s/it, gpt_loss=0.254, loss_mean=0.264][A[A
+
+Train step of epoch 1:  12%|█▏        | 777/6434 [1:49:37<13:15:15,  8.43s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  12%|█▏        | 778/6434 [1:49:37<12:39:37,  8.06s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  12%|█▏        | 778/6434 [1:49:44<12:39:37,  8.06s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 779/6434 [1:49:44<12:26:39,  7.92s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 779/6434 [1:49:54<12:26:39,  7.92s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  12%|█▏        | 780/6434 [1:49:54<13:11:13,  8.40s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  12%|█▏        | 780/6434 [1:50:01<13:11:13,  8.40s/it, gpt_loss=0.299, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 781/6434 [1:50:01<12:36:11,  8.03s/it, gpt_loss=0.299, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 781/6434 [1:50:10<12:36:11,  8.03s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  12%|█▏        | 782/6434 [1:50:10<12:56:08,  8.24s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  12%|█▏        | 782/6434 [1:50:17<12:56:08,  8.24s/it, gpt_loss=0.349, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 783/6434 [1:50:17<12:26:03,  7.92s/it, gpt_loss=0.349, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 783/6434 [1:50:25<12:26:03,  7.92s/it, gpt_loss=0.302, loss_mean=0.274][A[A
+
+Train step of epoch 1:  12%|█▏        | 784/6434 [1:50:25<12:33:26,  8.00s/it, gpt_loss=0.302, loss_mean=0.274][A[A
+
+Train step of epoch 1:  12%|█▏        | 784/6434 [1:50:34<12:33:26,  8.00s/it, gpt_loss=0.262, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 785/6434 [1:50:34<13:02:10,  8.31s/it, gpt_loss=0.262, loss_mean=0.273][A[A
+[LID Router Debug] Step: 7220
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [9, 9, 0, 2, 5, 0, 4, 2, 3, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  12%|█▏        | 785/6434 [1:50:43<13:02:10,  8.31s/it, gpt_loss=0.373, loss_mean=0.283][A[A
+
+Train step of epoch 1:  12%|█▏        | 786/6434 [1:50:43<13:31:20,  8.62s/it, gpt_loss=0.373, loss_mean=0.283][A[A
+
+Train step of epoch 1:  12%|█▏        | 786/6434 [1:50:53<13:31:20,  8.62s/it, gpt_loss=0.183, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 787/6434 [1:50:53<13:43:51,  8.75s/it, gpt_loss=0.183, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 787/6434 [1:51:02<13:43:51,  8.75s/it, gpt_loss=0.274, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 788/6434 [1:51:02<13:53:21,  8.86s/it, gpt_loss=0.274, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 788/6434 [1:51:11<13:53:21,  8.86s/it, gpt_loss=0.242, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  12%|█▏        | 789/6434 [1:51:11<14:01:27,  8.94s/it, gpt_loss=0.242, loss_mean=0.27][A[A
+
+Train step of epoch 1:  12%|█▏        | 789/6434 [1:51:18<14:01:27,  8.94s/it, gpt_loss=0.271, loss_mean=0.27][A[A
+
+Train step of epoch 1:  12%|█▏        | 790/6434 [1:51:18<13:16:24,  8.47s/it, gpt_loss=0.271, loss_mean=0.27][A[A
+
+Train step of epoch 1:  12%|█▏        | 790/6434 [1:51:27<13:16:24,  8.47s/it, gpt_loss=0.292, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 791/6434 [1:51:27<13:21:31,  8.52s/it, gpt_loss=0.292, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 791/6434 [1:51:35<13:21:31,  8.52s/it, gpt_loss=0.258, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 792/6434 [1:51:35<13:11:55,  8.42s/it, gpt_loss=0.258, loss_mean=0.271][A[A
+
+Train step of epoch 1:  12%|█▏        | 792/6434 [1:51:43<13:11:55,  8.42s/it, gpt_loss=0.288, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 793/6434 [1:51:43<12:53:24,  8.23s/it, gpt_loss=0.288, loss_mean=0.273][A[A
+
+Train step of epoch 1:  12%|█▏        | 793/6434 [1:51:51<12:53:24,  8.23s/it, gpt_loss=0.235, loss_mean=0.269][A[A
+
+Train step of epoch 1:  12%|█▏        | 794/6434 [1:51:51<12:41:36,  8.10s/it, gpt_loss=0.235, loss_mean=0.269][A[A
+
+Train step of epoch 1:  12%|█▏        | 794/6434 [1:51:58<12:41:36,  8.10s/it, gpt_loss=0.223, loss_mean=0.264][A[A
+
+Train step of epoch 1:  12%|█▏        | 795/6434 [1:51:58<12:34:10,  8.02s/it, gpt_loss=0.223, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7230
+Batch Size: 10
+Audio Batch Size: 130
+LID Assignments: [9, 4, 0, 2, 9, 4, 3, 2, 6, 3]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  12%|█▏        | 795/6434 [1:52:07<12:34:10,  8.02s/it, gpt_loss=0.269, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 796/6434 [1:52:07<12:41:28,  8.10s/it, gpt_loss=0.269, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 796/6434 [1:52:15<12:41:28,  8.10s/it, gpt_loss=0.291, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 797/6434 [1:52:15<13:00:45,  8.31s/it, gpt_loss=0.291, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 797/6434 [1:52:24<13:00:45,  8.31s/it, gpt_loss=0.274, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 798/6434 [1:52:24<13:08:10,  8.39s/it, gpt_loss=0.274, loss_mean=0.268][A[A
+
+Train step of epoch 1:  12%|█▏        | 798/6434 [1:52:32<13:08:10,  8.39s/it, gpt_loss=0.263, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 799/6434 [1:52:32<12:51:58,  8.22s/it, gpt_loss=0.263, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 799/6434 [1:52:41<12:51:58,  8.22s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 800/6434 [1:52:41<13:09:17,  8.41s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  12%|█▏        | 800/6434 [1:52:49<13:09:17,  8.41s/it, gpt_loss=0.278, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 801/6434 [1:52:49<13:02:34,  8.34s/it, gpt_loss=0.278, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 801/6434 [1:52:58<13:02:34,  8.34s/it, gpt_loss=0.266, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 802/6434 [1:52:58<13:13:51,  8.46s/it, gpt_loss=0.266, loss_mean=0.267][A[A
+
+Train step of epoch 1:  12%|█▏        | 802/6434 [1:53:07<13:13:51,  8.46s/it, gpt_loss=0.324, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 803/6434 [1:53:07<13:29:38,  8.63s/it, gpt_loss=0.324, loss_mean=0.272][A[A
+
+Train step of epoch 1:  12%|█▏        | 803/6434 [1:53:14<13:29:38,  8.63s/it, gpt_loss=0.252, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  12%|█▏        | 804/6434 [1:53:14<12:51:51,  8.23s/it, gpt_loss=0.252, loss_mean=0.27][A[A
+
+Train step of epoch 1:  12%|█▏        | 804/6434 [1:53:22<12:51:51,  8.23s/it, gpt_loss=0.297, loss_mean=0.273][A[A
+
+Train step of epoch 1:  13%|█▎        | 805/6434 [1:53:22<12:43:06,  8.13s/it, gpt_loss=0.297, loss_mean=0.273][A[A
+[LID Router Debug] Step: 7240
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [1, 1, 1, 7, 1, 9, 3, 3, 5, 6]
+Active Experts in Batch: {1, 3, 5, 6, 7, 9}
+
+
+Train step of epoch 1:  13%|█▎        | 805/6434 [1:53:30<12:43:06,  8.13s/it, gpt_loss=0.245, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  13%|█▎        | 806/6434 [1:53:30<12:33:29,  8.03s/it, gpt_loss=0.245, loss_mean=0.27][A[A
+
+Train step of epoch 1:  13%|█▎        | 806/6434 [1:53:39<12:33:29,  8.03s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  13%|█▎        | 807/6434 [1:53:39<12:56:21,  8.28s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  13%|█▎        | 807/6434 [1:53:47<12:56:21,  8.28s/it, gpt_loss=0.365, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 808/6434 [1:53:47<13:05:58,  8.38s/it, gpt_loss=0.365, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 808/6434 [1:53:55<13:05:58,  8.38s/it, gpt_loss=0.271, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 809/6434 [1:53:55<12:54:27,  8.26s/it, gpt_loss=0.271, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 809/6434 [1:54:02<12:54:27,  8.26s/it, gpt_loss=0.34, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  13%|█▎        | 810/6434 [1:54:02<12:05:49,  7.74s/it, gpt_loss=0.34, loss_mean=0.284][A[A
+
+Train step of epoch 1:  13%|█▎        | 810/6434 [1:54:11<12:05:49,  7.74s/it, gpt_loss=0.303, loss_mean=0.286][A[A
+
+Train step of epoch 1:  13%|█▎        | 811/6434 [1:54:11<12:40:12,  8.11s/it, gpt_loss=0.303, loss_mean=0.286][A[A
+
+Train step of epoch 1:  13%|█▎        | 811/6434 [1:54:21<12:40:12,  8.11s/it, gpt_loss=0.194, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 812/6434 [1:54:21<13:48:43,  8.84s/it, gpt_loss=0.194, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 812/6434 [1:54:31<13:48:43,  8.84s/it, gpt_loss=0.312, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  13%|█▎        | 813/6434 [1:54:31<14:24:21,  9.23s/it, gpt_loss=0.312, loss_mean=0.28][A[A
+
+Train step of epoch 1:  13%|█▎        | 813/6434 [1:54:40<14:24:21,  9.23s/it, gpt_loss=0.234, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 814/6434 [1:54:40<14:22:50,  9.21s/it, gpt_loss=0.234, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 814/6434 [1:54:49<14:22:50,  9.21s/it, gpt_loss=0.325, loss_mean=0.281][A[A
+
+Train step of epoch 1:  13%|█▎        | 815/6434 [1:54:49<14:02:54,  9.00s/it, gpt_loss=0.325, loss_mean=0.281][A[A
+[LID Router Debug] Step: 7250
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [5, 5, 3, 9, 3, 3, 5, 9, 9, 0]
+Active Experts in Batch: {0, 9, 3, 5}
+
+
+Train step of epoch 1:  13%|█▎        | 815/6434 [1:54:58<14:02:54,  9.00s/it, gpt_loss=0.327, loss_mean=0.285][A[A
+
+Train step of epoch 1:  13%|█▎        | 816/6434 [1:54:58<14:01:04,  8.98s/it, gpt_loss=0.327, loss_mean=0.285][A[A
+
+Train step of epoch 1:  13%|█▎        | 816/6434 [1:55:08<14:01:04,  8.98s/it, gpt_loss=0.265, loss_mean=0.283][A[A
+
+Train step of epoch 1:  13%|█▎        | 817/6434 [1:55:08<14:33:40,  9.33s/it, gpt_loss=0.265, loss_mean=0.283][A[A
+
+Train step of epoch 1:  13%|█▎        | 817/6434 [1:55:17<14:33:40,  9.33s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 818/6434 [1:55:17<14:34:30,  9.34s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 818/6434 [1:55:25<14:34:30,  9.34s/it, gpt_loss=0.265, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 819/6434 [1:55:25<13:33:40,  8.69s/it, gpt_loss=0.265, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 819/6434 [1:55:33<13:33:40,  8.69s/it, gpt_loss=0.294, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 820/6434 [1:55:33<13:18:49,  8.54s/it, gpt_loss=0.294, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 820/6434 [1:55:41<13:18:49,  8.54s/it, gpt_loss=0.356, loss_mean=0.286][A[A
+
+Train step of epoch 1:  13%|█▎        | 821/6434 [1:55:41<13:05:33,  8.40s/it, gpt_loss=0.356, loss_mean=0.286][A[A
+
+Train step of epoch 1:  13%|█▎        | 821/6434 [1:55:50<13:05:33,  8.40s/it, gpt_loss=0.346, loss_mean=0.292][A[A
+
+Train step of epoch 1:  13%|█▎        | 822/6434 [1:55:50<13:13:46,  8.49s/it, gpt_loss=0.346, loss_mean=0.292][A[A
+
+Train step of epoch 1:  13%|█▎        | 822/6434 [1:55:58<13:13:46,  8.49s/it, gpt_loss=0.311, loss_mean=0.294][A[A
+
+Train step of epoch 1:  13%|█▎        | 823/6434 [1:55:58<13:26:07,  8.62s/it, gpt_loss=0.311, loss_mean=0.294][A[A
+
+Train step of epoch 1:  13%|█▎        | 823/6434 [1:56:07<13:26:07,  8.62s/it, gpt_loss=0.228, loss_mean=0.287][A[A
+
+Train step of epoch 1:  13%|█▎        | 824/6434 [1:56:07<13:10:25,  8.45s/it, gpt_loss=0.228, loss_mean=0.287][A[A
+
+Train step of epoch 1:  13%|█▎        | 824/6434 [1:56:15<13:10:25,  8.45s/it, gpt_loss=0.209, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 825/6434 [1:56:15<13:06:49,  8.42s/it, gpt_loss=0.209, loss_mean=0.279][A[A
+[LID Router Debug] Step: 7260
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 4, 5, 9, 3, 1, 4, 5, 5, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  13%|█▎        | 825/6434 [1:56:24<13:06:49,  8.42s/it, gpt_loss=0.274, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 826/6434 [1:56:24<13:17:19,  8.53s/it, gpt_loss=0.274, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 826/6434 [1:56:32<13:17:19,  8.53s/it, gpt_loss=0.301, loss_mean=0.281][A[A
+
+Train step of epoch 1:  13%|█▎        | 827/6434 [1:56:32<13:08:54,  8.44s/it, gpt_loss=0.301, loss_mean=0.281][A[A
+
+Train step of epoch 1:  13%|█▎        | 827/6434 [1:56:41<13:08:54,  8.44s/it, gpt_loss=0.267, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  13%|█▎        | 828/6434 [1:56:41<13:14:30,  8.50s/it, gpt_loss=0.267, loss_mean=0.28][A[A
+
+Train step of epoch 1:  13%|█▎        | 828/6434 [1:56:49<13:14:30,  8.50s/it, gpt_loss=0.268, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 829/6434 [1:56:49<13:09:54,  8.46s/it, gpt_loss=0.268, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 829/6434 [1:56:57<13:09:54,  8.46s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 830/6434 [1:56:57<12:52:52,  8.27s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 830/6434 [1:57:04<12:52:52,  8.27s/it, gpt_loss=0.284, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 831/6434 [1:57:04<12:36:50,  8.10s/it, gpt_loss=0.284, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 831/6434 [1:57:13<12:36:50,  8.10s/it, gpt_loss=0.219, loss_mean=0.271][A[A
+
+Train step of epoch 1:  13%|█▎        | 832/6434 [1:57:13<12:42:35,  8.17s/it, gpt_loss=0.219, loss_mean=0.271][A[A
+
+Train step of epoch 1:  13%|█▎        | 832/6434 [1:57:21<12:42:35,  8.17s/it, gpt_loss=0.213, loss_mean=0.265][A[A
+
+Train step of epoch 1:  13%|█▎        | 833/6434 [1:57:21<12:54:21,  8.30s/it, gpt_loss=0.213, loss_mean=0.265][A[A
+
+Train step of epoch 1:  13%|█▎        | 833/6434 [1:57:30<12:54:21,  8.30s/it, gpt_loss=0.336, loss_mean=0.272][A[A
+
+Train step of epoch 1:  13%|█▎        | 834/6434 [1:57:30<12:54:09,  8.29s/it, gpt_loss=0.336, loss_mean=0.272][A[A
+
+Train step of epoch 1:  13%|█▎        | 834/6434 [1:57:38<12:54:09,  8.29s/it, gpt_loss=0.278, loss_mean=0.273][A[A
+
+Train step of epoch 1:  13%|█▎        | 835/6434 [1:57:38<12:45:57,  8.21s/it, gpt_loss=0.278, loss_mean=0.273][A[A
+[LID Router Debug] Step: 7270
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [2, 3, 2, 2, 0, 3, 0, 4, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4}
+
+
+Train step of epoch 1:  13%|█▎        | 835/6434 [1:57:45<12:45:57,  8.21s/it, gpt_loss=0.32, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  13%|█▎        | 836/6434 [1:57:45<12:32:46,  8.07s/it, gpt_loss=0.32, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 836/6434 [1:57:54<12:32:46,  8.07s/it, gpt_loss=0.258, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 837/6434 [1:57:54<12:54:07,  8.30s/it, gpt_loss=0.258, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 837/6434 [1:58:03<12:54:07,  8.30s/it, gpt_loss=0.34, loss_mean=0.282] [A[A
+
+Train step of epoch 1:  13%|█▎        | 838/6434 [1:58:03<13:19:08,  8.57s/it, gpt_loss=0.34, loss_mean=0.282][A[A
+
+Train step of epoch 1:  13%|█▎        | 838/6434 [1:58:13<13:19:08,  8.57s/it, gpt_loss=0.256, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 839/6434 [1:58:13<13:34:57,  8.74s/it, gpt_loss=0.256, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 839/6434 [1:58:20<13:34:57,  8.74s/it, gpt_loss=0.198, loss_mean=0.271][A[A
+
+Train step of epoch 1:  13%|█▎        | 840/6434 [1:58:20<13:10:48,  8.48s/it, gpt_loss=0.198, loss_mean=0.271][A[A
+
+Train step of epoch 1:  13%|█▎        | 840/6434 [1:58:28<13:10:48,  8.48s/it, gpt_loss=0.211, loss_mean=0.265][A[A
+
+Train step of epoch 1:  13%|█▎        | 841/6434 [1:58:28<12:37:22,  8.12s/it, gpt_loss=0.211, loss_mean=0.265][A[A
+
+Train step of epoch 1:  13%|█▎        | 841/6434 [1:58:37<12:37:22,  8.12s/it, gpt_loss=0.215, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  13%|█▎        | 842/6434 [1:58:37<13:15:07,  8.53s/it, gpt_loss=0.215, loss_mean=0.26][A[A
+
+Train step of epoch 1:  13%|█▎        | 842/6434 [1:58:46<13:15:07,  8.53s/it, gpt_loss=0.24, loss_mean=0.258][A[A
+
+Train step of epoch 1:  13%|█▎        | 843/6434 [1:58:46<13:20:05,  8.59s/it, gpt_loss=0.24, loss_mean=0.258][A[A
+
+Train step of epoch 1:  13%|█▎        | 843/6434 [1:58:54<13:20:05,  8.59s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  13%|█▎        | 844/6434 [1:58:54<13:00:33,  8.38s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  13%|█▎        | 844/6434 [1:59:02<13:00:33,  8.38s/it, gpt_loss=0.316, loss_mean=0.262][A[A
+
+Train step of epoch 1:  13%|█▎        | 845/6434 [1:59:02<12:56:23,  8.33s/it, gpt_loss=0.316, loss_mean=0.262][A[A
+[LID Router Debug] Step: 7280
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [0, 0, 6, 0, 1, 9, 5, 4, 4, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  13%|█▎        | 845/6434 [1:59:10<12:56:23,  8.33s/it, gpt_loss=0.242, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  13%|█▎        | 846/6434 [1:59:10<12:51:30,  8.28s/it, gpt_loss=0.242, loss_mean=0.26][A[A
+
+Train step of epoch 1:  13%|█▎        | 846/6434 [1:59:19<12:51:30,  8.28s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+
+Train step of epoch 1:  13%|█▎        | 847/6434 [1:59:19<13:12:57,  8.52s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+
+Train step of epoch 1:  13%|█▎        | 847/6434 [1:59:28<13:12:57,  8.52s/it, gpt_loss=0.24, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  13%|█▎        | 848/6434 [1:59:28<13:04:18,  8.42s/it, gpt_loss=0.24, loss_mean=0.259][A[A
+
+Train step of epoch 1:  13%|█▎        | 848/6434 [1:59:37<13:04:18,  8.42s/it, gpt_loss=0.223, loss_mean=0.255][A[A
+
+Train step of epoch 1:  13%|█▎        | 849/6434 [1:59:37<13:25:51,  8.66s/it, gpt_loss=0.223, loss_mean=0.255][A[A
+
+Train step of epoch 1:  13%|█▎        | 849/6434 [1:59:45<13:25:51,  8.66s/it, gpt_loss=0.357, loss_mean=0.266][A[A
+
+Train step of epoch 1:  13%|█▎        | 850/6434 [1:59:45<13:19:52,  8.59s/it, gpt_loss=0.357, loss_mean=0.266][A[A
+
+Train step of epoch 1:  13%|█▎        | 850/6434 [1:59:52<13:19:52,  8.59s/it, gpt_loss=0.228, loss_mean=0.262][A[A
+
+Train step of epoch 1:  13%|█▎        | 851/6434 [1:59:52<12:39:23,  8.16s/it, gpt_loss=0.228, loss_mean=0.262][A[A
+
+Train step of epoch 1:  13%|█▎        | 851/6434 [2:00:00<12:39:23,  8.16s/it, gpt_loss=0.435, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 852/6434 [2:00:00<12:29:58,  8.06s/it, gpt_loss=0.435, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 852/6434 [2:00:09<12:29:58,  8.06s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 853/6434 [2:00:09<12:55:11,  8.33s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 853/6434 [2:00:17<12:55:11,  8.33s/it, gpt_loss=0.296, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 854/6434 [2:00:17<12:43:06,  8.21s/it, gpt_loss=0.296, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 854/6434 [2:00:25<12:43:06,  8.21s/it, gpt_loss=0.318, loss_mean=0.281][A[A
+
+Train step of epoch 1:  13%|█▎        | 855/6434 [2:00:25<12:36:32,  8.14s/it, gpt_loss=0.318, loss_mean=0.281][A[A
+[LID Router Debug] Step: 7290
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 4, 3, 3, 5, 0, 2, 0, 5, 4]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  13%|█▎        | 855/6434 [2:00:33<12:36:32,  8.14s/it, gpt_loss=0.245, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 856/6434 [2:00:33<12:22:31,  7.99s/it, gpt_loss=0.245, loss_mean=0.277][A[A
+
+Train step of epoch 1:  13%|█▎        | 856/6434 [2:00:40<12:22:31,  7.99s/it, gpt_loss=0.28, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  13%|█▎        | 857/6434 [2:00:40<12:09:10,  7.84s/it, gpt_loss=0.28, loss_mean=0.278][A[A
+
+Train step of epoch 1:  13%|█▎        | 857/6434 [2:00:49<12:09:10,  7.84s/it, gpt_loss=0.333, loss_mean=0.283][A[A
+
+Train step of epoch 1:  13%|█▎        | 858/6434 [2:00:49<12:39:40,  8.17s/it, gpt_loss=0.333, loss_mean=0.283][A[A
+
+Train step of epoch 1:  13%|█▎        | 858/6434 [2:00:57<12:39:40,  8.17s/it, gpt_loss=0.241, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 859/6434 [2:00:57<12:30:24,  8.08s/it, gpt_loss=0.241, loss_mean=0.279][A[A
+
+Train step of epoch 1:  13%|█▎        | 859/6434 [2:01:06<12:30:24,  8.08s/it, gpt_loss=0.33, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  13%|█▎        | 860/6434 [2:01:06<12:58:16,  8.38s/it, gpt_loss=0.33, loss_mean=0.284][A[A
+
+Train step of epoch 1:  13%|█▎        | 860/6434 [2:01:15<12:58:16,  8.38s/it, gpt_loss=0.268, loss_mean=0.282][A[A
+
+Train step of epoch 1:  13%|█▎        | 861/6434 [2:01:15<13:01:11,  8.41s/it, gpt_loss=0.268, loss_mean=0.282][A[A
+
+Train step of epoch 1:  13%|█▎        | 861/6434 [2:01:23<13:01:11,  8.41s/it, gpt_loss=0.274, loss_mean=0.282][A[A
+
+Train step of epoch 1:  13%|█▎        | 862/6434 [2:01:23<12:52:37,  8.32s/it, gpt_loss=0.274, loss_mean=0.282][A[A
+
+Train step of epoch 1:  13%|█▎        | 862/6434 [2:01:31<12:52:37,  8.32s/it, gpt_loss=0.225, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 863/6434 [2:01:31<12:49:06,  8.28s/it, gpt_loss=0.225, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 863/6434 [2:01:40<12:49:06,  8.28s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 864/6434 [2:01:40<13:22:21,  8.64s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 864/6434 [2:01:49<13:22:21,  8.64s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+
+Train step of epoch 1:  13%|█▎        | 865/6434 [2:01:49<13:18:57,  8.61s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+[LID Router Debug] Step: 7300
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [2, 2, 4, 4, 6, 0, 1, 5, 5, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  13%|█▎        | 865/6434 [2:01:58<13:18:57,  8.61s/it, gpt_loss=0.298, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 866/6434 [2:01:58<13:43:36,  8.88s/it, gpt_loss=0.298, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 866/6434 [2:02:07<13:43:36,  8.88s/it, gpt_loss=0.272, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 867/6434 [2:02:07<13:33:23,  8.77s/it, gpt_loss=0.272, loss_mean=0.276][A[A
+
+Train step of epoch 1:  13%|█▎        | 867/6434 [2:02:14<13:33:23,  8.77s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 868/6434 [2:02:14<12:59:38,  8.40s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  13%|█▎        | 868/6434 [2:02:23<12:59:38,  8.40s/it, gpt_loss=0.249, loss_mean=0.272][A[A
+
+Train step of epoch 1:  14%|█▎        | 869/6434 [2:02:23<13:04:09,  8.45s/it, gpt_loss=0.249, loss_mean=0.272][A[A
+
+Train step of epoch 1:  14%|█▎        | 869/6434 [2:02:31<13:04:09,  8.45s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  14%|█▎        | 870/6434 [2:02:31<12:49:28,  8.30s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  14%|█▎        | 870/6434 [2:02:39<12:49:28,  8.30s/it, gpt_loss=0.24, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  14%|█▎        | 871/6434 [2:02:39<12:42:02,  8.22s/it, gpt_loss=0.24, loss_mean=0.268][A[A
+
+Train step of epoch 1:  14%|█▎        | 871/6434 [2:02:49<12:42:02,  8.22s/it, gpt_loss=0.198, loss_mean=0.261][A[A
+
+Train step of epoch 1:  14%|█▎        | 872/6434 [2:02:49<13:24:21,  8.68s/it, gpt_loss=0.198, loss_mean=0.261][A[A
+
+Train step of epoch 1:  14%|█▎        | 872/6434 [2:02:56<13:24:21,  8.68s/it, gpt_loss=0.301, loss_mean=0.265][A[A
+
+Train step of epoch 1:  14%|█▎        | 873/6434 [2:02:56<12:58:53,  8.40s/it, gpt_loss=0.301, loss_mean=0.265][A[A
+
+Train step of epoch 1:  14%|█▎        | 873/6434 [2:03:05<12:58:53,  8.40s/it, gpt_loss=0.294, loss_mean=0.268][A[A
+
+Train step of epoch 1:  14%|█▎        | 874/6434 [2:03:05<13:07:18,  8.50s/it, gpt_loss=0.294, loss_mean=0.268][A[A
+
+Train step of epoch 1:  14%|█▎        | 874/6434 [2:03:13<13:07:18,  8.50s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:  14%|█▎        | 875/6434 [2:03:13<12:55:22,  8.37s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+[LID Router Debug] Step: 7310
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [6, 1, 6, 9, 2, 9, 2, 2, 9, 5]
+Active Experts in Batch: {1, 2, 5, 6, 9}
+
+
+Train step of epoch 1:  14%|█▎        | 875/6434 [2:03:21<12:55:22,  8.37s/it, gpt_loss=0.411, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▎        | 876/6434 [2:03:21<12:50:26,  8.32s/it, gpt_loss=0.411, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▎        | 876/6434 [2:03:29<12:50:26,  8.32s/it, gpt_loss=0.233, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 877/6434 [2:03:29<12:33:32,  8.14s/it, gpt_loss=0.233, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 877/6434 [2:03:37<12:33:32,  8.14s/it, gpt_loss=0.202, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  14%|█▎        | 878/6434 [2:03:37<12:34:34,  8.15s/it, gpt_loss=0.202, loss_mean=0.27][A[A
+
+Train step of epoch 1:  14%|█▎        | 878/6434 [2:03:46<12:34:34,  8.15s/it, gpt_loss=0.361, loss_mean=0.279][A[A
+
+Train step of epoch 1:  14%|█▎        | 879/6434 [2:03:46<12:38:08,  8.19s/it, gpt_loss=0.361, loss_mean=0.279][A[A
+
+Train step of epoch 1:  14%|█▎        | 879/6434 [2:03:54<12:38:08,  8.19s/it, gpt_loss=0.294, loss_mean=0.281][A[A
+
+Train step of epoch 1:  14%|█▎        | 880/6434 [2:03:54<12:56:39,  8.39s/it, gpt_loss=0.294, loss_mean=0.281][A[A
+
+Train step of epoch 1:  14%|█▎        | 880/6434 [2:04:02<12:56:39,  8.39s/it, gpt_loss=0.256, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 881/6434 [2:04:02<12:44:21,  8.26s/it, gpt_loss=0.256, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 881/6434 [2:04:10<12:44:21,  8.26s/it, gpt_loss=0.266, loss_mean=0.277][A[A
+
+Train step of epoch 1:  14%|█▎        | 882/6434 [2:04:10<12:32:33,  8.13s/it, gpt_loss=0.266, loss_mean=0.277][A[A
+
+Train step of epoch 1:  14%|█▎        | 882/6434 [2:04:18<12:32:33,  8.13s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 883/6434 [2:04:18<12:29:15,  8.10s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▎        | 883/6434 [2:04:27<12:29:15,  8.10s/it, gpt_loss=0.343, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▎        | 884/6434 [2:04:27<12:52:03,  8.35s/it, gpt_loss=0.343, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▎        | 884/6434 [2:04:37<12:52:03,  8.35s/it, gpt_loss=0.285, loss_mean=0.285][A[A
+
+Train step of epoch 1:  14%|█▍        | 885/6434 [2:04:37<13:21:06,  8.66s/it, gpt_loss=0.285, loss_mean=0.285][A[A
+[LID Router Debug] Step: 7320
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [3, 9, 1, 5, 6, 9, 4, 5, 5, 3]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  14%|█▍        | 885/6434 [2:04:44<13:21:06,  8.66s/it, gpt_loss=0.32, loss_mean=0.288] [A[A
+
+Train step of epoch 1:  14%|█▍        | 886/6434 [2:04:44<12:47:51,  8.30s/it, gpt_loss=0.32, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 886/6434 [2:04:53<12:47:51,  8.30s/it, gpt_loss=0.284, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 887/6434 [2:04:53<13:01:55,  8.46s/it, gpt_loss=0.284, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 887/6434 [2:05:01<13:01:55,  8.46s/it, gpt_loss=0.232, loss_mean=0.282][A[A
+
+Train step of epoch 1:  14%|█▍        | 888/6434 [2:05:01<12:59:52,  8.44s/it, gpt_loss=0.232, loss_mean=0.282][A[A
+
+Train step of epoch 1:  14%|█▍        | 888/6434 [2:05:10<12:59:52,  8.44s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 889/6434 [2:05:10<13:15:09,  8.60s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 889/6434 [2:05:19<13:15:09,  8.60s/it, gpt_loss=0.342, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 890/6434 [2:05:19<13:13:21,  8.59s/it, gpt_loss=0.342, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 890/6434 [2:05:27<13:13:21,  8.59s/it, gpt_loss=0.361, loss_mean=0.291][A[A
+
+Train step of epoch 1:  14%|█▍        | 891/6434 [2:05:27<13:00:47,  8.45s/it, gpt_loss=0.361, loss_mean=0.291][A[A
+
+Train step of epoch 1:  14%|█▍        | 891/6434 [2:05:35<13:00:47,  8.45s/it, gpt_loss=0.22, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  14%|█▍        | 892/6434 [2:05:35<12:57:03,  8.41s/it, gpt_loss=0.22, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▍        | 892/6434 [2:05:44<12:57:03,  8.41s/it, gpt_loss=0.249, loss_mean=0.28][A[A
+
+Train step of epoch 1:  14%|█▍        | 893/6434 [2:05:44<12:59:54,  8.45s/it, gpt_loss=0.249, loss_mean=0.28][A[A
+
+Train step of epoch 1:  14%|█▍        | 893/6434 [2:05:52<12:59:54,  8.45s/it, gpt_loss=0.351, loss_mean=0.287][A[A
+
+Train step of epoch 1:  14%|█▍        | 894/6434 [2:05:52<12:45:19,  8.29s/it, gpt_loss=0.351, loss_mean=0.287][A[A
+
+Train step of epoch 1:  14%|█▍        | 894/6434 [2:06:00<12:45:19,  8.29s/it, gpt_loss=0.227, loss_mean=0.281][A[A
+
+Train step of epoch 1:  14%|█▍        | 895/6434 [2:06:00<12:34:29,  8.17s/it, gpt_loss=0.227, loss_mean=0.281][A[A
+[LID Router Debug] Step: 7330
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [4, 2, 3, 2, 4, 3, 3, 5, 2, 9]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  14%|█▍        | 895/6434 [2:06:08<12:34:29,  8.17s/it, gpt_loss=0.249, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▍        | 896/6434 [2:06:08<12:52:49,  8.37s/it, gpt_loss=0.249, loss_mean=0.278][A[A
+
+Train step of epoch 1:  14%|█▍        | 896/6434 [2:06:18<12:52:49,  8.37s/it, gpt_loss=0.251, loss_mean=0.275][A[A
+
+Train step of epoch 1:  14%|█▍        | 897/6434 [2:06:18<13:24:45,  8.72s/it, gpt_loss=0.251, loss_mean=0.275][A[A
+
+Train step of epoch 1:  14%|█▍        | 897/6434 [2:06:27<13:24:45,  8.72s/it, gpt_loss=0.235, loss_mean=0.271][A[A
+
+Train step of epoch 1:  14%|█▍        | 898/6434 [2:06:27<13:39:43,  8.88s/it, gpt_loss=0.235, loss_mean=0.271][A[A
+
+Train step of epoch 1:  14%|█▍        | 898/6434 [2:06:35<13:39:43,  8.88s/it, gpt_loss=0.27, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  14%|█▍        | 899/6434 [2:06:35<13:04:20,  8.50s/it, gpt_loss=0.27, loss_mean=0.271][A[A
+
+Train step of epoch 1:  14%|█▍        | 899/6434 [2:06:43<13:04:20,  8.50s/it, gpt_loss=0.234, loss_mean=0.267][A[A
+
+Train step of epoch 1:  14%|█▍        | 900/6434 [2:06:43<12:53:23,  8.39s/it, gpt_loss=0.234, loss_mean=0.267][A[A
+
+Train step of epoch 1:  14%|█▍        | 900/6434 [2:06:52<12:53:23,  8.39s/it, gpt_loss=0.234, loss_mean=0.264][A[A
+
+Train step of epoch 1:  14%|█▍        | 901/6434 [2:06:52<13:15:56,  8.63s/it, gpt_loss=0.234, loss_mean=0.264][A[A
+
+Train step of epoch 1:  14%|█▍        | 901/6434 [2:07:01<13:15:56,  8.63s/it, gpt_loss=0.187, loss_mean=0.256][A[A
+
+Train step of epoch 1:  14%|█▍        | 902/6434 [2:07:01<13:26:38,  8.75s/it, gpt_loss=0.187, loss_mean=0.256][A[A
+
+Train step of epoch 1:  14%|█▍        | 902/6434 [2:07:09<13:26:38,  8.75s/it, gpt_loss=0.248, loss_mean=0.256][A[A
+
+Train step of epoch 1:  14%|█▍        | 903/6434 [2:07:09<12:47:43,  8.33s/it, gpt_loss=0.248, loss_mean=0.256][A[A
+
+Train step of epoch 1:  14%|█▍        | 903/6434 [2:07:17<12:47:43,  8.33s/it, gpt_loss=0.292, loss_mean=0.259][A[A
+
+Train step of epoch 1:  14%|█▍        | 904/6434 [2:07:17<12:40:39,  8.25s/it, gpt_loss=0.292, loss_mean=0.259][A[A
+
+Train step of epoch 1:  14%|█▍        | 904/6434 [2:07:27<12:40:39,  8.25s/it, gpt_loss=0.289, loss_mean=0.262][A[A
+
+Train step of epoch 1:  14%|█▍        | 905/6434 [2:07:27<13:29:54,  8.79s/it, gpt_loss=0.289, loss_mean=0.262][A[A
+[LID Router Debug] Step: 7340
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [4, 9, 4, 2, 0, 1, 5, 2, 1, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  14%|█▍        | 905/6434 [2:07:35<13:29:54,  8.79s/it, gpt_loss=0.288, loss_mean=0.265][A[A
+
+Train step of epoch 1:  14%|█▍        | 906/6434 [2:07:35<13:08:50,  8.56s/it, gpt_loss=0.288, loss_mean=0.265][A[A
+
+Train step of epoch 1:  14%|█▍        | 906/6434 [2:07:43<13:08:50,  8.56s/it, gpt_loss=0.292, loss_mean=0.268][A[A
+
+Train step of epoch 1:  14%|█▍        | 907/6434 [2:07:43<12:52:42,  8.39s/it, gpt_loss=0.292, loss_mean=0.268][A[A
+
+Train step of epoch 1:  14%|█▍        | 907/6434 [2:07:52<12:52:42,  8.39s/it, gpt_loss=0.352, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 908/6434 [2:07:52<13:12:03,  8.60s/it, gpt_loss=0.352, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 908/6434 [2:07:59<13:12:03,  8.60s/it, gpt_loss=0.302, loss_mean=0.279][A[A
+
+Train step of epoch 1:  14%|█▍        | 909/6434 [2:07:59<12:40:43,  8.26s/it, gpt_loss=0.302, loss_mean=0.279][A[A
+
+Train step of epoch 1:  14%|█▍        | 909/6434 [2:08:08<12:40:43,  8.26s/it, gpt_loss=0.308, loss_mean=0.282][A[A
+
+Train step of epoch 1:  14%|█▍        | 910/6434 [2:08:08<12:48:56,  8.35s/it, gpt_loss=0.308, loss_mean=0.282][A[A
+
+Train step of epoch 1:  14%|█▍        | 910/6434 [2:08:15<12:48:56,  8.35s/it, gpt_loss=0.221, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 911/6434 [2:08:15<12:20:21,  8.04s/it, gpt_loss=0.221, loss_mean=0.276][A[A
+
+Train step of epoch 1:  14%|█▍        | 911/6434 [2:08:23<12:20:21,  8.04s/it, gpt_loss=0.348, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 912/6434 [2:08:23<12:27:21,  8.12s/it, gpt_loss=0.348, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 912/6434 [2:08:32<12:27:21,  8.12s/it, gpt_loss=0.323, loss_mean=0.287][A[A
+
+Train step of epoch 1:  14%|█▍        | 913/6434 [2:08:32<12:39:34,  8.25s/it, gpt_loss=0.323, loss_mean=0.287][A[A
+
+Train step of epoch 1:  14%|█▍        | 913/6434 [2:08:41<12:39:34,  8.25s/it, gpt_loss=0.219, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  14%|█▍        | 914/6434 [2:08:41<12:46:26,  8.33s/it, gpt_loss=0.219, loss_mean=0.28][A[A
+
+Train step of epoch 1:  14%|█▍        | 914/6434 [2:08:48<12:46:26,  8.33s/it, gpt_loss=0.289, loss_mean=0.281][A[A
+
+Train step of epoch 1:  14%|█▍        | 915/6434 [2:08:48<12:16:19,  8.00s/it, gpt_loss=0.289, loss_mean=0.281][A[A
+[LID Router Debug] Step: 7350
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [0, 2, 5, 9, 2, 2, 0, 9, 2, 1]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:  14%|█▍        | 915/6434 [2:08:56<12:16:19,  8.00s/it, gpt_loss=0.301, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 916/6434 [2:08:56<12:12:25,  7.96s/it, gpt_loss=0.301, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 916/6434 [2:09:04<12:12:25,  7.96s/it, gpt_loss=0.302, loss_mean=0.285][A[A
+
+Train step of epoch 1:  14%|█▍        | 917/6434 [2:09:04<12:20:49,  8.06s/it, gpt_loss=0.302, loss_mean=0.285][A[A
+
+Train step of epoch 1:  14%|█▍        | 917/6434 [2:09:12<12:20:49,  8.06s/it, gpt_loss=0.273, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▍        | 918/6434 [2:09:12<12:32:41,  8.19s/it, gpt_loss=0.273, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▍        | 918/6434 [2:09:21<12:32:41,  8.19s/it, gpt_loss=0.308, loss_mean=0.286][A[A
+
+Train step of epoch 1:  14%|█▍        | 919/6434 [2:09:21<12:43:20,  8.30s/it, gpt_loss=0.308, loss_mean=0.286][A[A
+
+Train step of epoch 1:  14%|█▍        | 919/6434 [2:09:29<12:43:20,  8.30s/it, gpt_loss=0.257, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 920/6434 [2:09:29<12:25:41,  8.11s/it, gpt_loss=0.257, loss_mean=0.283][A[A
+
+Train step of epoch 1:  14%|█▍        | 920/6434 [2:09:37<12:25:41,  8.11s/it, gpt_loss=0.295, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▍        | 921/6434 [2:09:37<12:40:37,  8.28s/it, gpt_loss=0.295, loss_mean=0.284][A[A
+
+Train step of epoch 1:  14%|█▍        | 921/6434 [2:09:45<12:40:37,  8.28s/it, gpt_loss=0.324, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 922/6434 [2:09:45<12:30:10,  8.17s/it, gpt_loss=0.324, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 922/6434 [2:09:54<12:30:10,  8.17s/it, gpt_loss=0.293, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 923/6434 [2:09:54<12:36:14,  8.23s/it, gpt_loss=0.293, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 923/6434 [2:10:02<12:36:14,  8.23s/it, gpt_loss=0.455, loss_mean=0.305][A[A
+
+Train step of epoch 1:  14%|█▍        | 924/6434 [2:10:02<12:45:21,  8.33s/it, gpt_loss=0.455, loss_mean=0.305][A[A
+
+Train step of epoch 1:  14%|█▍        | 924/6434 [2:10:11<12:45:21,  8.33s/it, gpt_loss=0.21, loss_mean=0.296] [A[A
+
+Train step of epoch 1:  14%|█▍        | 925/6434 [2:10:11<13:11:06,  8.62s/it, gpt_loss=0.21, loss_mean=0.296][A[A
+[LID Router Debug] Step: 7360
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [2, 2, 1, 0, 5, 0, 4, 4, 1, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+
+Train step of epoch 1:  14%|█▍        | 925/6434 [2:10:21<13:11:06,  8.62s/it, gpt_loss=0.237, loss_mean=0.29][A[A
+
+Train step of epoch 1:  14%|█▍        | 926/6434 [2:10:21<13:45:29,  8.99s/it, gpt_loss=0.237, loss_mean=0.29][A[A
+
+Train step of epoch 1:  14%|█▍        | 926/6434 [2:10:32<13:45:29,  8.99s/it, gpt_loss=0.268, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 927/6434 [2:10:32<14:20:45,  9.38s/it, gpt_loss=0.268, loss_mean=0.288][A[A
+
+Train step of epoch 1:  14%|█▍        | 927/6434 [2:10:40<14:20:45,  9.38s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 928/6434 [2:10:40<13:48:10,  9.02s/it, gpt_loss=0.295, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 928/6434 [2:10:48<13:48:10,  9.02s/it, gpt_loss=0.315, loss_mean=0.291][A[A
+
+Train step of epoch 1:  14%|█▍        | 929/6434 [2:10:48<13:16:04,  8.68s/it, gpt_loss=0.315, loss_mean=0.291][A[A
+
+Train step of epoch 1:  14%|█▍        | 929/6434 [2:10:55<13:16:04,  8.68s/it, gpt_loss=0.299, loss_mean=0.292][A[A
+
+Train step of epoch 1:  14%|█▍        | 930/6434 [2:10:55<12:46:27,  8.36s/it, gpt_loss=0.299, loss_mean=0.292][A[A
+
+Train step of epoch 1:  14%|█▍        | 930/6434 [2:11:03<12:46:27,  8.36s/it, gpt_loss=0.319, loss_mean=0.295][A[A
+
+Train step of epoch 1:  14%|█▍        | 931/6434 [2:11:03<12:17:08,  8.04s/it, gpt_loss=0.319, loss_mean=0.295][A[A
+
+Train step of epoch 1:  14%|█▍        | 931/6434 [2:11:11<12:17:08,  8.04s/it, gpt_loss=0.241, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 932/6434 [2:11:11<12:36:26,  8.25s/it, gpt_loss=0.241, loss_mean=0.289][A[A
+
+Train step of epoch 1:  14%|█▍        | 932/6434 [2:11:20<12:36:26,  8.25s/it, gpt_loss=0.283, loss_mean=0.289][A[A
+
+Train step of epoch 1:  15%|█▍        | 933/6434 [2:11:20<12:46:54,  8.36s/it, gpt_loss=0.283, loss_mean=0.289][A[A
+
+Train step of epoch 1:  15%|█▍        | 933/6434 [2:11:28<12:46:54,  8.36s/it, gpt_loss=0.23, loss_mean=0.283] [A[A
+
+Train step of epoch 1:  15%|█▍        | 934/6434 [2:11:28<12:45:22,  8.35s/it, gpt_loss=0.23, loss_mean=0.283][A[A
+
+Train step of epoch 1:  15%|█▍        | 934/6434 [2:11:36<12:45:22,  8.35s/it, gpt_loss=0.22, loss_mean=0.277][A[A
+
+Train step of epoch 1:  15%|█▍        | 935/6434 [2:11:36<12:38:43,  8.28s/it, gpt_loss=0.22, loss_mean=0.277][A[A
+[LID Router Debug] Step: 7370
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [9, 4, 5, 9, 4, 2, 2, 2, 0, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  15%|█▍        | 935/6434 [2:11:45<12:38:43,  8.28s/it, gpt_loss=0.279, loss_mean=0.277][A[A
+
+Train step of epoch 1:  15%|█▍        | 936/6434 [2:11:45<12:39:43,  8.29s/it, gpt_loss=0.279, loss_mean=0.277][A[A
+
+Train step of epoch 1:  15%|█▍        | 936/6434 [2:11:53<12:39:43,  8.29s/it, gpt_loss=0.27, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  15%|█▍        | 937/6434 [2:11:53<12:44:56,  8.35s/it, gpt_loss=0.27, loss_mean=0.276][A[A
+
+Train step of epoch 1:  15%|█▍        | 937/6434 [2:12:00<12:44:56,  8.35s/it, gpt_loss=0.214, loss_mean=0.27][A[A
+
+Train step of epoch 1:  15%|█▍        | 938/6434 [2:12:00<12:14:28,  8.02s/it, gpt_loss=0.214, loss_mean=0.27][A[A
+
+Train step of epoch 1:  15%|█▍        | 938/6434 [2:12:09<12:14:28,  8.02s/it, gpt_loss=0.281, loss_mean=0.271][A[A
+
+Train step of epoch 1:  15%|█▍        | 939/6434 [2:12:09<12:41:37,  8.32s/it, gpt_loss=0.281, loss_mean=0.271][A[A
+
+Train step of epoch 1:  15%|█▍        | 939/6434 [2:12:18<12:41:37,  8.32s/it, gpt_loss=0.303, loss_mean=0.274][A[A
+
+Train step of epoch 1:  15%|█▍        | 940/6434 [2:12:18<12:47:22,  8.38s/it, gpt_loss=0.303, loss_mean=0.274][A[A
+
+Train step of epoch 1:  15%|█▍        | 940/6434 [2:12:27<12:47:22,  8.38s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:  15%|█▍        | 941/6434 [2:12:27<12:59:23,  8.51s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:  15%|█▍        | 941/6434 [2:12:34<12:59:23,  8.51s/it, gpt_loss=0.322, loss_mean=0.281][A[A
+
+Train step of epoch 1:  15%|█▍        | 942/6434 [2:12:34<12:29:12,  8.19s/it, gpt_loss=0.322, loss_mean=0.281][A[A
+
+Train step of epoch 1:  15%|█▍        | 942/6434 [2:12:43<12:29:12,  8.19s/it, gpt_loss=0.302, loss_mean=0.283][A[A
+
+Train step of epoch 1:  15%|█▍        | 943/6434 [2:12:43<12:49:52,  8.41s/it, gpt_loss=0.302, loss_mean=0.283][A[A
+
+Train step of epoch 1:  15%|█▍        | 943/6434 [2:12:52<12:49:52,  8.41s/it, gpt_loss=0.254, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  15%|█▍        | 944/6434 [2:12:52<12:58:09,  8.50s/it, gpt_loss=0.254, loss_mean=0.28][A[A
+
+Train step of epoch 1:  15%|█▍        | 944/6434 [2:13:01<12:58:09,  8.50s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+
+Train step of epoch 1:  15%|█▍        | 945/6434 [2:13:01<13:17:08,  8.71s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+[LID Router Debug] Step: 7380
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 3, 9, 1, 0, 4, 5, 9, 4, 9]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  15%|█▍        | 945/6434 [2:13:10<13:17:08,  8.71s/it, gpt_loss=0.2, loss_mean=0.268]  [A[A
+
+Train step of epoch 1:  15%|█▍        | 946/6434 [2:13:10<13:14:14,  8.68s/it, gpt_loss=0.2, loss_mean=0.268][A[A
+
+Train step of epoch 1:  15%|█▍        | 946/6434 [2:13:19<13:14:14,  8.68s/it, gpt_loss=0.24, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▍        | 947/6434 [2:13:19<13:34:58,  8.91s/it, gpt_loss=0.24, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▍        | 947/6434 [2:13:28<13:34:58,  8.91s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  15%|█▍        | 948/6434 [2:13:28<13:37:00,  8.94s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  15%|█▍        | 948/6434 [2:13:37<13:37:00,  8.94s/it, gpt_loss=0.26, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  15%|█▍        | 949/6434 [2:13:37<13:23:40,  8.79s/it, gpt_loss=0.26, loss_mean=0.262][A[A
+
+Train step of epoch 1:  15%|█▍        | 949/6434 [2:13:45<13:23:40,  8.79s/it, gpt_loss=0.247, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▍        | 950/6434 [2:13:45<13:27:00,  8.83s/it, gpt_loss=0.247, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▍        | 950/6434 [2:13:54<13:27:00,  8.83s/it, gpt_loss=0.216, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▍        | 951/6434 [2:13:54<13:28:10,  8.84s/it, gpt_loss=0.216, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▍        | 951/6434 [2:14:03<13:28:10,  8.84s/it, gpt_loss=0.286, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▍        | 952/6434 [2:14:03<13:09:07,  8.64s/it, gpt_loss=0.286, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▍        | 952/6434 [2:14:11<13:09:07,  8.64s/it, gpt_loss=0.249, loss_mean=0.258][A[A
+
+Train step of epoch 1:  15%|█▍        | 953/6434 [2:14:11<13:10:11,  8.65s/it, gpt_loss=0.249, loss_mean=0.258][A[A
+
+Train step of epoch 1:  15%|█▍        | 953/6434 [2:14:19<13:10:11,  8.65s/it, gpt_loss=0.24, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  15%|█▍        | 954/6434 [2:14:19<12:58:48,  8.53s/it, gpt_loss=0.24, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▍        | 954/6434 [2:14:28<12:58:48,  8.53s/it, gpt_loss=0.214, loss_mean=0.252][A[A
+
+Train step of epoch 1:  15%|█▍        | 955/6434 [2:14:28<12:50:59,  8.44s/it, gpt_loss=0.214, loss_mean=0.252][A[A
+[LID Router Debug] Step: 7390
+Batch Size: 10
+Audio Batch Size: 78
+LID Assignments: [6, 5, 9, 0, 4, 0, 4, 6, 4, 0]
+Active Experts in Batch: {0, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  15%|█▍        | 955/6434 [2:14:36<12:50:59,  8.44s/it, gpt_loss=0.258, loss_mean=0.253][A[A
+
+Train step of epoch 1:  15%|█▍        | 956/6434 [2:14:36<12:42:23,  8.35s/it, gpt_loss=0.258, loss_mean=0.253][A[A
+
+Train step of epoch 1:  15%|█▍        | 956/6434 [2:14:44<12:42:23,  8.35s/it, gpt_loss=0.255, loss_mean=0.253][A[A
+
+Train step of epoch 1:  15%|█▍        | 957/6434 [2:14:44<12:43:46,  8.37s/it, gpt_loss=0.255, loss_mean=0.253][A[A
+
+Train step of epoch 1:  15%|█▍        | 957/6434 [2:14:53<12:43:46,  8.37s/it, gpt_loss=0.268, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 958/6434 [2:14:53<13:08:22,  8.64s/it, gpt_loss=0.268, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 958/6434 [2:15:02<13:08:22,  8.64s/it, gpt_loss=0.194, loss_mean=0.248][A[A
+
+Train step of epoch 1:  15%|█▍        | 959/6434 [2:15:02<13:13:28,  8.70s/it, gpt_loss=0.194, loss_mean=0.248][A[A
+
+Train step of epoch 1:  15%|█▍        | 959/6434 [2:15:11<13:13:28,  8.70s/it, gpt_loss=0.243, loss_mean=0.248][A[A
+
+Train step of epoch 1:  15%|█▍        | 960/6434 [2:15:11<13:26:30,  8.84s/it, gpt_loss=0.243, loss_mean=0.248][A[A
+
+Train step of epoch 1:  15%|█▍        | 960/6434 [2:15:21<13:26:30,  8.84s/it, gpt_loss=0.308, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 961/6434 [2:15:21<13:43:31,  9.03s/it, gpt_loss=0.308, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 961/6434 [2:15:29<13:43:31,  9.03s/it, gpt_loss=0.251, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 962/6434 [2:15:29<13:11:48,  8.68s/it, gpt_loss=0.251, loss_mean=0.254][A[A
+
+Train step of epoch 1:  15%|█▍        | 962/6434 [2:15:38<13:11:48,  8.68s/it, gpt_loss=0.227, loss_mean=0.251][A[A
+
+Train step of epoch 1:  15%|█▍        | 963/6434 [2:15:38<13:16:20,  8.73s/it, gpt_loss=0.227, loss_mean=0.251][A[A
+
+Train step of epoch 1:  15%|█▍        | 963/6434 [2:15:46<13:16:20,  8.73s/it, gpt_loss=0.255, loss_mean=0.251][A[A
+
+Train step of epoch 1:  15%|█▍        | 964/6434 [2:15:46<12:53:16,  8.48s/it, gpt_loss=0.255, loss_mean=0.251][A[A
+
+Train step of epoch 1:  15%|█▍        | 964/6434 [2:15:55<12:53:16,  8.48s/it, gpt_loss=0.28, loss_mean=0.254] [A[A
+
+Train step of epoch 1:  15%|█▍        | 965/6434 [2:15:55<13:13:20,  8.70s/it, gpt_loss=0.28, loss_mean=0.254][A[A
+[LID Router Debug] Step: 7400
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [5, 9, 5, 2, 9, 3, 3, 5, 2, 5]
+Active Experts in Batch: {9, 2, 3, 5}
+[2026-02-07 09:17:50,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=0, lr=[1.3765231544017158e-05, 1.3765231544017158e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 09:17:50,559] [INFO] [timer.py:260:stop] epoch=0/micro_step=7400/global_step=3700, RunningAvgSamplesPerSec=4.7464975407334915, CurrSamplesPerSec=4.478527476347956, MemAllocated=12.68GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  15%|█▍        | 965/6434 [2:16:03<13:13:20,  8.70s/it, gpt_loss=0.346, loss_mean=0.263][A[A
+
+Train step of epoch 1:  15%|█▌        | 966/6434 [2:16:03<13:12:26,  8.70s/it, gpt_loss=0.346, loss_mean=0.263][A[A
+
+Train step of epoch 1:  15%|█▌        | 966/6434 [2:16:13<13:12:26,  8.70s/it, gpt_loss=0.176, loss_mean=0.255][A[A
+
+Train step of epoch 1:  15%|█▌        | 967/6434 [2:16:13<13:37:48,  8.98s/it, gpt_loss=0.176, loss_mean=0.255][A[A
+
+Train step of epoch 1:  15%|█▌        | 967/6434 [2:16:22<13:37:48,  8.98s/it, gpt_loss=0.272, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▌        | 968/6434 [2:16:22<13:45:56,  9.07s/it, gpt_loss=0.272, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▌        | 968/6434 [2:16:31<13:45:56,  9.07s/it, gpt_loss=0.261, loss_mean=0.257][A[A
+
+Train step of epoch 1:  15%|█▌        | 969/6434 [2:16:31<13:25:18,  8.84s/it, gpt_loss=0.261, loss_mean=0.257][A[A
+
+Train step of epoch 1:  15%|█▌        | 969/6434 [2:16:39<13:25:18,  8.84s/it, gpt_loss=0.252, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▌        | 970/6434 [2:16:39<13:22:20,  8.81s/it, gpt_loss=0.252, loss_mean=0.256][A[A
+
+Train step of epoch 1:  15%|█▌        | 970/6434 [2:16:47<13:22:20,  8.81s/it, gpt_loss=0.306, loss_mean=0.261][A[A
+
+Train step of epoch 1:  15%|█▌        | 971/6434 [2:16:47<12:43:17,  8.38s/it, gpt_loss=0.306, loss_mean=0.261][A[A
+
+Train step of epoch 1:  15%|█▌        | 971/6434 [2:16:54<12:43:17,  8.38s/it, gpt_loss=0.253, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  15%|█▌        | 972/6434 [2:16:54<12:21:14,  8.14s/it, gpt_loss=0.253, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▌        | 972/6434 [2:17:03<12:21:14,  8.14s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▌        | 973/6434 [2:17:03<12:45:02,  8.41s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▌        | 973/6434 [2:17:12<12:45:02,  8.41s/it, gpt_loss=0.3, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  15%|█▌        | 974/6434 [2:17:12<12:41:46,  8.37s/it, gpt_loss=0.3, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 974/6434 [2:17:20<12:41:46,  8.37s/it, gpt_loss=0.226, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▌        | 975/6434 [2:17:20<12:46:52,  8.43s/it, gpt_loss=0.226, loss_mean=0.26][A[A
+[LID Router Debug] Step: 7410
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 2, 3, 0, 6, 1, 4, 1, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+
+Train step of epoch 1:  15%|█▌        | 975/6434 [2:17:29<12:46:52,  8.43s/it, gpt_loss=0.286, loss_mean=0.263][A[A
+
+Train step of epoch 1:  15%|█▌        | 976/6434 [2:17:29<12:54:39,  8.52s/it, gpt_loss=0.286, loss_mean=0.263][A[A
+
+Train step of epoch 1:  15%|█▌        | 976/6434 [2:17:38<12:54:39,  8.52s/it, gpt_loss=0.225, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▌        | 977/6434 [2:17:38<13:10:32,  8.69s/it, gpt_loss=0.225, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▌        | 977/6434 [2:17:47<13:10:32,  8.69s/it, gpt_loss=0.273, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  15%|█▌        | 978/6434 [2:17:47<13:23:47,  8.84s/it, gpt_loss=0.273, loss_mean=0.26][A[A
+
+Train step of epoch 1:  15%|█▌        | 978/6434 [2:17:56<13:23:47,  8.84s/it, gpt_loss=0.279, loss_mean=0.262][A[A
+
+Train step of epoch 1:  15%|█▌        | 979/6434 [2:17:56<13:19:52,  8.80s/it, gpt_loss=0.279, loss_mean=0.262][A[A
+
+Train step of epoch 1:  15%|█▌        | 979/6434 [2:18:04<13:19:52,  8.80s/it, gpt_loss=0.276, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 980/6434 [2:18:04<13:09:10,  8.68s/it, gpt_loss=0.276, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 980/6434 [2:18:12<13:09:10,  8.68s/it, gpt_loss=0.292, loss_mean=0.266][A[A
+
+Train step of epoch 1:  15%|█▌        | 981/6434 [2:18:12<12:39:11,  8.35s/it, gpt_loss=0.292, loss_mean=0.266][A[A
+
+Train step of epoch 1:  15%|█▌        | 981/6434 [2:18:21<12:39:11,  8.35s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 982/6434 [2:18:21<12:47:02,  8.44s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 982/6434 [2:18:29<12:47:02,  8.44s/it, gpt_loss=0.216, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▌        | 983/6434 [2:18:29<12:50:49,  8.48s/it, gpt_loss=0.216, loss_mean=0.259][A[A
+
+Train step of epoch 1:  15%|█▌        | 983/6434 [2:18:38<12:50:49,  8.48s/it, gpt_loss=0.34, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  15%|█▌        | 984/6434 [2:18:38<12:57:31,  8.56s/it, gpt_loss=0.34, loss_mean=0.267][A[A
+
+Train step of epoch 1:  15%|█▌        | 984/6434 [2:18:46<12:57:31,  8.56s/it, gpt_loss=0.315, loss_mean=0.272][A[A
+
+Train step of epoch 1:  15%|█▌        | 985/6434 [2:18:46<12:52:55,  8.51s/it, gpt_loss=0.315, loss_mean=0.272][A[A
+[LID Router Debug] Step: 7420
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [3, 6, 6, 4, 0, 2, 1, 2, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+
+Train step of epoch 1:  15%|█▌        | 985/6434 [2:18:54<12:52:55,  8.51s/it, gpt_loss=0.266, loss_mean=0.272][A[A
+
+Train step of epoch 1:  15%|█▌        | 986/6434 [2:18:54<12:30:23,  8.26s/it, gpt_loss=0.266, loss_mean=0.272][A[A
+
+Train step of epoch 1:  15%|█▌        | 986/6434 [2:19:04<12:30:23,  8.26s/it, gpt_loss=0.205, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▌        | 987/6434 [2:19:04<13:09:18,  8.69s/it, gpt_loss=0.205, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▌        | 987/6434 [2:19:12<13:09:18,  8.69s/it, gpt_loss=0.268, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▌        | 988/6434 [2:19:12<12:47:49,  8.46s/it, gpt_loss=0.268, loss_mean=0.265][A[A
+
+Train step of epoch 1:  15%|█▌        | 988/6434 [2:19:21<12:47:49,  8.46s/it, gpt_loss=0.255, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 989/6434 [2:19:21<13:03:45,  8.64s/it, gpt_loss=0.255, loss_mean=0.264][A[A
+
+Train step of epoch 1:  15%|█▌        | 989/6434 [2:19:30<13:03:45,  8.64s/it, gpt_loss=0.236, loss_mean=0.261][A[A
+
+Train step of epoch 1:  15%|█▌        | 990/6434 [2:19:30<13:08:08,  8.69s/it, gpt_loss=0.236, loss_mean=0.261][A[A
+
+Train step of epoch 1:  15%|█▌        | 990/6434 [2:19:39<13:08:08,  8.69s/it, gpt_loss=0.322, loss_mean=0.267][A[A
+
+Train step of epoch 1:  15%|█▌        | 991/6434 [2:19:39<13:16:43,  8.78s/it, gpt_loss=0.322, loss_mean=0.267][A[A
+
+Train step of epoch 1:  15%|█▌        | 991/6434 [2:19:46<13:16:43,  8.78s/it, gpt_loss=0.269, loss_mean=0.268][A[A
+
+Train step of epoch 1:  15%|█▌        | 992/6434 [2:19:46<12:54:36,  8.54s/it, gpt_loss=0.269, loss_mean=0.268][A[A
+
+Train step of epoch 1:  15%|█▌        | 992/6434 [2:19:58<12:54:36,  8.54s/it, gpt_loss=0.337, loss_mean=0.274][A[A
+
+Train step of epoch 1:  15%|█▌        | 993/6434 [2:19:58<14:19:55,  9.48s/it, gpt_loss=0.337, loss_mean=0.274][A[A
+
+Train step of epoch 1:  15%|█▌        | 993/6434 [2:20:06<14:19:55,  9.48s/it, gpt_loss=0.29, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  15%|█▌        | 994/6434 [2:20:06<13:36:34,  9.01s/it, gpt_loss=0.29, loss_mean=0.276][A[A
+
+Train step of epoch 1:  15%|█▌        | 994/6434 [2:20:15<13:36:34,  9.01s/it, gpt_loss=0.379, loss_mean=0.286][A[A
+
+Train step of epoch 1:  15%|█▌        | 995/6434 [2:20:15<13:27:57,  8.91s/it, gpt_loss=0.379, loss_mean=0.286][A[A
+[LID Router Debug] Step: 7430
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [9, 9, 3, 4, 0, 6, 2, 2, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  15%|█▌        | 995/6434 [2:20:24<13:27:57,  8.91s/it, gpt_loss=0.215, loss_mean=0.279][A[A
+
+Train step of epoch 1:  15%|█▌        | 996/6434 [2:20:24<13:32:33,  8.97s/it, gpt_loss=0.215, loss_mean=0.279][A[A
+
+Train step of epoch 1:  15%|█▌        | 996/6434 [2:20:32<13:32:33,  8.97s/it, gpt_loss=0.34, loss_mean=0.285] [A[A
+
+Train step of epoch 1:  15%|█▌        | 997/6434 [2:20:32<13:09:32,  8.71s/it, gpt_loss=0.34, loss_mean=0.285][A[A
+
+Train step of epoch 1:  15%|█▌        | 997/6434 [2:20:41<13:09:32,  8.71s/it, gpt_loss=0.296, loss_mean=0.286][A[A
+
+Train step of epoch 1:  16%|█▌        | 998/6434 [2:20:41<13:09:01,  8.71s/it, gpt_loss=0.296, loss_mean=0.286][A[A
+
+Train step of epoch 1:  16%|█▌        | 998/6434 [2:20:50<13:09:01,  8.71s/it, gpt_loss=0.235, loss_mean=0.281][A[A
+
+Train step of epoch 1:  16%|█▌        | 999/6434 [2:20:50<13:13:37,  8.76s/it, gpt_loss=0.235, loss_mean=0.281][A[A
+
+Train step of epoch 1:  16%|█▌        | 999/6434 [2:20:57<13:13:37,  8.76s/it, gpt_loss=0.298, loss_mean=0.283][A[A
+
+Train step of epoch 1:  16%|█▌        | 1000/6434 [2:20:57<12:47:18,  8.47s/it, gpt_loss=0.298, loss_mean=0.283][A[A
+
+Train step of epoch 1:  16%|█▌        | 1000/6434 [2:21:06<12:47:18,  8.47s/it, gpt_loss=0.336, loss_mean=0.288][A[A
+
+Train step of epoch 1:  16%|█▌        | 1001/6434 [2:21:06<13:04:37,  8.67s/it, gpt_loss=0.336, loss_mean=0.288][A[A
+
+Train step of epoch 1:  16%|█▌        | 1001/6434 [2:21:15<13:04:37,  8.67s/it, gpt_loss=0.267, loss_mean=0.286][A[A
+
+Train step of epoch 1:  16%|█▌        | 1002/6434 [2:21:15<13:00:08,  8.62s/it, gpt_loss=0.267, loss_mean=0.286][A[A
+
+Train step of epoch 1:  16%|█▌        | 1002/6434 [2:21:25<13:00:08,  8.62s/it, gpt_loss=0.205, loss_mean=0.278][A[A
+
+Train step of epoch 1:  16%|█▌        | 1003/6434 [2:21:25<13:33:25,  8.99s/it, gpt_loss=0.205, loss_mean=0.278][A[A
+
+Train step of epoch 1:  16%|█▌        | 1003/6434 [2:21:34<13:33:25,  8.99s/it, gpt_loss=0.243, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1004/6434 [2:21:34<13:27:44,  8.93s/it, gpt_loss=0.243, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1004/6434 [2:21:42<13:27:44,  8.93s/it, gpt_loss=0.272, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▌        | 1005/6434 [2:21:42<13:10:46,  8.74s/it, gpt_loss=0.272, loss_mean=0.274][A[A
+[LID Router Debug] Step: 7440
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [3, 3, 9, 1, 1, 2, 5, 1, 0, 3]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  16%|█▌        | 1005/6434 [2:21:51<13:10:46,  8.74s/it, gpt_loss=0.275, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▌        | 1006/6434 [2:21:51<13:18:58,  8.83s/it, gpt_loss=0.275, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▌        | 1006/6434 [2:21:59<13:18:58,  8.83s/it, gpt_loss=0.239, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▌        | 1007/6434 [2:21:59<12:54:11,  8.56s/it, gpt_loss=0.239, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▌        | 1007/6434 [2:22:08<12:54:11,  8.56s/it, gpt_loss=0.223, loss_mean=0.266][A[A
+
+Train step of epoch 1:  16%|█▌        | 1008/6434 [2:22:08<13:02:38,  8.65s/it, gpt_loss=0.223, loss_mean=0.266][A[A
+
+Train step of epoch 1:  16%|█▌        | 1008/6434 [2:22:15<13:02:38,  8.65s/it, gpt_loss=0.251, loss_mean=0.265][A[A
+
+Train step of epoch 1:  16%|█▌        | 1009/6434 [2:22:15<12:35:00,  8.35s/it, gpt_loss=0.251, loss_mean=0.265][A[A
+
+Train step of epoch 1:  16%|█▌        | 1009/6434 [2:22:23<12:35:00,  8.35s/it, gpt_loss=0.228, loss_mean=0.261][A[A
+
+Train step of epoch 1:  16%|█▌        | 1010/6434 [2:22:23<12:24:20,  8.23s/it, gpt_loss=0.228, loss_mean=0.261][A[A
+
+Train step of epoch 1:  16%|█▌        | 1010/6434 [2:22:35<12:24:20,  8.23s/it, gpt_loss=0.257, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  16%|█▌        | 1011/6434 [2:22:35<13:46:47,  9.15s/it, gpt_loss=0.257, loss_mean=0.26][A[A
+
+Train step of epoch 1:  16%|█▌        | 1011/6434 [2:22:42<13:46:47,  9.15s/it, gpt_loss=0.291, loss_mean=0.264][A[A
+
+Train step of epoch 1:  16%|█▌        | 1012/6434 [2:22:42<13:04:07,  8.68s/it, gpt_loss=0.291, loss_mean=0.264][A[A
+
+Train step of epoch 1:  16%|█▌        | 1012/6434 [2:22:50<13:04:07,  8.68s/it, gpt_loss=0.411, loss_mean=0.278][A[A
+
+Train step of epoch 1:  16%|█▌        | 1013/6434 [2:22:50<12:47:54,  8.50s/it, gpt_loss=0.411, loss_mean=0.278][A[A
+
+Train step of epoch 1:  16%|█▌        | 1013/6434 [2:22:58<12:47:54,  8.50s/it, gpt_loss=0.249, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1014/6434 [2:22:58<12:33:41,  8.34s/it, gpt_loss=0.249, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1014/6434 [2:23:07<12:33:41,  8.34s/it, gpt_loss=0.282, loss_mean=0.276][A[A
+
+Train step of epoch 1:  16%|█▌        | 1015/6434 [2:23:07<12:43:44,  8.46s/it, gpt_loss=0.282, loss_mean=0.276][A[A
+[LID Router Debug] Step: 7450
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [4, 5, 5, 3, 0, 4, 9, 9, 9, 4]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  16%|█▌        | 1015/6434 [2:23:16<12:43:44,  8.46s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1016/6434 [2:23:16<12:53:32,  8.57s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1016/6434 [2:23:24<12:53:32,  8.57s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1017/6434 [2:23:24<12:37:33,  8.39s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1017/6434 [2:23:33<12:37:33,  8.39s/it, gpt_loss=0.201, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1018/6434 [2:23:33<13:00:28,  8.65s/it, gpt_loss=0.201, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1018/6434 [2:23:41<13:00:28,  8.65s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  16%|█▌        | 1019/6434 [2:23:41<12:50:37,  8.54s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  16%|█▌        | 1019/6434 [2:23:50<12:50:37,  8.54s/it, gpt_loss=0.237, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  16%|█▌        | 1020/6434 [2:23:50<12:55:47,  8.60s/it, gpt_loss=0.237, loss_mean=0.26][A[A
+
+Train step of epoch 1:  16%|█▌        | 1020/6434 [2:23:59<12:55:47,  8.60s/it, gpt_loss=0.302, loss_mean=0.265][A[A
+
+Train step of epoch 1:  16%|█▌        | 1021/6434 [2:23:59<12:54:58,  8.59s/it, gpt_loss=0.302, loss_mean=0.265][A[A
+
+Train step of epoch 1:  16%|█▌        | 1021/6434 [2:24:07<12:54:58,  8.59s/it, gpt_loss=0.279, loss_mean=0.266][A[A
+
+Train step of epoch 1:  16%|█▌        | 1022/6434 [2:24:07<12:55:41,  8.60s/it, gpt_loss=0.279, loss_mean=0.266][A[A
+
+Train step of epoch 1:  16%|█▌        | 1022/6434 [2:24:16<12:55:41,  8.60s/it, gpt_loss=0.334, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▌        | 1023/6434 [2:24:16<12:55:16,  8.60s/it, gpt_loss=0.334, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▌        | 1023/6434 [2:24:25<12:55:16,  8.60s/it, gpt_loss=0.2, loss_mean=0.266]  [A[A
+
+Train step of epoch 1:  16%|█▌        | 1024/6434 [2:24:25<12:56:49,  8.62s/it, gpt_loss=0.2, loss_mean=0.266][A[A
+
+Train step of epoch 1:  16%|█▌        | 1024/6434 [2:24:32<12:56:49,  8.62s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+
+Train step of epoch 1:  16%|█▌        | 1025/6434 [2:24:32<12:39:16,  8.42s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+[LID Router Debug] Step: 7460
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [0, 0, 4, 2, 1, 2, 6, 1, 9, 9]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+
+Train step of epoch 1:  16%|█▌        | 1025/6434 [2:24:41<12:39:16,  8.42s/it, gpt_loss=0.27, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  16%|█▌        | 1026/6434 [2:24:41<12:41:41,  8.45s/it, gpt_loss=0.27, loss_mean=0.264][A[A
+
+Train step of epoch 1:  16%|█▌        | 1026/6434 [2:24:51<12:41:41,  8.45s/it, gpt_loss=0.339, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▌        | 1027/6434 [2:24:51<13:14:19,  8.81s/it, gpt_loss=0.339, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▌        | 1027/6434 [2:25:00<13:14:19,  8.81s/it, gpt_loss=0.306, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1028/6434 [2:25:00<13:18:17,  8.86s/it, gpt_loss=0.306, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1028/6434 [2:25:08<13:18:17,  8.86s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1029/6434 [2:25:08<13:11:49,  8.79s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1029/6434 [2:25:16<13:11:49,  8.79s/it, gpt_loss=0.275, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1030/6434 [2:25:16<12:34:39,  8.38s/it, gpt_loss=0.275, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▌        | 1030/6434 [2:25:25<12:34:39,  8.38s/it, gpt_loss=0.259, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▌        | 1031/6434 [2:25:25<12:52:56,  8.58s/it, gpt_loss=0.259, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▌        | 1031/6434 [2:25:33<12:52:56,  8.58s/it, gpt_loss=0.31, loss_mean=0.277] [A[A
+
+Train step of epoch 1:  16%|█▌        | 1032/6434 [2:25:33<12:43:53,  8.48s/it, gpt_loss=0.31, loss_mean=0.277][A[A
+
+Train step of epoch 1:  16%|█▌        | 1032/6434 [2:25:43<12:43:53,  8.48s/it, gpt_loss=0.211, loss_mean=0.27][A[A
+
+Train step of epoch 1:  16%|█▌        | 1033/6434 [2:25:43<13:23:58,  8.93s/it, gpt_loss=0.211, loss_mean=0.27][A[A
+
+Train step of epoch 1:  16%|█▌        | 1033/6434 [2:25:53<13:23:58,  8.93s/it, gpt_loss=0.249, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1034/6434 [2:25:53<13:51:56,  9.24s/it, gpt_loss=0.249, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1034/6434 [2:26:02<13:51:56,  9.24s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1035/6434 [2:26:02<13:37:01,  9.08s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+[LID Router Debug] Step: 7470
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [4, 1, 9, 5, 2, 3, 9, 3, 9, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  16%|█▌        | 1035/6434 [2:26:10<13:37:01,  9.08s/it, gpt_loss=0.236, loss_mean=0.264][A[A
+
+Train step of epoch 1:  16%|█▌        | 1036/6434 [2:26:10<13:26:19,  8.96s/it, gpt_loss=0.236, loss_mean=0.264][A[A
+
+Train step of epoch 1:  16%|█▌        | 1036/6434 [2:26:19<13:26:19,  8.96s/it, gpt_loss=0.251, loss_mean=0.263][A[A
+
+Train step of epoch 1:  16%|█▌        | 1037/6434 [2:26:19<13:07:22,  8.75s/it, gpt_loss=0.251, loss_mean=0.263][A[A
+
+Train step of epoch 1:  16%|█▌        | 1037/6434 [2:26:28<13:07:22,  8.75s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  16%|█▌        | 1038/6434 [2:26:28<13:29:56,  9.01s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  16%|█▌        | 1038/6434 [2:26:37<13:29:56,  9.01s/it, gpt_loss=0.329, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1039/6434 [2:26:37<13:17:47,  8.87s/it, gpt_loss=0.329, loss_mean=0.268][A[A
+
+Train step of epoch 1:  16%|█▌        | 1039/6434 [2:26:45<13:17:47,  8.87s/it, gpt_loss=0.329, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▌        | 1040/6434 [2:26:45<13:13:44,  8.83s/it, gpt_loss=0.329, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▌        | 1040/6434 [2:26:55<13:13:44,  8.83s/it, gpt_loss=0.248, loss_mean=0.272][A[A
+
+Train step of epoch 1:  16%|█▌        | 1041/6434 [2:26:55<13:20:19,  8.90s/it, gpt_loss=0.248, loss_mean=0.272][A[A
+
+Train step of epoch 1:  16%|█▌        | 1041/6434 [2:27:03<13:20:19,  8.90s/it, gpt_loss=0.347, loss_mean=0.279][A[A
+
+Train step of epoch 1:  16%|█▌        | 1042/6434 [2:27:03<13:08:34,  8.77s/it, gpt_loss=0.347, loss_mean=0.279][A[A
+
+Train step of epoch 1:  16%|█▌        | 1042/6434 [2:27:11<13:08:34,  8.77s/it, gpt_loss=0.3, loss_mean=0.281]  [A[A
+
+Train step of epoch 1:  16%|█▌        | 1043/6434 [2:27:11<12:49:05,  8.56s/it, gpt_loss=0.3, loss_mean=0.281][A[A
+
+Train step of epoch 1:  16%|█▌        | 1043/6434 [2:27:21<12:49:05,  8.56s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+
+Train step of epoch 1:  16%|█▌        | 1044/6434 [2:27:21<13:12:23,  8.82s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+
+Train step of epoch 1:  16%|█▌        | 1044/6434 [2:27:29<13:12:23,  8.82s/it, gpt_loss=0.288, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  16%|█▌        | 1045/6434 [2:27:29<12:57:12,  8.65s/it, gpt_loss=0.288, loss_mean=0.28][A[A
+[LID Router Debug] Step: 7480
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [1, 4, 5, 2, 4, 5, 5, 0, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  16%|█▌        | 1045/6434 [2:27:37<12:57:12,  8.65s/it, gpt_loss=0.23, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▋        | 1046/6434 [2:27:37<12:44:25,  8.51s/it, gpt_loss=0.23, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▋        | 1046/6434 [2:27:46<12:44:25,  8.51s/it, gpt_loss=0.254, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1047/6434 [2:27:46<13:11:41,  8.82s/it, gpt_loss=0.254, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1047/6434 [2:27:54<13:11:41,  8.82s/it, gpt_loss=0.364, loss_mean=0.282][A[A
+
+Train step of epoch 1:  16%|█▋        | 1048/6434 [2:27:54<12:44:55,  8.52s/it, gpt_loss=0.364, loss_mean=0.282][A[A
+
+Train step of epoch 1:  16%|█▋        | 1048/6434 [2:28:03<12:44:55,  8.52s/it, gpt_loss=0.27, loss_mean=0.281] [A[A
+
+Train step of epoch 1:  16%|█▋        | 1049/6434 [2:28:03<12:46:49,  8.54s/it, gpt_loss=0.27, loss_mean=0.281][A[A
+
+Train step of epoch 1:  16%|█▋        | 1049/6434 [2:28:11<12:46:49,  8.54s/it, gpt_loss=0.219, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▋        | 1050/6434 [2:28:11<12:44:15,  8.52s/it, gpt_loss=0.219, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▋        | 1050/6434 [2:28:19<12:44:15,  8.52s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1051/6434 [2:28:19<12:32:46,  8.39s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1051/6434 [2:28:28<12:32:46,  8.39s/it, gpt_loss=0.288, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1052/6434 [2:28:28<12:41:51,  8.49s/it, gpt_loss=0.288, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1052/6434 [2:28:38<12:41:51,  8.49s/it, gpt_loss=0.291, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▋        | 1053/6434 [2:28:38<13:07:22,  8.78s/it, gpt_loss=0.291, loss_mean=0.275][A[A
+
+Train step of epoch 1:  16%|█▋        | 1053/6434 [2:28:45<13:07:22,  8.78s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1054/6434 [2:28:45<12:36:03,  8.43s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1054/6434 [2:28:55<12:36:03,  8.43s/it, gpt_loss=0.272, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1055/6434 [2:28:55<13:01:08,  8.71s/it, gpt_loss=0.272, loss_mean=0.271][A[A
+[LID Router Debug] Step: 7490
+Batch Size: 10
+Audio Batch Size: 76
+LID Assignments: [9, 5, 0, 5, 2, 5, 5, 5, 4, 9]
+Active Experts in Batch: {0, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  16%|█▋        | 1055/6434 [2:29:04<13:01:08,  8.71s/it, gpt_loss=0.266, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1056/6434 [2:29:04<13:09:19,  8.81s/it, gpt_loss=0.266, loss_mean=0.271][A[A
+
+Train step of epoch 1:  16%|█▋        | 1056/6434 [2:29:12<13:09:19,  8.81s/it, gpt_loss=0.279, loss_mean=0.272][A[A
+
+Train step of epoch 1:  16%|█▋        | 1057/6434 [2:29:12<12:55:02,  8.65s/it, gpt_loss=0.279, loss_mean=0.272][A[A
+
+Train step of epoch 1:  16%|█▋        | 1057/6434 [2:29:21<12:55:02,  8.65s/it, gpt_loss=0.254, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  16%|█▋        | 1058/6434 [2:29:21<12:52:29,  8.62s/it, gpt_loss=0.254, loss_mean=0.27][A[A
+
+Train step of epoch 1:  16%|█▋        | 1058/6434 [2:29:30<12:52:29,  8.62s/it, gpt_loss=0.188, loss_mean=0.262][A[A
+
+Train step of epoch 1:  16%|█▋        | 1059/6434 [2:29:30<13:24:05,  8.98s/it, gpt_loss=0.188, loss_mean=0.262][A[A
+
+Train step of epoch 1:  16%|█▋        | 1059/6434 [2:29:40<13:24:05,  8.98s/it, gpt_loss=0.385, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▋        | 1060/6434 [2:29:40<13:54:39,  9.32s/it, gpt_loss=0.385, loss_mean=0.274][A[A
+
+Train step of epoch 1:  16%|█▋        | 1060/6434 [2:29:48<13:54:39,  9.32s/it, gpt_loss=0.263, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1061/6434 [2:29:48<13:02:51,  8.74s/it, gpt_loss=0.263, loss_mean=0.273][A[A
+
+Train step of epoch 1:  16%|█▋        | 1061/6434 [2:29:57<13:02:51,  8.74s/it, gpt_loss=0.291, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1062/6434 [2:29:57<13:25:04,  8.99s/it, gpt_loss=0.291, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1062/6434 [2:30:06<13:25:04,  8.99s/it, gpt_loss=0.218, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1063/6434 [2:30:06<13:27:45,  9.02s/it, gpt_loss=0.218, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1063/6434 [2:30:15<13:27:45,  9.02s/it, gpt_loss=0.249, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1064/6434 [2:30:15<13:07:14,  8.80s/it, gpt_loss=0.249, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1064/6434 [2:30:22<13:07:14,  8.80s/it, gpt_loss=0.324, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1065/6434 [2:30:22<12:38:05,  8.47s/it, gpt_loss=0.324, loss_mean=0.273][A[A
+[LID Router Debug] Step: 7500
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 2, 1, 5, 9, 9, 6, 9, 0, 2]
+Active Experts in Batch: {0, 1, 2, 5, 6, 9}
+
+
+Train step of epoch 1:  17%|█▋        | 1065/6434 [2:30:31<12:38:05,  8.47s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1066/6434 [2:30:31<12:30:25,  8.39s/it, gpt_loss=0.252, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1066/6434 [2:30:38<12:30:25,  8.39s/it, gpt_loss=0.281, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1067/6434 [2:30:38<12:03:24,  8.09s/it, gpt_loss=0.281, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1067/6434 [2:30:48<12:03:24,  8.09s/it, gpt_loss=0.238, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1068/6434 [2:30:48<12:45:46,  8.56s/it, gpt_loss=0.238, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1068/6434 [2:30:56<12:45:46,  8.56s/it, gpt_loss=0.254, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1069/6434 [2:30:56<12:36:00,  8.45s/it, gpt_loss=0.254, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1069/6434 [2:31:04<12:36:00,  8.45s/it, gpt_loss=0.253, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1070/6434 [2:31:04<12:20:53,  8.29s/it, gpt_loss=0.253, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1070/6434 [2:31:13<12:20:53,  8.29s/it, gpt_loss=0.28, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1071/6434 [2:31:13<12:36:23,  8.46s/it, gpt_loss=0.28, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1071/6434 [2:31:21<12:36:23,  8.46s/it, gpt_loss=0.291, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1072/6434 [2:31:21<12:27:35,  8.37s/it, gpt_loss=0.291, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1072/6434 [2:31:29<12:27:35,  8.37s/it, gpt_loss=0.31, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1073/6434 [2:31:29<12:31:14,  8.41s/it, gpt_loss=0.31, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1073/6434 [2:31:37<12:31:14,  8.41s/it, gpt_loss=0.283, loss_mean=0.274][A[A
+
+Train step of epoch 1:  17%|█▋        | 1074/6434 [2:31:37<12:21:42,  8.30s/it, gpt_loss=0.283, loss_mean=0.274][A[A
+
+Train step of epoch 1:  17%|█▋        | 1074/6434 [2:31:45<12:21:42,  8.30s/it, gpt_loss=0.299, loss_mean=0.277][A[A
+
+Train step of epoch 1:  17%|█▋        | 1075/6434 [2:31:45<12:15:36,  8.24s/it, gpt_loss=0.299, loss_mean=0.277][A[A
+[LID Router Debug] Step: 7510
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [3, 9, 3, 1, 3, 4, 1, 1, 5, 4]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  17%|█▋        | 1075/6434 [2:31:54<12:15:36,  8.24s/it, gpt_loss=0.26, loss_mean=0.275] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1076/6434 [2:31:54<12:28:30,  8.38s/it, gpt_loss=0.26, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1076/6434 [2:32:03<12:28:30,  8.38s/it, gpt_loss=0.29, loss_mean=0.277][A[A
+
+Train step of epoch 1:  17%|█▋        | 1077/6434 [2:32:03<12:26:39,  8.36s/it, gpt_loss=0.29, loss_mean=0.277][A[A
+
+Train step of epoch 1:  17%|█▋        | 1077/6434 [2:32:10<12:26:39,  8.36s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  17%|█▋        | 1078/6434 [2:32:10<12:12:56,  8.21s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  17%|█▋        | 1078/6434 [2:32:18<12:12:56,  8.21s/it, gpt_loss=0.287, loss_mean=0.279][A[A
+
+Train step of epoch 1:  17%|█▋        | 1079/6434 [2:32:18<11:48:52,  7.94s/it, gpt_loss=0.287, loss_mean=0.279][A[A
+
+Train step of epoch 1:  17%|█▋        | 1079/6434 [2:32:25<11:48:52,  7.94s/it, gpt_loss=0.323, loss_mean=0.283][A[A
+
+Train step of epoch 1:  17%|█▋        | 1080/6434 [2:32:25<11:35:21,  7.79s/it, gpt_loss=0.323, loss_mean=0.283][A[A
+
+Train step of epoch 1:  17%|█▋        | 1080/6434 [2:32:33<11:35:21,  7.79s/it, gpt_loss=0.227, loss_mean=0.277][A[A
+
+Train step of epoch 1:  17%|█▋        | 1081/6434 [2:32:33<11:45:54,  7.91s/it, gpt_loss=0.227, loss_mean=0.277][A[A
+
+Train step of epoch 1:  17%|█▋        | 1081/6434 [2:32:41<11:45:54,  7.91s/it, gpt_loss=0.231, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1082/6434 [2:32:41<11:49:27,  7.95s/it, gpt_loss=0.231, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1082/6434 [2:32:51<11:49:27,  7.95s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1083/6434 [2:32:51<12:38:36,  8.51s/it, gpt_loss=0.297, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1083/6434 [2:32:58<12:38:36,  8.51s/it, gpt_loss=0.242, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1084/6434 [2:32:58<12:00:34,  8.08s/it, gpt_loss=0.242, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1084/6434 [2:33:06<12:00:34,  8.08s/it, gpt_loss=0.252, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1085/6434 [2:33:06<11:57:07,  8.04s/it, gpt_loss=0.252, loss_mean=0.27][A[A
+[LID Router Debug] Step: 7520
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [3, 4, 6, 4, 1, 2, 1, 2, 5, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  17%|█▋        | 1085/6434 [2:33:14<11:57:07,  8.04s/it, gpt_loss=0.218, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1086/6434 [2:33:14<11:43:07,  7.89s/it, gpt_loss=0.218, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1086/6434 [2:33:23<11:43:07,  7.89s/it, gpt_loss=0.209, loss_mean=0.259][A[A
+
+Train step of epoch 1:  17%|█▋        | 1087/6434 [2:33:23<12:31:39,  8.43s/it, gpt_loss=0.209, loss_mean=0.259][A[A
+
+Train step of epoch 1:  17%|█▋        | 1087/6434 [2:33:31<12:31:39,  8.43s/it, gpt_loss=0.282, loss_mean=0.261][A[A
+
+Train step of epoch 1:  17%|█▋        | 1088/6434 [2:33:31<12:03:54,  8.12s/it, gpt_loss=0.282, loss_mean=0.261][A[A
+
+Train step of epoch 1:  17%|█▋        | 1088/6434 [2:33:39<12:03:54,  8.12s/it, gpt_loss=0.267, loss_mean=0.262][A[A
+
+Train step of epoch 1:  17%|█▋        | 1089/6434 [2:33:39<12:12:26,  8.22s/it, gpt_loss=0.267, loss_mean=0.262][A[A
+
+Train step of epoch 1:  17%|█▋        | 1089/6434 [2:33:48<12:12:26,  8.22s/it, gpt_loss=0.263, loss_mean=0.262][A[A
+
+Train step of epoch 1:  17%|█▋        | 1090/6434 [2:33:48<12:33:55,  8.46s/it, gpt_loss=0.263, loss_mean=0.262][A[A
+
+Train step of epoch 1:  17%|█▋        | 1090/6434 [2:33:57<12:33:55,  8.46s/it, gpt_loss=0.197, loss_mean=0.256][A[A
+
+Train step of epoch 1:  17%|█▋        | 1091/6434 [2:33:57<12:28:38,  8.41s/it, gpt_loss=0.197, loss_mean=0.256][A[A
+
+Train step of epoch 1:  17%|█▋        | 1091/6434 [2:34:04<12:28:38,  8.41s/it, gpt_loss=0.281, loss_mean=0.258][A[A
+
+Train step of epoch 1:  17%|█▋        | 1092/6434 [2:34:04<12:14:45,  8.25s/it, gpt_loss=0.281, loss_mean=0.258][A[A
+
+Train step of epoch 1:  17%|█▋        | 1092/6434 [2:34:12<12:14:45,  8.25s/it, gpt_loss=0.278, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1093/6434 [2:34:12<12:00:23,  8.09s/it, gpt_loss=0.278, loss_mean=0.26][A[A
+
+Train step of epoch 1:  17%|█▋        | 1093/6434 [2:34:20<12:00:23,  8.09s/it, gpt_loss=0.238, loss_mean=0.258][A[A
+
+Train step of epoch 1:  17%|█▋        | 1094/6434 [2:34:20<11:46:08,  7.93s/it, gpt_loss=0.238, loss_mean=0.258][A[A
+
+Train step of epoch 1:  17%|█▋        | 1094/6434 [2:34:28<11:46:08,  7.93s/it, gpt_loss=0.28, loss_mean=0.26]  [A[A
+
+Train step of epoch 1:  17%|█▋        | 1095/6434 [2:34:28<11:55:31,  8.04s/it, gpt_loss=0.28, loss_mean=0.26][A[A
+[LID Router Debug] Step: 7530
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [0, 1, 3, 5, 9, 4, 0, 3, 3, 0]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  17%|█▋        | 1095/6434 [2:34:38<11:55:31,  8.04s/it, gpt_loss=0.231, loss_mean=0.257][A[A
+
+Train step of epoch 1:  17%|█▋        | 1096/6434 [2:34:38<12:37:19,  8.51s/it, gpt_loss=0.231, loss_mean=0.257][A[A
+
+Train step of epoch 1:  17%|█▋        | 1096/6434 [2:34:46<12:37:19,  8.51s/it, gpt_loss=0.282, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1097/6434 [2:34:46<12:37:52,  8.52s/it, gpt_loss=0.282, loss_mean=0.26][A[A
+
+Train step of epoch 1:  17%|█▋        | 1097/6434 [2:34:55<12:37:52,  8.52s/it, gpt_loss=0.313, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1098/6434 [2:34:55<12:32:20,  8.46s/it, gpt_loss=0.313, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1098/6434 [2:35:03<12:32:20,  8.46s/it, gpt_loss=0.297, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1099/6434 [2:35:03<12:28:29,  8.42s/it, gpt_loss=0.297, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1099/6434 [2:35:11<12:28:29,  8.42s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1100/6434 [2:35:11<12:24:31,  8.37s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  17%|█▋        | 1100/6434 [2:35:20<12:24:31,  8.37s/it, gpt_loss=0.29, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1101/6434 [2:35:20<12:30:08,  8.44s/it, gpt_loss=0.29, loss_mean=0.274][A[A
+
+Train step of epoch 1:  17%|█▋        | 1101/6434 [2:35:28<12:30:08,  8.44s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1102/6434 [2:35:28<12:23:59,  8.37s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1102/6434 [2:35:37<12:23:59,  8.37s/it, gpt_loss=0.272, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1103/6434 [2:35:37<12:41:11,  8.57s/it, gpt_loss=0.272, loss_mean=0.273][A[A
+
+Train step of epoch 1:  17%|█▋        | 1103/6434 [2:35:45<12:41:11,  8.57s/it, gpt_loss=0.244, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1104/6434 [2:35:45<12:29:02,  8.43s/it, gpt_loss=0.244, loss_mean=0.27][A[A
+
+Train step of epoch 1:  17%|█▋        | 1104/6434 [2:35:54<12:29:02,  8.43s/it, gpt_loss=0.246, loss_mean=0.267][A[A
+
+Train step of epoch 1:  17%|█▋        | 1105/6434 [2:35:54<12:31:04,  8.46s/it, gpt_loss=0.246, loss_mean=0.267][A[A
+[LID Router Debug] Step: 7540
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [2, 2, 4, 3, 4, 0, 0, 6, 5, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  17%|█▋        | 1105/6434 [2:36:03<12:31:04,  8.46s/it, gpt_loss=0.227, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1106/6434 [2:36:03<12:50:11,  8.67s/it, gpt_loss=0.227, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1106/6434 [2:36:12<12:50:11,  8.67s/it, gpt_loss=0.312, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1107/6434 [2:36:12<12:53:59,  8.72s/it, gpt_loss=0.312, loss_mean=0.268][A[A
+
+Train step of epoch 1:  17%|█▋        | 1107/6434 [2:36:19<12:53:59,  8.72s/it, gpt_loss=0.342, loss_mean=0.276][A[A
+
+Train step of epoch 1:  17%|█▋        | 1108/6434 [2:36:19<12:29:30,  8.44s/it, gpt_loss=0.342, loss_mean=0.276][A[A
+
+Train step of epoch 1:  17%|█▋        | 1108/6434 [2:36:29<12:29:30,  8.44s/it, gpt_loss=0.3, loss_mean=0.278]  [A[A
+
+Train step of epoch 1:  17%|█▋        | 1109/6434 [2:36:29<13:02:36,  8.82s/it, gpt_loss=0.3, loss_mean=0.278][A[A
+
+Train step of epoch 1:  17%|█▋        | 1109/6434 [2:36:38<13:02:36,  8.82s/it, gpt_loss=0.255, loss_mean=0.276][A[A
+
+Train step of epoch 1:  17%|█▋        | 1110/6434 [2:36:38<13:08:59,  8.89s/it, gpt_loss=0.255, loss_mean=0.276][A[A
+
+Train step of epoch 1:  17%|█▋        | 1110/6434 [2:36:47<13:08:59,  8.89s/it, gpt_loss=0.267, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1111/6434 [2:36:47<12:56:52,  8.76s/it, gpt_loss=0.267, loss_mean=0.275][A[A
+
+Train step of epoch 1:  17%|█▋        | 1111/6434 [2:36:54<12:56:52,  8.76s/it, gpt_loss=0.211, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1112/6434 [2:36:54<12:28:43,  8.44s/it, gpt_loss=0.211, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1112/6434 [2:37:04<12:28:43,  8.44s/it, gpt_loss=0.209, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1113/6434 [2:37:04<12:58:08,  8.77s/it, gpt_loss=0.209, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1113/6434 [2:37:12<12:58:08,  8.77s/it, gpt_loss=0.268, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1114/6434 [2:37:12<12:45:04,  8.63s/it, gpt_loss=0.268, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1114/6434 [2:37:20<12:45:04,  8.63s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1115/6434 [2:37:20<12:25:35,  8.41s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+[LID Router Debug] Step: 7550
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [4, 0, 4, 2, 1, 0, 2, 2, 6, 9]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+
+Train step of epoch 1:  17%|█▋        | 1115/6434 [2:37:28<12:25:35,  8.41s/it, gpt_loss=0.274, loss_mean=0.266][A[A
+
+Train step of epoch 1:  17%|█▋        | 1116/6434 [2:37:28<12:23:35,  8.39s/it, gpt_loss=0.274, loss_mean=0.266][A[A
+
+Train step of epoch 1:  17%|█▋        | 1116/6434 [2:37:36<12:23:35,  8.39s/it, gpt_loss=0.3, loss_mean=0.269]  [A[A
+
+Train step of epoch 1:  17%|█▋        | 1117/6434 [2:37:36<12:11:20,  8.25s/it, gpt_loss=0.3, loss_mean=0.269][A[A
+
+Train step of epoch 1:  17%|█▋        | 1117/6434 [2:37:46<12:11:20,  8.25s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1118/6434 [2:37:46<12:36:55,  8.54s/it, gpt_loss=0.284, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1118/6434 [2:37:53<12:36:55,  8.54s/it, gpt_loss=0.271, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1119/6434 [2:37:53<12:12:46,  8.27s/it, gpt_loss=0.271, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1119/6434 [2:38:02<12:12:46,  8.27s/it, gpt_loss=0.22, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1120/6434 [2:38:02<12:16:54,  8.32s/it, gpt_loss=0.22, loss_mean=0.266][A[A
+
+Train step of epoch 1:  17%|█▋        | 1120/6434 [2:38:10<12:16:54,  8.32s/it, gpt_loss=0.256, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1121/6434 [2:38:10<12:10:30,  8.25s/it, gpt_loss=0.256, loss_mean=0.265][A[A
+
+Train step of epoch 1:  17%|█▋        | 1121/6434 [2:38:18<12:10:30,  8.25s/it, gpt_loss=0.321, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1122/6434 [2:38:18<12:20:05,  8.36s/it, gpt_loss=0.321, loss_mean=0.271][A[A
+
+Train step of epoch 1:  17%|█▋        | 1122/6434 [2:38:27<12:20:05,  8.36s/it, gpt_loss=0.196, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1123/6434 [2:38:27<12:34:53,  8.53s/it, gpt_loss=0.196, loss_mean=0.263][A[A
+
+Train step of epoch 1:  17%|█▋        | 1123/6434 [2:38:36<12:34:53,  8.53s/it, gpt_loss=0.331, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  17%|█▋        | 1124/6434 [2:38:36<12:43:11,  8.62s/it, gpt_loss=0.331, loss_mean=0.27][A[A
+
+Train step of epoch 1:  17%|█▋        | 1124/6434 [2:38:44<12:43:11,  8.62s/it, gpt_loss=0.272, loss_mean=0.27][A[A
+
+Train step of epoch 1:  17%|█▋        | 1125/6434 [2:38:44<12:21:52,  8.38s/it, gpt_loss=0.272, loss_mean=0.27][A[A
+[LID Router Debug] Step: 7560
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [9, 2, 3, 0, 9, 9, 6, 2, 5, 9]
+Active Experts in Batch: {0, 2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  17%|█▋        | 1125/6434 [2:38:53<12:21:52,  8.38s/it, gpt_loss=0.258, loss_mean=0.269][A[A
+
+Train step of epoch 1:  18%|█▊        | 1126/6434 [2:38:53<12:28:33,  8.46s/it, gpt_loss=0.258, loss_mean=0.269][A[A
+
+Train step of epoch 1:  18%|█▊        | 1126/6434 [2:39:01<12:28:33,  8.46s/it, gpt_loss=0.295, loss_mean=0.271][A[A
+
+Train step of epoch 1:  18%|█▊        | 1127/6434 [2:39:01<12:36:11,  8.55s/it, gpt_loss=0.295, loss_mean=0.271][A[A
+
+Train step of epoch 1:  18%|█▊        | 1127/6434 [2:39:10<12:36:11,  8.55s/it, gpt_loss=0.269, loss_mean=0.271][A[A
+
+Train step of epoch 1:  18%|█▊        | 1128/6434 [2:39:10<12:43:11,  8.63s/it, gpt_loss=0.269, loss_mean=0.271][A[A
+
+Train step of epoch 1:  18%|█▊        | 1128/6434 [2:39:19<12:43:11,  8.63s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1129/6434 [2:39:19<12:37:01,  8.56s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1129/6434 [2:39:27<12:37:01,  8.56s/it, gpt_loss=0.207, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1130/6434 [2:39:27<12:43:44,  8.64s/it, gpt_loss=0.207, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1130/6434 [2:39:36<12:43:44,  8.64s/it, gpt_loss=0.264, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1131/6434 [2:39:36<12:37:58,  8.58s/it, gpt_loss=0.264, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1131/6434 [2:39:45<12:37:58,  8.58s/it, gpt_loss=0.231, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1132/6434 [2:39:45<12:53:41,  8.76s/it, gpt_loss=0.231, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1132/6434 [2:39:53<12:53:41,  8.76s/it, gpt_loss=0.295, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1133/6434 [2:39:53<12:30:18,  8.49s/it, gpt_loss=0.295, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1133/6434 [2:40:01<12:30:18,  8.49s/it, gpt_loss=0.234, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1134/6434 [2:40:01<12:22:00,  8.40s/it, gpt_loss=0.234, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1134/6434 [2:40:10<12:22:00,  8.40s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+
+Train step of epoch 1:  18%|█▊        | 1135/6434 [2:40:10<12:27:34,  8.46s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+[LID Router Debug] Step: 7570
+Batch Size: 10
+Audio Batch Size: 118
+LID Assignments: [3, 5, 3, 5, 0, 6, 0, 9, 5, 4]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  18%|█▊        | 1135/6434 [2:40:18<12:27:34,  8.46s/it, gpt_loss=0.321, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1136/6434 [2:40:18<12:17:39,  8.35s/it, gpt_loss=0.321, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1136/6434 [2:40:25<12:17:39,  8.35s/it, gpt_loss=0.222, loss_mean=0.261][A[A
+
+Train step of epoch 1:  18%|█▊        | 1137/6434 [2:40:25<11:53:47,  8.09s/it, gpt_loss=0.222, loss_mean=0.261][A[A
+
+Train step of epoch 1:  18%|█▊        | 1137/6434 [2:40:33<11:53:47,  8.09s/it, gpt_loss=0.218, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1138/6434 [2:40:33<11:56:27,  8.12s/it, gpt_loss=0.218, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1138/6434 [2:40:42<11:56:27,  8.12s/it, gpt_loss=0.229, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1139/6434 [2:40:42<12:13:11,  8.31s/it, gpt_loss=0.229, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1139/6434 [2:40:52<12:13:11,  8.31s/it, gpt_loss=0.264, loss_mean=0.255][A[A
+
+Train step of epoch 1:  18%|█▊        | 1140/6434 [2:40:52<12:45:52,  8.68s/it, gpt_loss=0.264, loss_mean=0.255][A[A
+
+Train step of epoch 1:  18%|█▊        | 1140/6434 [2:41:01<12:45:52,  8.68s/it, gpt_loss=0.311, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1141/6434 [2:41:01<13:07:27,  8.93s/it, gpt_loss=0.311, loss_mean=0.26][A[A
+
+Train step of epoch 1:  18%|█▊        | 1141/6434 [2:41:10<13:07:27,  8.93s/it, gpt_loss=0.22, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1142/6434 [2:41:10<13:17:20,  9.04s/it, gpt_loss=0.22, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1142/6434 [2:41:20<13:17:20,  9.04s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  18%|█▊        | 1143/6434 [2:41:20<13:39:51,  9.30s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  18%|█▊        | 1143/6434 [2:41:30<13:39:51,  9.30s/it, gpt_loss=0.226, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1144/6434 [2:41:30<13:53:57,  9.46s/it, gpt_loss=0.226, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1144/6434 [2:41:39<13:53:57,  9.46s/it, gpt_loss=0.24, loss_mean=0.254] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1145/6434 [2:41:39<13:44:29,  9.35s/it, gpt_loss=0.24, loss_mean=0.254][A[A
+[LID Router Debug] Step: 7580
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [9, 0, 6, 9, 9, 5, 9, 3, 9, 9]
+Active Experts in Batch: {0, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  18%|█▊        | 1145/6434 [2:41:48<13:44:29,  9.35s/it, gpt_loss=0.313, loss_mean=0.26][A[A
+
+Train step of epoch 1:  18%|█▊        | 1146/6434 [2:41:48<13:24:39,  9.13s/it, gpt_loss=0.313, loss_mean=0.26][A[A
+
+Train step of epoch 1:  18%|█▊        | 1146/6434 [2:41:55<13:24:39,  9.13s/it, gpt_loss=0.227, loss_mean=0.257][A[A
+
+Train step of epoch 1:  18%|█▊        | 1147/6434 [2:41:55<12:41:13,  8.64s/it, gpt_loss=0.227, loss_mean=0.257][A[A
+
+Train step of epoch 1:  18%|█▊        | 1147/6434 [2:42:03<12:41:13,  8.64s/it, gpt_loss=0.285, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1148/6434 [2:42:03<12:05:24,  8.23s/it, gpt_loss=0.285, loss_mean=0.26][A[A
+
+Train step of epoch 1:  18%|█▊        | 1148/6434 [2:42:11<12:05:24,  8.23s/it, gpt_loss=0.313, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1149/6434 [2:42:11<12:16:01,  8.36s/it, gpt_loss=0.313, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1149/6434 [2:42:20<12:16:01,  8.36s/it, gpt_loss=0.233, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1150/6434 [2:42:20<12:30:35,  8.52s/it, gpt_loss=0.233, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1150/6434 [2:42:30<12:30:35,  8.52s/it, gpt_loss=0.183, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1151/6434 [2:42:30<13:02:09,  8.88s/it, gpt_loss=0.183, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1151/6434 [2:42:38<13:02:09,  8.88s/it, gpt_loss=0.251, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1152/6434 [2:42:38<12:49:18,  8.74s/it, gpt_loss=0.251, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1152/6434 [2:42:46<12:49:18,  8.74s/it, gpt_loss=0.26, loss_mean=0.254] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1153/6434 [2:42:46<12:24:57,  8.46s/it, gpt_loss=0.26, loss_mean=0.254][A[A
+
+Train step of epoch 1:  18%|█▊        | 1153/6434 [2:42:55<12:24:57,  8.46s/it, gpt_loss=0.276, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1154/6434 [2:42:55<12:41:27,  8.65s/it, gpt_loss=0.276, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1154/6434 [2:43:03<12:41:27,  8.65s/it, gpt_loss=0.247, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1155/6434 [2:43:03<12:12:42,  8.33s/it, gpt_loss=0.247, loss_mean=0.256][A[A
+[LID Router Debug] Step: 7590
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [6, 0, 9, 2, 5, 0, 4, 5, 5, 9]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  18%|█▊        | 1155/6434 [2:43:11<12:12:42,  8.33s/it, gpt_loss=0.422, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1156/6434 [2:43:11<12:20:13,  8.41s/it, gpt_loss=0.422, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1156/6434 [2:43:20<12:20:13,  8.41s/it, gpt_loss=0.195, loss_mean=0.264][A[A
+
+Train step of epoch 1:  18%|█▊        | 1157/6434 [2:43:20<12:31:39,  8.55s/it, gpt_loss=0.195, loss_mean=0.264][A[A
+
+Train step of epoch 1:  18%|█▊        | 1157/6434 [2:43:29<12:31:39,  8.55s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1158/6434 [2:43:29<12:36:57,  8.61s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1158/6434 [2:43:38<12:36:57,  8.61s/it, gpt_loss=0.268, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1159/6434 [2:43:38<12:57:01,  8.84s/it, gpt_loss=0.268, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1159/6434 [2:43:47<12:57:01,  8.84s/it, gpt_loss=0.386, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1160/6434 [2:43:47<12:45:28,  8.71s/it, gpt_loss=0.386, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1160/6434 [2:43:55<12:45:28,  8.71s/it, gpt_loss=0.306, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1161/6434 [2:43:55<12:25:16,  8.48s/it, gpt_loss=0.306, loss_mean=0.28][A[A
+
+Train step of epoch 1:  18%|█▊        | 1161/6434 [2:44:04<12:25:16,  8.48s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1162/6434 [2:44:04<12:46:55,  8.73s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1162/6434 [2:44:13<12:46:55,  8.73s/it, gpt_loss=0.296, loss_mean=0.279][A[A
+
+Train step of epoch 1:  18%|█▊        | 1163/6434 [2:44:13<12:48:55,  8.75s/it, gpt_loss=0.296, loss_mean=0.279][A[A
+
+Train step of epoch 1:  18%|█▊        | 1163/6434 [2:44:23<12:48:55,  8.75s/it, gpt_loss=0.206, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1164/6434 [2:44:23<13:30:14,  9.22s/it, gpt_loss=0.206, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1164/6434 [2:44:33<13:30:14,  9.22s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+
+Train step of epoch 1:  18%|█▊        | 1165/6434 [2:44:33<13:47:45,  9.43s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+[LID Router Debug] Step: 7600
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [5, 6, 3, 4, 4, 2, 0, 0, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6}
+[2026-02-07 09:46:28,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=0, lr=[1.3458625313258495e-05, 1.3458625313258495e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 09:46:28,402] [INFO] [timer.py:260:stop] epoch=0/micro_step=7600/global_step=3800, RunningAvgSamplesPerSec=4.744341154926869, CurrSamplesPerSec=4.437271399128349, MemAllocated=12.7GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  18%|█▊        | 1165/6434 [2:44:41<13:47:45,  9.43s/it, gpt_loss=0.256, loss_mean=0.268][A[A
+
+Train step of epoch 1:  18%|█▊        | 1166/6434 [2:44:41<13:14:19,  9.05s/it, gpt_loss=0.256, loss_mean=0.268][A[A
+
+Train step of epoch 1:  18%|█▊        | 1166/6434 [2:44:49<13:14:19,  9.05s/it, gpt_loss=0.246, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1167/6434 [2:44:49<12:37:52,  8.63s/it, gpt_loss=0.246, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1167/6434 [2:44:57<12:37:52,  8.63s/it, gpt_loss=0.301, loss_mean=0.269][A[A
+
+Train step of epoch 1:  18%|█▊        | 1168/6434 [2:44:57<12:25:52,  8.50s/it, gpt_loss=0.301, loss_mean=0.269][A[A
+
+Train step of epoch 1:  18%|█▊        | 1168/6434 [2:45:05<12:25:52,  8.50s/it, gpt_loss=0.297, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1169/6434 [2:45:05<12:20:44,  8.44s/it, gpt_loss=0.297, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1169/6434 [2:45:14<12:20:44,  8.44s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1170/6434 [2:45:14<12:18:53,  8.42s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1170/6434 [2:45:21<12:18:53,  8.42s/it, gpt_loss=0.326, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1171/6434 [2:45:21<11:56:16,  8.17s/it, gpt_loss=0.326, loss_mean=0.277][A[A
+
+Train step of epoch 1:  18%|█▊        | 1171/6434 [2:45:29<11:56:16,  8.17s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:  18%|█▊        | 1172/6434 [2:45:29<11:50:29,  8.10s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:  18%|█▊        | 1172/6434 [2:45:37<11:50:29,  8.10s/it, gpt_loss=0.229, loss_mean=0.273][A[A
+
+Train step of epoch 1:  18%|█▊        | 1173/6434 [2:45:37<11:48:39,  8.08s/it, gpt_loss=0.229, loss_mean=0.273][A[A
+
+Train step of epoch 1:  18%|█▊        | 1173/6434 [2:45:45<11:48:39,  8.08s/it, gpt_loss=0.226, loss_mean=0.268][A[A
+
+Train step of epoch 1:  18%|█▊        | 1174/6434 [2:45:45<11:48:22,  8.08s/it, gpt_loss=0.226, loss_mean=0.268][A[A
+
+Train step of epoch 1:  18%|█▊        | 1174/6434 [2:45:54<11:48:22,  8.08s/it, gpt_loss=0.222, loss_mean=0.264][A[A
+
+Train step of epoch 1:  18%|█▊        | 1175/6434 [2:45:54<12:10:37,  8.34s/it, gpt_loss=0.222, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7610
+Batch Size: 10
+Audio Batch Size: 131
+LID Assignments: [5, 5, 6, 0, 4, 10, 3, 9, 3, 9]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9, 10}
+
+
+Train step of epoch 1:  18%|█▊        | 1175/6434 [2:46:03<12:10:37,  8.34s/it, gpt_loss=0.291, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1176/6434 [2:46:03<12:16:09,  8.40s/it, gpt_loss=0.291, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1176/6434 [2:46:11<12:16:09,  8.40s/it, gpt_loss=0.268, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1177/6434 [2:46:11<11:56:03,  8.17s/it, gpt_loss=0.268, loss_mean=0.266][A[A
+
+Train step of epoch 1:  18%|█▊        | 1177/6434 [2:46:19<11:56:03,  8.17s/it, gpt_loss=0.255, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1178/6434 [2:46:19<11:56:03,  8.17s/it, gpt_loss=0.255, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1178/6434 [2:46:26<11:56:03,  8.17s/it, gpt_loss=0.232, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1179/6434 [2:46:26<11:41:23,  8.01s/it, gpt_loss=0.232, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1179/6434 [2:46:34<11:41:23,  8.01s/it, gpt_loss=0.272, loss_mean=0.263][A[A
+
+Train step of epoch 1:  18%|█▊        | 1180/6434 [2:46:34<11:32:00,  7.90s/it, gpt_loss=0.272, loss_mean=0.263][A[A
+
+Train step of epoch 1:  18%|█▊        | 1180/6434 [2:46:43<11:32:00,  7.90s/it, gpt_loss=0.271, loss_mean=0.264][A[A
+
+Train step of epoch 1:  18%|█▊        | 1181/6434 [2:46:43<11:59:45,  8.22s/it, gpt_loss=0.271, loss_mean=0.264][A[A
+
+Train step of epoch 1:  18%|█▊        | 1181/6434 [2:46:52<11:59:45,  8.22s/it, gpt_loss=0.185, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1182/6434 [2:46:52<12:09:51,  8.34s/it, gpt_loss=0.185, loss_mean=0.256][A[A
+
+Train step of epoch 1:  18%|█▊        | 1182/6434 [2:46:59<12:09:51,  8.34s/it, gpt_loss=0.29, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1183/6434 [2:46:59<11:52:52,  8.15s/it, gpt_loss=0.29, loss_mean=0.259][A[A
+
+Train step of epoch 1:  18%|█▊        | 1183/6434 [2:47:09<11:52:52,  8.15s/it, gpt_loss=0.286, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1184/6434 [2:47:09<12:25:57,  8.53s/it, gpt_loss=0.286, loss_mean=0.262][A[A
+
+Train step of epoch 1:  18%|█▊        | 1184/6434 [2:47:16<12:25:57,  8.53s/it, gpt_loss=0.295, loss_mean=0.265][A[A
+
+Train step of epoch 1:  18%|█▊        | 1185/6434 [2:47:16<12:04:37,  8.28s/it, gpt_loss=0.295, loss_mean=0.265][A[A
+[LID Router Debug] Step: 7620
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [2, 4, 9, 0, 0, 3, 0, 5, 1, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  18%|█▊        | 1185/6434 [2:47:24<12:04:37,  8.28s/it, gpt_loss=0.363, loss_mean=0.275][A[A
+
+Train step of epoch 1:  18%|█▊        | 1186/6434 [2:47:24<11:45:14,  8.06s/it, gpt_loss=0.363, loss_mean=0.275][A[A
+
+Train step of epoch 1:  18%|█▊        | 1186/6434 [2:47:32<11:45:14,  8.06s/it, gpt_loss=0.248, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1187/6434 [2:47:32<11:46:45,  8.08s/it, gpt_loss=0.248, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1187/6434 [2:47:40<11:46:45,  8.08s/it, gpt_loss=0.214, loss_mean=0.267][A[A
+
+Train step of epoch 1:  18%|█▊        | 1188/6434 [2:47:40<11:48:19,  8.10s/it, gpt_loss=0.214, loss_mean=0.267][A[A
+
+Train step of epoch 1:  18%|█▊        | 1188/6434 [2:47:49<11:48:19,  8.10s/it, gpt_loss=0.325, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1189/6434 [2:47:49<12:01:24,  8.25s/it, gpt_loss=0.325, loss_mean=0.272][A[A
+
+Train step of epoch 1:  18%|█▊        | 1189/6434 [2:47:57<12:01:24,  8.25s/it, gpt_loss=0.29, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  18%|█▊        | 1190/6434 [2:47:57<11:53:41,  8.17s/it, gpt_loss=0.29, loss_mean=0.274][A[A
+
+Train step of epoch 1:  18%|█▊        | 1190/6434 [2:48:06<11:53:41,  8.17s/it, gpt_loss=0.345, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▊        | 1191/6434 [2:48:06<12:10:08,  8.36s/it, gpt_loss=0.345, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▊        | 1191/6434 [2:48:14<12:10:08,  8.36s/it, gpt_loss=0.322, loss_mean=0.285][A[A
+
+Train step of epoch 1:  19%|█▊        | 1192/6434 [2:48:14<12:14:06,  8.40s/it, gpt_loss=0.322, loss_mean=0.285][A[A
+
+Train step of epoch 1:  19%|█▊        | 1192/6434 [2:48:23<12:14:06,  8.40s/it, gpt_loss=0.253, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▊        | 1193/6434 [2:48:23<12:28:01,  8.56s/it, gpt_loss=0.253, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▊        | 1193/6434 [2:48:32<12:28:01,  8.56s/it, gpt_loss=0.252, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▊        | 1194/6434 [2:48:32<12:23:57,  8.52s/it, gpt_loss=0.252, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▊        | 1194/6434 [2:48:40<12:23:57,  8.52s/it, gpt_loss=0.33, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  19%|█▊        | 1195/6434 [2:48:40<12:13:59,  8.41s/it, gpt_loss=0.33, loss_mean=0.284][A[A
+[LID Router Debug] Step: 7630
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [9, 9, 9, 4, 2, 10, 4, 1, 8, 3]
+Active Experts in Batch: {1, 2, 3, 4, 8, 9, 10}
+
+
+Train step of epoch 1:  19%|█▊        | 1195/6434 [2:48:48<12:13:59,  8.41s/it, gpt_loss=0.239, loss_mean=0.28][A[A
+
+Train step of epoch 1:  19%|█▊        | 1196/6434 [2:48:48<12:21:48,  8.50s/it, gpt_loss=0.239, loss_mean=0.28][A[A
+
+Train step of epoch 1:  19%|█▊        | 1196/6434 [2:48:57<12:21:48,  8.50s/it, gpt_loss=0.287, loss_mean=0.28][A[A
+
+Train step of epoch 1:  19%|█▊        | 1197/6434 [2:48:57<12:17:55,  8.45s/it, gpt_loss=0.287, loss_mean=0.28][A[A
+
+Train step of epoch 1:  19%|█▊        | 1197/6434 [2:49:05<12:17:55,  8.45s/it, gpt_loss=0.22, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1198/6434 [2:49:05<12:07:14,  8.33s/it, gpt_loss=0.22, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1198/6434 [2:49:12<12:07:14,  8.33s/it, gpt_loss=0.313, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▊        | 1199/6434 [2:49:12<11:49:03,  8.13s/it, gpt_loss=0.313, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▊        | 1199/6434 [2:49:19<11:49:03,  8.13s/it, gpt_loss=0.198, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  19%|█▊        | 1200/6434 [2:49:19<11:19:57,  7.79s/it, gpt_loss=0.198, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▊        | 1200/6434 [2:49:28<11:19:57,  7.79s/it, gpt_loss=0.271, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▊        | 1201/6434 [2:49:28<11:26:49,  7.87s/it, gpt_loss=0.271, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▊        | 1201/6434 [2:49:35<11:26:49,  7.87s/it, gpt_loss=0.304, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1202/6434 [2:49:35<11:19:15,  7.79s/it, gpt_loss=0.304, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1202/6434 [2:49:43<11:19:15,  7.79s/it, gpt_loss=0.279, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1203/6434 [2:49:43<11:28:09,  7.89s/it, gpt_loss=0.279, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▊        | 1203/6434 [2:49:52<11:28:09,  7.89s/it, gpt_loss=0.232, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  19%|█▊        | 1204/6434 [2:49:52<11:42:29,  8.06s/it, gpt_loss=0.232, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▊        | 1204/6434 [2:50:00<11:42:29,  8.06s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+
+Train step of epoch 1:  19%|█▊        | 1205/6434 [2:50:00<11:55:58,  8.22s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+[LID Router Debug] Step: 7640
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [3, 2, 4, 3, 5, 2, 9, 4, 2, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  19%|█▊        | 1205/6434 [2:50:09<11:55:58,  8.22s/it, gpt_loss=0.27, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  19%|█▊        | 1206/6434 [2:50:09<12:05:03,  8.32s/it, gpt_loss=0.27, loss_mean=0.268][A[A
+
+Train step of epoch 1:  19%|█▊        | 1206/6434 [2:50:17<12:05:03,  8.32s/it, gpt_loss=0.256, loss_mean=0.267][A[A
+
+Train step of epoch 1:  19%|█▉        | 1207/6434 [2:50:17<11:58:21,  8.25s/it, gpt_loss=0.256, loss_mean=0.267][A[A
+
+Train step of epoch 1:  19%|█▉        | 1207/6434 [2:50:24<11:58:21,  8.25s/it, gpt_loss=0.327, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1208/6434 [2:50:24<11:41:05,  8.05s/it, gpt_loss=0.327, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1208/6434 [2:50:32<11:41:05,  8.05s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1209/6434 [2:50:32<11:33:31,  7.96s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1209/6434 [2:50:42<11:33:31,  7.96s/it, gpt_loss=0.29, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1210/6434 [2:50:42<12:13:00,  8.42s/it, gpt_loss=0.29, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1210/6434 [2:50:49<12:13:00,  8.42s/it, gpt_loss=0.225, loss_mean=0.268][A[A
+
+Train step of epoch 1:  19%|█▉        | 1211/6434 [2:50:49<11:52:03,  8.18s/it, gpt_loss=0.225, loss_mean=0.268][A[A
+
+Train step of epoch 1:  19%|█▉        | 1211/6434 [2:50:57<11:52:03,  8.18s/it, gpt_loss=0.355, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1212/6434 [2:50:57<11:38:36,  8.03s/it, gpt_loss=0.355, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1212/6434 [2:51:06<11:38:36,  8.03s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1213/6434 [2:51:06<11:56:26,  8.23s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1213/6434 [2:51:16<11:56:26,  8.23s/it, gpt_loss=0.262, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1214/6434 [2:51:16<12:52:56,  8.88s/it, gpt_loss=0.262, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1214/6434 [2:51:25<12:52:56,  8.88s/it, gpt_loss=0.275, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1215/6434 [2:51:25<12:39:57,  8.74s/it, gpt_loss=0.275, loss_mean=0.277][A[A
+[LID Router Debug] Step: 7650
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [5, 2, 10, 1, 1, 3, 5, 1, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 10}
+
+
+Train step of epoch 1:  19%|█▉        | 1215/6434 [2:51:33<12:39:57,  8.74s/it, gpt_loss=0.338, loss_mean=0.283][A[A
+
+Train step of epoch 1:  19%|█▉        | 1216/6434 [2:51:33<12:22:17,  8.54s/it, gpt_loss=0.338, loss_mean=0.283][A[A
+
+Train step of epoch 1:  19%|█▉        | 1216/6434 [2:51:41<12:22:17,  8.54s/it, gpt_loss=0.238, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1217/6434 [2:51:41<12:25:36,  8.58s/it, gpt_loss=0.238, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1217/6434 [2:51:50<12:25:36,  8.58s/it, gpt_loss=0.235, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▉        | 1218/6434 [2:51:50<12:39:37,  8.74s/it, gpt_loss=0.235, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▉        | 1218/6434 [2:51:59<12:39:37,  8.74s/it, gpt_loss=0.232, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1219/6434 [2:51:59<12:44:53,  8.80s/it, gpt_loss=0.232, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▉        | 1219/6434 [2:52:08<12:44:53,  8.80s/it, gpt_loss=0.349, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1220/6434 [2:52:08<12:42:50,  8.78s/it, gpt_loss=0.349, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1220/6434 [2:52:17<12:42:50,  8.78s/it, gpt_loss=0.24, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1221/6434 [2:52:17<12:37:44,  8.72s/it, gpt_loss=0.24, loss_mean=0.274][A[A
+
+Train step of epoch 1:  19%|█▉        | 1221/6434 [2:52:25<12:37:44,  8.72s/it, gpt_loss=0.313, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1222/6434 [2:52:25<12:24:55,  8.58s/it, gpt_loss=0.313, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1222/6434 [2:52:33<12:24:55,  8.58s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1223/6434 [2:52:33<12:04:33,  8.34s/it, gpt_loss=0.302, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1223/6434 [2:52:42<12:04:33,  8.34s/it, gpt_loss=0.209, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1224/6434 [2:52:42<12:20:47,  8.53s/it, gpt_loss=0.209, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1224/6434 [2:52:50<12:20:47,  8.53s/it, gpt_loss=0.375, loss_mean=0.284][A[A
+
+Train step of epoch 1:  19%|█▉        | 1225/6434 [2:52:50<12:08:58,  8.40s/it, gpt_loss=0.375, loss_mean=0.284][A[A
+[LID Router Debug] Step: 7660
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [2, 0, 5, 9, 5, 0, 1, 0, 2, 9]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:  19%|█▉        | 1225/6434 [2:52:58<12:08:58,  8.40s/it, gpt_loss=0.254, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1226/6434 [2:52:58<11:58:39,  8.28s/it, gpt_loss=0.254, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1226/6434 [2:53:06<11:58:39,  8.28s/it, gpt_loss=0.285, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1227/6434 [2:53:06<11:46:34,  8.14s/it, gpt_loss=0.285, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1227/6434 [2:53:12<11:46:34,  8.14s/it, gpt_loss=0.243, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1228/6434 [2:53:12<11:08:05,  7.70s/it, gpt_loss=0.243, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1228/6434 [2:53:21<11:08:05,  7.70s/it, gpt_loss=0.227, loss_mean=0.272][A[A
+
+Train step of epoch 1:  19%|█▉        | 1229/6434 [2:53:21<11:35:15,  8.01s/it, gpt_loss=0.227, loss_mean=0.272][A[A
+
+Train step of epoch 1:  19%|█▉        | 1229/6434 [2:53:30<11:35:15,  8.01s/it, gpt_loss=0.282, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1230/6434 [2:53:30<11:51:32,  8.20s/it, gpt_loss=0.282, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1230/6434 [2:53:38<11:51:32,  8.20s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1231/6434 [2:53:38<11:49:58,  8.19s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1231/6434 [2:53:46<11:49:58,  8.19s/it, gpt_loss=0.328, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1232/6434 [2:53:46<11:42:33,  8.10s/it, gpt_loss=0.328, loss_mean=0.277][A[A
+
+Train step of epoch 1:  19%|█▉        | 1232/6434 [2:53:55<11:42:33,  8.10s/it, gpt_loss=0.261, loss_mean=0.275][A[A
+
+Train step of epoch 1:  19%|█▉        | 1233/6434 [2:53:55<12:05:20,  8.37s/it, gpt_loss=0.261, loss_mean=0.275][A[A
+
+Train step of epoch 1:  19%|█▉        | 1233/6434 [2:54:03<12:05:20,  8.37s/it, gpt_loss=0.257, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1234/6434 [2:54:03<12:03:16,  8.35s/it, gpt_loss=0.257, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1234/6434 [2:54:11<12:03:16,  8.35s/it, gpt_loss=0.249, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1235/6434 [2:54:11<11:51:27,  8.21s/it, gpt_loss=0.249, loss_mean=0.271][A[A
+[LID Router Debug] Step: 7670
+Batch Size: 10
+Audio Batch Size: 140
+LID Assignments: [9, 2, 4, 10, 2, 0, 0, 9, 3, 6]
+Active Experts in Batch: {0, 2, 3, 4, 6, 9, 10}
+
+
+Train step of epoch 1:  19%|█▉        | 1235/6434 [2:54:19<11:51:27,  8.21s/it, gpt_loss=0.278, loss_mean=0.272][A[A
+
+Train step of epoch 1:  19%|█▉        | 1236/6434 [2:54:19<12:01:28,  8.33s/it, gpt_loss=0.278, loss_mean=0.272][A[A
+
+Train step of epoch 1:  19%|█▉        | 1236/6434 [2:54:27<12:01:28,  8.33s/it, gpt_loss=0.352, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1237/6434 [2:54:27<11:44:05,  8.13s/it, gpt_loss=0.352, loss_mean=0.28][A[A
+
+Train step of epoch 1:  19%|█▉        | 1237/6434 [2:54:36<11:44:05,  8.13s/it, gpt_loss=0.233, loss_mean=0.275][A[A
+
+Train step of epoch 1:  19%|█▉        | 1238/6434 [2:54:36<11:59:44,  8.31s/it, gpt_loss=0.233, loss_mean=0.275][A[A
+
+Train step of epoch 1:  19%|█▉        | 1238/6434 [2:54:44<11:59:44,  8.31s/it, gpt_loss=0.228, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1239/6434 [2:54:44<11:44:08,  8.13s/it, gpt_loss=0.228, loss_mean=0.27][A[A
+
+Train step of epoch 1:  19%|█▉        | 1239/6434 [2:54:52<11:44:08,  8.13s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1240/6434 [2:54:52<12:02:19,  8.34s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:  19%|█▉        | 1240/6434 [2:55:00<12:02:19,  8.34s/it, gpt_loss=0.351, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1241/6434 [2:55:00<11:47:55,  8.18s/it, gpt_loss=0.351, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1241/6434 [2:55:09<11:47:55,  8.18s/it, gpt_loss=0.295, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1242/6434 [2:55:09<12:14:30,  8.49s/it, gpt_loss=0.295, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1242/6434 [2:55:18<12:14:30,  8.49s/it, gpt_loss=0.25, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1243/6434 [2:55:18<12:06:53,  8.40s/it, gpt_loss=0.25, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1243/6434 [2:55:25<12:06:53,  8.40s/it, gpt_loss=0.314, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1244/6434 [2:55:25<11:41:51,  8.11s/it, gpt_loss=0.314, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1244/6434 [2:55:33<11:41:51,  8.11s/it, gpt_loss=0.337, loss_mean=0.287][A[A
+
+Train step of epoch 1:  19%|█▉        | 1245/6434 [2:55:33<11:26:36,  7.94s/it, gpt_loss=0.337, loss_mean=0.287][A[A
+[LID Router Debug] Step: 7680
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [9, 4, 0, 1, 1, 9, 2, 1, 4, 1]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+
+
+Train step of epoch 1:  19%|█▉        | 1245/6434 [2:55:41<11:26:36,  7.94s/it, gpt_loss=0.24, loss_mean=0.282] [A[A
+
+Train step of epoch 1:  19%|█▉        | 1246/6434 [2:55:41<11:46:05,  8.17s/it, gpt_loss=0.24, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▉        | 1246/6434 [2:55:50<11:46:05,  8.17s/it, gpt_loss=0.283, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▉        | 1247/6434 [2:55:50<11:53:33,  8.25s/it, gpt_loss=0.283, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▉        | 1247/6434 [2:56:00<11:53:33,  8.25s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▉        | 1248/6434 [2:56:00<12:33:12,  8.71s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  19%|█▉        | 1248/6434 [2:56:08<12:33:12,  8.71s/it, gpt_loss=0.276, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1249/6434 [2:56:08<12:14:08,  8.50s/it, gpt_loss=0.276, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1249/6434 [2:56:16<12:14:08,  8.50s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1250/6434 [2:56:16<12:06:48,  8.41s/it, gpt_loss=0.253, loss_mean=0.279][A[A
+
+Train step of epoch 1:  19%|█▉        | 1250/6434 [2:56:24<12:06:48,  8.41s/it, gpt_loss=0.304, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1251/6434 [2:56:24<12:09:27,  8.44s/it, gpt_loss=0.304, loss_mean=0.281][A[A
+
+Train step of epoch 1:  19%|█▉        | 1251/6434 [2:56:34<12:09:27,  8.44s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1252/6434 [2:56:34<12:48:14,  8.90s/it, gpt_loss=0.253, loss_mean=0.278][A[A
+
+Train step of epoch 1:  19%|█▉        | 1252/6434 [2:56:44<12:48:14,  8.90s/it, gpt_loss=0.224, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1253/6434 [2:56:44<13:02:30,  9.06s/it, gpt_loss=0.224, loss_mean=0.273][A[A
+
+Train step of epoch 1:  19%|█▉        | 1253/6434 [2:56:53<13:02:30,  9.06s/it, gpt_loss=0.231, loss_mean=0.269][A[A
+
+Train step of epoch 1:  19%|█▉        | 1254/6434 [2:56:53<13:16:36,  9.23s/it, gpt_loss=0.231, loss_mean=0.269][A[A
+
+Train step of epoch 1:  19%|█▉        | 1254/6434 [2:57:01<13:16:36,  9.23s/it, gpt_loss=0.274, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|█▉        | 1255/6434 [2:57:01<12:45:26,  8.87s/it, gpt_loss=0.274, loss_mean=0.269][A[A
+[LID Router Debug] Step: 7690
+Batch Size: 10
+Audio Batch Size: 136
+LID Assignments: [5, 9, 9, 4, 0, 9, 3, 3, 3, 7]
+Active Experts in Batch: {0, 3, 4, 5, 7, 9}
+
+
+Train step of epoch 1:  20%|█▉        | 1255/6434 [2:57:10<12:45:26,  8.87s/it, gpt_loss=0.281, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  20%|█▉        | 1256/6434 [2:57:10<12:46:31,  8.88s/it, gpt_loss=0.281, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|█▉        | 1256/6434 [2:57:18<12:46:31,  8.88s/it, gpt_loss=0.292, loss_mean=0.273][A[A
+
+Train step of epoch 1:  20%|█▉        | 1257/6434 [2:57:18<12:14:26,  8.51s/it, gpt_loss=0.292, loss_mean=0.273][A[A
+
+Train step of epoch 1:  20%|█▉        | 1257/6434 [2:57:25<12:14:26,  8.51s/it, gpt_loss=0.23, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  20%|█▉        | 1258/6434 [2:57:25<11:39:14,  8.11s/it, gpt_loss=0.23, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|█▉        | 1258/6434 [2:57:35<11:39:14,  8.11s/it, gpt_loss=0.338, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1259/6434 [2:57:35<12:20:25,  8.58s/it, gpt_loss=0.338, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1259/6434 [2:57:43<12:20:25,  8.58s/it, gpt_loss=0.338, loss_mean=0.281][A[A
+
+Train step of epoch 1:  20%|█▉        | 1260/6434 [2:57:43<12:07:30,  8.44s/it, gpt_loss=0.338, loss_mean=0.281][A[A
+
+Train step of epoch 1:  20%|█▉        | 1260/6434 [2:57:52<12:07:30,  8.44s/it, gpt_loss=0.344, loss_mean=0.288][A[A
+
+Train step of epoch 1:  20%|█▉        | 1261/6434 [2:57:52<12:38:06,  8.79s/it, gpt_loss=0.344, loss_mean=0.288][A[A
+
+Train step of epoch 1:  20%|█▉        | 1261/6434 [2:58:00<12:38:06,  8.79s/it, gpt_loss=0.207, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  20%|█▉        | 1262/6434 [2:58:00<12:08:42,  8.45s/it, gpt_loss=0.207, loss_mean=0.28][A[A
+
+Train step of epoch 1:  20%|█▉        | 1262/6434 [2:58:08<12:08:42,  8.45s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  20%|█▉        | 1263/6434 [2:58:08<11:53:38,  8.28s/it, gpt_loss=0.249, loss_mean=0.277][A[A
+
+Train step of epoch 1:  20%|█▉        | 1263/6434 [2:58:16<11:53:38,  8.28s/it, gpt_loss=0.368, loss_mean=0.286][A[A
+
+Train step of epoch 1:  20%|█▉        | 1264/6434 [2:58:16<11:47:36,  8.21s/it, gpt_loss=0.368, loss_mean=0.286][A[A
+
+Train step of epoch 1:  20%|█▉        | 1264/6434 [2:58:25<11:47:36,  8.21s/it, gpt_loss=0.211, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|█▉        | 1265/6434 [2:58:25<12:02:32,  8.39s/it, gpt_loss=0.211, loss_mean=0.278][A[A
+[LID Router Debug] Step: 7700
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [1, 4, 1, 9, 5, 3, 4, 4, 0, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  20%|█▉        | 1265/6434 [2:58:33<12:02:32,  8.39s/it, gpt_loss=0.318, loss_mean=0.282][A[A
+
+Train step of epoch 1:  20%|█▉        | 1266/6434 [2:58:33<11:54:45,  8.30s/it, gpt_loss=0.318, loss_mean=0.282][A[A
+
+Train step of epoch 1:  20%|█▉        | 1266/6434 [2:58:42<11:54:45,  8.30s/it, gpt_loss=0.295, loss_mean=0.284][A[A
+
+Train step of epoch 1:  20%|█▉        | 1267/6434 [2:58:42<12:08:06,  8.45s/it, gpt_loss=0.295, loss_mean=0.284][A[A
+
+Train step of epoch 1:  20%|█▉        | 1267/6434 [2:58:50<12:08:06,  8.45s/it, gpt_loss=0.234, loss_mean=0.279][A[A
+
+Train step of epoch 1:  20%|█▉        | 1268/6434 [2:58:50<11:51:40,  8.27s/it, gpt_loss=0.234, loss_mean=0.279][A[A
+
+Train step of epoch 1:  20%|█▉        | 1268/6434 [2:58:58<11:51:40,  8.27s/it, gpt_loss=0.228, loss_mean=0.274][A[A
+
+Train step of epoch 1:  20%|█▉        | 1269/6434 [2:58:58<12:06:00,  8.43s/it, gpt_loss=0.228, loss_mean=0.274][A[A
+
+Train step of epoch 1:  20%|█▉        | 1269/6434 [2:59:06<12:06:00,  8.43s/it, gpt_loss=0.264, loss_mean=0.273][A[A
+
+Train step of epoch 1:  20%|█▉        | 1270/6434 [2:59:06<11:55:06,  8.31s/it, gpt_loss=0.264, loss_mean=0.273][A[A
+
+Train step of epoch 1:  20%|█▉        | 1270/6434 [2:59:14<11:55:06,  8.31s/it, gpt_loss=0.26, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  20%|█▉        | 1271/6434 [2:59:14<11:41:17,  8.15s/it, gpt_loss=0.26, loss_mean=0.271][A[A
+
+Train step of epoch 1:  20%|█▉        | 1271/6434 [2:59:22<11:41:17,  8.15s/it, gpt_loss=0.244, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|█▉        | 1272/6434 [2:59:22<11:34:16,  8.07s/it, gpt_loss=0.244, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|█▉        | 1272/6434 [2:59:30<11:34:16,  8.07s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|█▉        | 1273/6434 [2:59:30<11:38:52,  8.12s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|█▉        | 1273/6434 [2:59:39<11:38:52,  8.12s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|█▉        | 1274/6434 [2:59:39<11:46:37,  8.22s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|█▉        | 1274/6434 [2:59:48<11:46:37,  8.22s/it, gpt_loss=0.29, loss_mean=0.27]  [A[A
+
+Train step of epoch 1:  20%|█▉        | 1275/6434 [2:59:48<12:06:37,  8.45s/it, gpt_loss=0.29, loss_mean=0.27][A[A
+[LID Router Debug] Step: 7710
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [9, 3, 0, 6, 5, 4, 1, 5, 9, 3]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  20%|█▉        | 1275/6434 [2:59:56<12:06:37,  8.45s/it, gpt_loss=0.267, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|█▉        | 1276/6434 [2:59:56<11:51:51,  8.28s/it, gpt_loss=0.267, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|█▉        | 1276/6434 [3:00:04<11:51:51,  8.28s/it, gpt_loss=0.293, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|█▉        | 1277/6434 [3:00:04<11:44:52,  8.20s/it, gpt_loss=0.293, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|█▉        | 1277/6434 [3:00:11<11:44:52,  8.20s/it, gpt_loss=0.217, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|█▉        | 1278/6434 [3:00:11<11:31:22,  8.05s/it, gpt_loss=0.217, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|█▉        | 1278/6434 [3:00:19<11:31:22,  8.05s/it, gpt_loss=0.347, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1279/6434 [3:00:19<11:27:40,  8.00s/it, gpt_loss=0.347, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1279/6434 [3:00:28<11:27:40,  8.00s/it, gpt_loss=0.292, loss_mean=0.276][A[A
+
+Train step of epoch 1:  20%|█▉        | 1280/6434 [3:00:28<11:41:02,  8.16s/it, gpt_loss=0.292, loss_mean=0.276][A[A
+
+Train step of epoch 1:  20%|█▉        | 1280/6434 [3:00:36<11:41:02,  8.16s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1281/6434 [3:00:36<11:48:04,  8.24s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|█▉        | 1281/6434 [3:00:44<11:48:04,  8.24s/it, gpt_loss=0.306, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|█▉        | 1282/6434 [3:00:44<11:44:39,  8.21s/it, gpt_loss=0.306, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|█▉        | 1282/6434 [3:00:52<11:44:39,  8.21s/it, gpt_loss=0.327, loss_mean=0.283][A[A
+
+Train step of epoch 1:  20%|█▉        | 1283/6434 [3:00:52<11:19:04,  7.91s/it, gpt_loss=0.327, loss_mean=0.283][A[A
+
+Train step of epoch 1:  20%|█▉        | 1283/6434 [3:00:59<11:19:04,  7.91s/it, gpt_loss=0.333, loss_mean=0.288][A[A
+
+Train step of epoch 1:  20%|█▉        | 1284/6434 [3:00:59<11:04:36,  7.74s/it, gpt_loss=0.333, loss_mean=0.288][A[A
+
+Train step of epoch 1:  20%|█▉        | 1284/6434 [3:01:06<11:04:36,  7.74s/it, gpt_loss=0.254, loss_mean=0.285][A[A
+
+Train step of epoch 1:  20%|█▉        | 1285/6434 [3:01:06<10:58:38,  7.67s/it, gpt_loss=0.254, loss_mean=0.285][A[A
+[LID Router Debug] Step: 7720
+Batch Size: 10
+Audio Batch Size: 100
+LID Assignments: [1, 9, 1, 4, 3, 3, 1, 2, 1, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  20%|█▉        | 1285/6434 [3:01:16<10:58:38,  7.67s/it, gpt_loss=0.22, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  20%|█▉        | 1286/6434 [3:01:16<11:45:20,  8.22s/it, gpt_loss=0.22, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|█▉        | 1286/6434 [3:01:24<11:45:20,  8.22s/it, gpt_loss=0.268, loss_mean=0.277][A[A
+
+Train step of epoch 1:  20%|██        | 1287/6434 [3:01:24<11:39:16,  8.15s/it, gpt_loss=0.268, loss_mean=0.277][A[A
+
+Train step of epoch 1:  20%|██        | 1287/6434 [3:01:33<11:39:16,  8.15s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|██        | 1288/6434 [3:01:33<11:56:34,  8.35s/it, gpt_loss=0.281, loss_mean=0.278][A[A
+
+Train step of epoch 1:  20%|██        | 1288/6434 [3:01:41<11:56:34,  8.35s/it, gpt_loss=0.238, loss_mean=0.274][A[A
+
+Train step of epoch 1:  20%|██        | 1289/6434 [3:01:41<11:45:16,  8.22s/it, gpt_loss=0.238, loss_mean=0.274][A[A
+
+Train step of epoch 1:  20%|██        | 1289/6434 [3:01:49<11:45:16,  8.22s/it, gpt_loss=0.28, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  20%|██        | 1290/6434 [3:01:49<11:55:58,  8.35s/it, gpt_loss=0.28, loss_mean=0.274][A[A
+
+Train step of epoch 1:  20%|██        | 1290/6434 [3:01:58<11:55:58,  8.35s/it, gpt_loss=0.23, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  20%|██        | 1291/6434 [3:01:58<12:16:43,  8.59s/it, gpt_loss=0.23, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|██        | 1291/6434 [3:02:06<12:16:43,  8.59s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|██        | 1292/6434 [3:02:06<11:45:01,  8.23s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|██        | 1292/6434 [3:02:15<11:45:01,  8.23s/it, gpt_loss=0.333, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|██        | 1293/6434 [3:02:15<12:05:19,  8.47s/it, gpt_loss=0.333, loss_mean=0.275][A[A
+
+Train step of epoch 1:  20%|██        | 1293/6434 [3:02:23<12:05:19,  8.47s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+
+Train step of epoch 1:  20%|██        | 1294/6434 [3:02:23<12:08:14,  8.50s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+
+Train step of epoch 1:  20%|██        | 1294/6434 [3:02:34<12:08:14,  8.50s/it, gpt_loss=0.226, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1295/6434 [3:02:34<12:54:34,  9.04s/it, gpt_loss=0.226, loss_mean=0.266][A[A
+[LID Router Debug] Step: 7730
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [1, 6, 2, 3, 4, 2, 0, 3, 5, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  20%|██        | 1295/6434 [3:02:42<12:54:34,  9.04s/it, gpt_loss=0.275, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|██        | 1296/6434 [3:02:42<12:22:37,  8.67s/it, gpt_loss=0.275, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|██        | 1296/6434 [3:02:51<12:22:37,  8.67s/it, gpt_loss=0.229, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1297/6434 [3:02:51<12:45:07,  8.94s/it, gpt_loss=0.229, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1297/6434 [3:03:02<12:45:07,  8.94s/it, gpt_loss=0.319, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1298/6434 [3:03:02<13:27:52,  9.44s/it, gpt_loss=0.319, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1298/6434 [3:03:11<13:27:52,  9.44s/it, gpt_loss=0.272, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1299/6434 [3:03:11<13:19:52,  9.35s/it, gpt_loss=0.272, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1299/6434 [3:03:19<13:19:52,  9.35s/it, gpt_loss=0.249, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|██        | 1300/6434 [3:03:19<12:43:09,  8.92s/it, gpt_loss=0.249, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|██        | 1300/6434 [3:03:28<12:43:09,  8.92s/it, gpt_loss=0.228, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1301/6434 [3:03:28<12:47:40,  8.97s/it, gpt_loss=0.228, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1301/6434 [3:03:35<12:47:40,  8.97s/it, gpt_loss=0.346, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|██        | 1302/6434 [3:03:35<11:53:34,  8.34s/it, gpt_loss=0.346, loss_mean=0.272][A[A
+
+Train step of epoch 1:  20%|██        | 1302/6434 [3:03:43<11:53:34,  8.34s/it, gpt_loss=0.209, loss_mean=0.265][A[A
+
+Train step of epoch 1:  20%|██        | 1303/6434 [3:03:43<11:42:33,  8.22s/it, gpt_loss=0.209, loss_mean=0.265][A[A
+
+Train step of epoch 1:  20%|██        | 1303/6434 [3:03:51<11:42:33,  8.22s/it, gpt_loss=0.307, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  20%|██        | 1304/6434 [3:03:51<11:43:30,  8.23s/it, gpt_loss=0.307, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|██        | 1304/6434 [3:04:00<11:43:30,  8.23s/it, gpt_loss=0.211, loss_mean=0.264][A[A
+
+Train step of epoch 1:  20%|██        | 1305/6434 [3:04:00<12:16:20,  8.61s/it, gpt_loss=0.211, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7740
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [3, 4, 1, 0, 2, 4, 2, 7, 4, 3]
+Active Experts in Batch: {0, 1, 2, 3, 4, 7}
+
+
+Train step of epoch 1:  20%|██        | 1305/6434 [3:04:09<12:16:20,  8.61s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1306/6434 [3:04:09<12:15:53,  8.61s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1306/6434 [3:04:18<12:15:53,  8.61s/it, gpt_loss=0.332, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  20%|██        | 1307/6434 [3:04:18<12:30:35,  8.78s/it, gpt_loss=0.332, loss_mean=0.27][A[A
+
+Train step of epoch 1:  20%|██        | 1307/6434 [3:04:26<12:30:35,  8.78s/it, gpt_loss=0.251, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|██        | 1308/6434 [3:04:26<12:03:01,  8.46s/it, gpt_loss=0.251, loss_mean=0.268][A[A
+
+Train step of epoch 1:  20%|██        | 1308/6434 [3:04:34<12:03:01,  8.46s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1309/6434 [3:04:34<11:57:46,  8.40s/it, gpt_loss=0.277, loss_mean=0.269][A[A
+
+Train step of epoch 1:  20%|██        | 1309/6434 [3:04:43<11:57:46,  8.40s/it, gpt_loss=0.25, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  20%|██        | 1310/6434 [3:04:43<11:56:37,  8.39s/it, gpt_loss=0.25, loss_mean=0.267][A[A
+
+Train step of epoch 1:  20%|██        | 1310/6434 [3:04:50<11:56:37,  8.39s/it, gpt_loss=0.255, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1311/6434 [3:04:50<11:27:16,  8.05s/it, gpt_loss=0.255, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1311/6434 [3:04:58<11:27:16,  8.05s/it, gpt_loss=0.272, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1312/6434 [3:04:58<11:22:20,  7.99s/it, gpt_loss=0.272, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1312/6434 [3:05:08<11:22:20,  7.99s/it, gpt_loss=0.359, loss_mean=0.276][A[A
+
+Train step of epoch 1:  20%|██        | 1313/6434 [3:05:08<12:15:33,  8.62s/it, gpt_loss=0.359, loss_mean=0.276][A[A
+
+Train step of epoch 1:  20%|██        | 1313/6434 [3:05:15<12:15:33,  8.62s/it, gpt_loss=0.184, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1314/6434 [3:05:15<11:51:55,  8.34s/it, gpt_loss=0.184, loss_mean=0.266][A[A
+
+Train step of epoch 1:  20%|██        | 1314/6434 [3:05:23<11:51:55,  8.34s/it, gpt_loss=0.233, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1315/6434 [3:05:23<11:35:40,  8.15s/it, gpt_loss=0.233, loss_mean=0.263][A[A
+[LID Router Debug] Step: 7750
+Batch Size: 10
+Audio Batch Size: 73
+LID Assignments: [4, 1, 2, 4, 9, 5, 2, 2, 6, 1]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  20%|██        | 1315/6434 [3:05:32<11:35:40,  8.15s/it, gpt_loss=0.286, loss_mean=0.265][A[A
+
+Train step of epoch 1:  20%|██        | 1316/6434 [3:05:32<11:57:47,  8.41s/it, gpt_loss=0.286, loss_mean=0.265][A[A
+
+Train step of epoch 1:  20%|██        | 1316/6434 [3:05:41<11:57:47,  8.41s/it, gpt_loss=0.241, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1317/6434 [3:05:41<11:56:04,  8.40s/it, gpt_loss=0.241, loss_mean=0.263][A[A
+
+Train step of epoch 1:  20%|██        | 1317/6434 [3:05:49<11:56:04,  8.40s/it, gpt_loss=0.217, loss_mean=0.258][A[A
+
+Train step of epoch 1:  20%|██        | 1318/6434 [3:05:49<11:57:05,  8.41s/it, gpt_loss=0.217, loss_mean=0.258][A[A
+
+Train step of epoch 1:  20%|██        | 1318/6434 [3:05:57<11:57:05,  8.41s/it, gpt_loss=0.246, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1319/6434 [3:05:57<11:52:40,  8.36s/it, gpt_loss=0.246, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1319/6434 [3:06:05<11:52:40,  8.36s/it, gpt_loss=0.263, loss_mean=0.258][A[A
+
+Train step of epoch 1:  21%|██        | 1320/6434 [3:06:05<11:44:12,  8.26s/it, gpt_loss=0.263, loss_mean=0.258][A[A
+
+Train step of epoch 1:  21%|██        | 1320/6434 [3:06:13<11:44:12,  8.26s/it, gpt_loss=0.253, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1321/6434 [3:06:13<11:33:59,  8.14s/it, gpt_loss=0.253, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1321/6434 [3:06:22<11:33:59,  8.14s/it, gpt_loss=0.204, loss_mean=0.252][A[A
+
+Train step of epoch 1:  21%|██        | 1322/6434 [3:06:22<11:43:57,  8.26s/it, gpt_loss=0.204, loss_mean=0.252][A[A
+
+Train step of epoch 1:  21%|██        | 1322/6434 [3:06:31<11:43:57,  8.26s/it, gpt_loss=0.345, loss_mean=0.261][A[A
+
+Train step of epoch 1:  21%|██        | 1323/6434 [3:06:31<12:08:18,  8.55s/it, gpt_loss=0.345, loss_mean=0.261][A[A
+
+Train step of epoch 1:  21%|██        | 1323/6434 [3:06:38<12:08:18,  8.55s/it, gpt_loss=0.238, loss_mean=0.259][A[A
+
+Train step of epoch 1:  21%|██        | 1324/6434 [3:06:38<11:41:11,  8.23s/it, gpt_loss=0.238, loss_mean=0.259][A[A
+
+Train step of epoch 1:  21%|██        | 1324/6434 [3:06:47<11:41:11,  8.23s/it, gpt_loss=0.305, loss_mean=0.264][A[A
+
+Train step of epoch 1:  21%|██        | 1325/6434 [3:06:47<11:46:16,  8.29s/it, gpt_loss=0.305, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7760
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [0, 9, 4, 9, 0, 3, 9, 4, 3, 2]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  21%|██        | 1325/6434 [3:06:55<11:46:16,  8.29s/it, gpt_loss=0.221, loss_mean=0.259][A[A
+
+Train step of epoch 1:  21%|██        | 1326/6434 [3:06:55<11:51:22,  8.36s/it, gpt_loss=0.221, loss_mean=0.259][A[A
+
+Train step of epoch 1:  21%|██        | 1326/6434 [3:07:03<11:51:22,  8.36s/it, gpt_loss=0.282, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1327/6434 [3:07:03<11:34:37,  8.16s/it, gpt_loss=0.282, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1327/6434 [3:07:11<11:34:37,  8.16s/it, gpt_loss=0.241, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  21%|██        | 1328/6434 [3:07:11<11:31:45,  8.13s/it, gpt_loss=0.241, loss_mean=0.26][A[A
+
+Train step of epoch 1:  21%|██        | 1328/6434 [3:07:20<11:31:45,  8.13s/it, gpt_loss=0.231, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1329/6434 [3:07:20<12:03:13,  8.50s/it, gpt_loss=0.231, loss_mean=0.257][A[A
+
+Train step of epoch 1:  21%|██        | 1329/6434 [3:07:29<12:03:13,  8.50s/it, gpt_loss=0.246, loss_mean=0.256][A[A
+
+Train step of epoch 1:  21%|██        | 1330/6434 [3:07:29<12:10:04,  8.58s/it, gpt_loss=0.246, loss_mean=0.256][A[A
+
+Train step of epoch 1:  21%|██        | 1330/6434 [3:07:37<12:10:04,  8.58s/it, gpt_loss=0.329, loss_mean=0.263][A[A
+
+Train step of epoch 1:  21%|██        | 1331/6434 [3:07:37<11:57:27,  8.44s/it, gpt_loss=0.329, loss_mean=0.263][A[A
+
+Train step of epoch 1:  21%|██        | 1331/6434 [3:07:45<11:57:27,  8.44s/it, gpt_loss=0.294, loss_mean=0.266][A[A
+
+Train step of epoch 1:  21%|██        | 1332/6434 [3:07:45<11:44:14,  8.28s/it, gpt_loss=0.294, loss_mean=0.266][A[A
+
+Train step of epoch 1:  21%|██        | 1332/6434 [3:07:54<11:44:14,  8.28s/it, gpt_loss=0.258, loss_mean=0.265][A[A
+
+Train step of epoch 1:  21%|██        | 1333/6434 [3:07:54<11:52:00,  8.37s/it, gpt_loss=0.258, loss_mean=0.265][A[A
+
+Train step of epoch 1:  21%|██        | 1333/6434 [3:08:02<11:52:00,  8.37s/it, gpt_loss=0.229, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1334/6434 [3:08:02<11:37:49,  8.21s/it, gpt_loss=0.229, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1334/6434 [3:08:11<11:37:49,  8.21s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+
+Train step of epoch 1:  21%|██        | 1335/6434 [3:08:11<12:13:55,  8.64s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+[LID Router Debug] Step: 7770
+Batch Size: 10
+Audio Batch Size: 80
+LID Assignments: [1, 4, 2, 0, 9, 4, 9, 1, 6, 0]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+
+Train step of epoch 1:  21%|██        | 1335/6434 [3:08:19<12:13:55,  8.64s/it, gpt_loss=0.248, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1336/6434 [3:08:19<12:01:59,  8.50s/it, gpt_loss=0.248, loss_mean=0.262][A[A
+
+Train step of epoch 1:  21%|██        | 1336/6434 [3:08:27<12:01:59,  8.50s/it, gpt_loss=0.458, loss_mean=0.282][A[A
+
+Train step of epoch 1:  21%|██        | 1337/6434 [3:08:27<11:46:31,  8.32s/it, gpt_loss=0.458, loss_mean=0.282][A[A
+
+Train step of epoch 1:  21%|██        | 1337/6434 [3:08:35<11:46:31,  8.32s/it, gpt_loss=0.32, loss_mean=0.286] [A[A
+
+Train step of epoch 1:  21%|██        | 1338/6434 [3:08:35<11:29:51,  8.12s/it, gpt_loss=0.32, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██        | 1338/6434 [3:08:43<11:29:51,  8.12s/it, gpt_loss=0.279, loss_mean=0.285][A[A
+
+Train step of epoch 1:  21%|██        | 1339/6434 [3:08:43<11:17:38,  7.98s/it, gpt_loss=0.279, loss_mean=0.285][A[A
+
+Train step of epoch 1:  21%|██        | 1339/6434 [3:08:50<11:17:38,  7.98s/it, gpt_loss=0.32, loss_mean=0.288] [A[A
+
+Train step of epoch 1:  21%|██        | 1340/6434 [3:08:50<11:11:57,  7.91s/it, gpt_loss=0.32, loss_mean=0.288][A[A
+
+Train step of epoch 1:  21%|██        | 1340/6434 [3:09:00<11:11:57,  7.91s/it, gpt_loss=0.233, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1341/6434 [3:09:00<11:44:45,  8.30s/it, gpt_loss=0.233, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1341/6434 [3:09:07<11:44:45,  8.30s/it, gpt_loss=0.243, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██        | 1342/6434 [3:09:07<11:30:35,  8.14s/it, gpt_loss=0.243, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██        | 1342/6434 [3:09:15<11:30:35,  8.14s/it, gpt_loss=0.239, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1343/6434 [3:09:15<11:18:20,  7.99s/it, gpt_loss=0.239, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1343/6434 [3:09:22<11:18:20,  7.99s/it, gpt_loss=0.252, loss_mean=0.273][A[A
+
+Train step of epoch 1:  21%|██        | 1344/6434 [3:09:22<10:50:33,  7.67s/it, gpt_loss=0.252, loss_mean=0.273][A[A
+
+Train step of epoch 1:  21%|██        | 1344/6434 [3:09:29<10:50:33,  7.67s/it, gpt_loss=0.217, loss_mean=0.267][A[A
+
+Train step of epoch 1:  21%|██        | 1345/6434 [3:09:29<10:40:38,  7.55s/it, gpt_loss=0.217, loss_mean=0.267][A[A
+[LID Router Debug] Step: 7780
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [9, 4, 0, 5, 9, 2, 4, 5, 3, 6]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  21%|██        | 1345/6434 [3:09:38<10:40:38,  7.55s/it, gpt_loss=0.313, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1346/6434 [3:09:38<11:08:00,  7.88s/it, gpt_loss=0.313, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1346/6434 [3:09:45<11:08:00,  7.88s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1347/6434 [3:09:45<10:52:01,  7.69s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1347/6434 [3:09:53<10:52:01,  7.69s/it, gpt_loss=0.331, loss_mean=0.278][A[A
+
+Train step of epoch 1:  21%|██        | 1348/6434 [3:09:53<11:01:14,  7.80s/it, gpt_loss=0.331, loss_mean=0.278][A[A
+
+Train step of epoch 1:  21%|██        | 1348/6434 [3:10:02<11:01:14,  7.80s/it, gpt_loss=0.248, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1349/6434 [3:10:02<11:29:16,  8.13s/it, gpt_loss=0.248, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1349/6434 [3:10:11<11:29:16,  8.13s/it, gpt_loss=0.284, loss_mean=0.276][A[A
+
+Train step of epoch 1:  21%|██        | 1350/6434 [3:10:11<11:57:37,  8.47s/it, gpt_loss=0.284, loss_mean=0.276][A[A
+
+Train step of epoch 1:  21%|██        | 1350/6434 [3:10:20<11:57:37,  8.47s/it, gpt_loss=0.326, loss_mean=0.281][A[A
+
+Train step of epoch 1:  21%|██        | 1351/6434 [3:10:20<12:10:03,  8.62s/it, gpt_loss=0.326, loss_mean=0.281][A[A
+
+Train step of epoch 1:  21%|██        | 1351/6434 [3:10:29<12:10:03,  8.62s/it, gpt_loss=0.306, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1352/6434 [3:10:29<12:00:04,  8.50s/it, gpt_loss=0.306, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1352/6434 [3:10:37<12:00:04,  8.50s/it, gpt_loss=0.245, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██        | 1353/6434 [3:10:37<12:08:19,  8.60s/it, gpt_loss=0.245, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██        | 1353/6434 [3:10:45<12:08:19,  8.60s/it, gpt_loss=0.23, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  21%|██        | 1354/6434 [3:10:45<11:51:57,  8.41s/it, gpt_loss=0.23, loss_mean=0.274][A[A
+
+Train step of epoch 1:  21%|██        | 1354/6434 [3:10:54<11:51:57,  8.41s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1355/6434 [3:10:54<11:49:31,  8.38s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+[LID Router Debug] Step: 7790
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 2, 4, 4, 3, 4, 5, 9, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  21%|██        | 1355/6434 [3:11:02<11:49:31,  8.38s/it, gpt_loss=0.27, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  21%|██        | 1356/6434 [3:11:02<11:58:07,  8.49s/it, gpt_loss=0.27, loss_mean=0.274][A[A
+
+Train step of epoch 1:  21%|██        | 1356/6434 [3:11:11<11:58:07,  8.49s/it, gpt_loss=0.33, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  21%|██        | 1357/6434 [3:11:11<11:57:09,  8.48s/it, gpt_loss=0.33, loss_mean=0.28][A[A
+
+Train step of epoch 1:  21%|██        | 1357/6434 [3:11:19<11:57:09,  8.48s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1358/6434 [3:11:19<11:38:32,  8.26s/it, gpt_loss=0.234, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1358/6434 [3:11:27<11:38:32,  8.26s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1359/6434 [3:11:27<11:51:20,  8.41s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██        | 1359/6434 [3:11:37<11:51:20,  8.41s/it, gpt_loss=0.246, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1360/6434 [3:11:37<12:21:57,  8.77s/it, gpt_loss=0.246, loss_mean=0.272][A[A
+
+Train step of epoch 1:  21%|██        | 1360/6434 [3:11:46<12:21:57,  8.77s/it, gpt_loss=0.386, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1361/6434 [3:11:46<12:32:49,  8.90s/it, gpt_loss=0.386, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1361/6434 [3:11:54<12:32:49,  8.90s/it, gpt_loss=0.282, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1362/6434 [3:11:54<12:03:54,  8.56s/it, gpt_loss=0.282, loss_mean=0.283][A[A
+
+Train step of epoch 1:  21%|██        | 1362/6434 [3:12:03<12:03:54,  8.56s/it, gpt_loss=0.333, loss_mean=0.288][A[A
+
+Train step of epoch 1:  21%|██        | 1363/6434 [3:12:03<12:03:52,  8.56s/it, gpt_loss=0.333, loss_mean=0.288][A[A
+
+Train step of epoch 1:  21%|██        | 1363/6434 [3:12:12<12:03:52,  8.56s/it, gpt_loss=0.294, loss_mean=0.289][A[A
+
+Train step of epoch 1:  21%|██        | 1364/6434 [3:12:12<12:36:40,  8.95s/it, gpt_loss=0.294, loss_mean=0.289][A[A
+
+Train step of epoch 1:  21%|██        | 1364/6434 [3:12:21<12:36:40,  8.95s/it, gpt_loss=0.193, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██        | 1365/6434 [3:12:21<12:21:03,  8.77s/it, gpt_loss=0.193, loss_mean=0.279][A[A
+[LID Router Debug] Step: 7800
+Batch Size: 10
+Audio Batch Size: 80
+LID Assignments: [0, 2, 4, 0, 9, 0, 1, 1, 9, 0]
+Active Experts in Batch: {0, 1, 2, 4, 9}
+[2026-02-07 10:14:15,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=0, lr=[1.3148279753696618e-05, 1.3148279753696618e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 10:14:15,304] [INFO] [timer.py:260:stop] epoch=0/micro_step=7800/global_step=3900, RunningAvgSamplesPerSec=4.745956287405144, CurrSamplesPerSec=5.05737325729998, MemAllocated=12.36GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  21%|██        | 1365/6434 [3:12:28<12:21:03,  8.77s/it, gpt_loss=0.345, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██        | 1366/6434 [3:12:28<11:48:46,  8.39s/it, gpt_loss=0.345, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██        | 1366/6434 [3:12:37<11:48:46,  8.39s/it, gpt_loss=0.249, loss_mean=0.282][A[A
+
+Train step of epoch 1:  21%|██        | 1367/6434 [3:12:37<12:02:02,  8.55s/it, gpt_loss=0.249, loss_mean=0.282][A[A
+
+Train step of epoch 1:  21%|██        | 1367/6434 [3:12:45<12:02:02,  8.55s/it, gpt_loss=0.298, loss_mean=0.284][A[A
+
+Train step of epoch 1:  21%|██▏       | 1368/6434 [3:12:45<11:39:43,  8.29s/it, gpt_loss=0.298, loss_mean=0.284][A[A
+
+Train step of epoch 1:  21%|██▏       | 1368/6434 [3:12:53<11:39:43,  8.29s/it, gpt_loss=0.184, loss_mean=0.274][A[A
+
+Train step of epoch 1:  21%|██▏       | 1369/6434 [3:12:53<11:30:45,  8.18s/it, gpt_loss=0.184, loss_mean=0.274][A[A
+
+Train step of epoch 1:  21%|██▏       | 1369/6434 [3:13:00<11:30:45,  8.18s/it, gpt_loss=0.287, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1370/6434 [3:13:00<11:18:46,  8.04s/it, gpt_loss=0.287, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1370/6434 [3:13:09<11:18:46,  8.04s/it, gpt_loss=0.321, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  21%|██▏       | 1371/6434 [3:13:09<11:21:55,  8.08s/it, gpt_loss=0.321, loss_mean=0.28][A[A
+
+Train step of epoch 1:  21%|██▏       | 1371/6434 [3:13:16<11:21:55,  8.08s/it, gpt_loss=0.346, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1372/6434 [3:13:16<11:12:15,  7.97s/it, gpt_loss=0.346, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1372/6434 [3:13:24<11:12:15,  7.97s/it, gpt_loss=0.282, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1373/6434 [3:13:24<11:08:52,  7.93s/it, gpt_loss=0.282, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1373/6434 [3:13:32<11:08:52,  7.93s/it, gpt_loss=0.194, loss_mean=0.277][A[A
+
+Train step of epoch 1:  21%|██▏       | 1374/6434 [3:13:32<11:04:10,  7.88s/it, gpt_loss=0.194, loss_mean=0.277][A[A
+
+Train step of epoch 1:  21%|██▏       | 1374/6434 [3:13:41<11:04:10,  7.88s/it, gpt_loss=0.303, loss_mean=0.279][A[A
+
+Train step of epoch 1:  21%|██▏       | 1375/6434 [3:13:41<11:33:28,  8.22s/it, gpt_loss=0.303, loss_mean=0.279][A[A
+[LID Router Debug] Step: 7810
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [5, 6, 0, 1, 4, 2, 4, 5, 1, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+
+Train step of epoch 1:  21%|██▏       | 1375/6434 [3:13:49<11:33:28,  8.22s/it, gpt_loss=0.198, loss_mean=0.271][A[A
+
+Train step of epoch 1:  21%|██▏       | 1376/6434 [3:13:49<11:32:45,  8.22s/it, gpt_loss=0.198, loss_mean=0.271][A[A
+
+Train step of epoch 1:  21%|██▏       | 1376/6434 [3:13:57<11:32:45,  8.22s/it, gpt_loss=0.307, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1377/6434 [3:13:57<11:21:30,  8.09s/it, gpt_loss=0.307, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1377/6434 [3:14:05<11:21:30,  8.09s/it, gpt_loss=0.39, loss_mean=0.286] [A[A
+
+Train step of epoch 1:  21%|██▏       | 1378/6434 [3:14:05<11:14:20,  8.00s/it, gpt_loss=0.39, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1378/6434 [3:14:13<11:14:20,  8.00s/it, gpt_loss=0.33, loss_mean=0.291][A[A
+
+Train step of epoch 1:  21%|██▏       | 1379/6434 [3:14:13<11:16:11,  8.03s/it, gpt_loss=0.33, loss_mean=0.291][A[A
+
+Train step of epoch 1:  21%|██▏       | 1379/6434 [3:14:22<11:16:11,  8.03s/it, gpt_loss=0.243, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1380/6434 [3:14:22<11:37:12,  8.28s/it, gpt_loss=0.243, loss_mean=0.286][A[A
+
+Train step of epoch 1:  21%|██▏       | 1380/6434 [3:14:30<11:37:12,  8.28s/it, gpt_loss=0.185, loss_mean=0.276][A[A
+
+Train step of epoch 1:  21%|██▏       | 1381/6434 [3:14:30<11:34:59,  8.25s/it, gpt_loss=0.185, loss_mean=0.276][A[A
+
+Train step of epoch 1:  21%|██▏       | 1381/6434 [3:14:38<11:34:59,  8.25s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1382/6434 [3:14:38<11:19:35,  8.07s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  21%|██▏       | 1382/6434 [3:14:45<11:19:35,  8.07s/it, gpt_loss=0.219, loss_mean=0.269][A[A
+
+Train step of epoch 1:  21%|██▏       | 1383/6434 [3:14:45<11:04:39,  7.90s/it, gpt_loss=0.219, loss_mean=0.269][A[A
+
+Train step of epoch 1:  21%|██▏       | 1383/6434 [3:14:53<11:04:39,  7.90s/it, gpt_loss=0.292, loss_mean=0.271][A[A
+
+Train step of epoch 1:  22%|██▏       | 1384/6434 [3:14:53<10:58:50,  7.83s/it, gpt_loss=0.292, loss_mean=0.271][A[A
+
+Train step of epoch 1:  22%|██▏       | 1384/6434 [3:15:00<10:58:50,  7.83s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+
+Train step of epoch 1:  22%|██▏       | 1385/6434 [3:15:00<10:50:52,  7.73s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+[LID Router Debug] Step: 7820
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [4, 9, 4, 9, 4, 2, 9, 1, 2, 2]
+Active Experts in Batch: {9, 2, 4, 1}
+
+
+Train step of epoch 1:  22%|██▏       | 1385/6434 [3:15:08<10:50:52,  7.73s/it, gpt_loss=0.209, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1386/6434 [3:15:08<11:02:53,  7.88s/it, gpt_loss=0.209, loss_mean=0.26][A[A
+
+Train step of epoch 1:  22%|██▏       | 1386/6434 [3:15:17<11:02:53,  7.88s/it, gpt_loss=0.267, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1387/6434 [3:15:17<11:22:24,  8.11s/it, gpt_loss=0.267, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1387/6434 [3:15:26<11:22:24,  8.11s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1388/6434 [3:15:26<11:30:28,  8.21s/it, gpt_loss=0.266, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1388/6434 [3:15:33<11:30:28,  8.21s/it, gpt_loss=0.255, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1389/6434 [3:15:33<11:20:18,  8.09s/it, gpt_loss=0.255, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1389/6434 [3:15:42<11:20:18,  8.09s/it, gpt_loss=0.236, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1390/6434 [3:15:42<11:43:25,  8.37s/it, gpt_loss=0.236, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1390/6434 [3:15:52<11:43:25,  8.37s/it, gpt_loss=0.258, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1391/6434 [3:15:52<12:08:15,  8.66s/it, gpt_loss=0.258, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1391/6434 [3:16:00<12:08:15,  8.66s/it, gpt_loss=0.218, loss_mean=0.254][A[A
+
+Train step of epoch 1:  22%|██▏       | 1392/6434 [3:16:00<12:02:04,  8.59s/it, gpt_loss=0.218, loss_mean=0.254][A[A
+
+Train step of epoch 1:  22%|██▏       | 1392/6434 [3:16:08<12:02:04,  8.59s/it, gpt_loss=0.257, loss_mean=0.254][A[A
+
+Train step of epoch 1:  22%|██▏       | 1393/6434 [3:16:08<11:44:50,  8.39s/it, gpt_loss=0.257, loss_mean=0.254][A[A
+
+Train step of epoch 1:  22%|██▏       | 1393/6434 [3:16:16<11:44:50,  8.39s/it, gpt_loss=0.266, loss_mean=0.256][A[A
+
+Train step of epoch 1:  22%|██▏       | 1394/6434 [3:16:16<11:26:34,  8.17s/it, gpt_loss=0.266, loss_mean=0.256][A[A
+
+Train step of epoch 1:  22%|██▏       | 1394/6434 [3:16:26<11:26:34,  8.17s/it, gpt_loss=0.263, loss_mean=0.256][A[A
+
+Train step of epoch 1:  22%|██▏       | 1395/6434 [3:16:26<12:27:54,  8.91s/it, gpt_loss=0.263, loss_mean=0.256][A[A
+[LID Router Debug] Step: 7830
+Batch Size: 10
+Audio Batch Size: 152
+LID Assignments: [2, 9, 2, 5, 2, 8, 3, 3, 3, 2]
+Active Experts in Batch: {2, 3, 5, 8, 9}
+
+
+Train step of epoch 1:  22%|██▏       | 1395/6434 [3:16:36<12:27:54,  8.91s/it, gpt_loss=0.25, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1396/6434 [3:16:36<12:48:56,  9.16s/it, gpt_loss=0.25, loss_mean=0.256][A[A
+
+Train step of epoch 1:  22%|██▏       | 1396/6434 [3:16:43<12:48:56,  9.16s/it, gpt_loss=0.264, loss_mean=0.257][A[A
+
+Train step of epoch 1:  22%|██▏       | 1397/6434 [3:16:43<12:02:20,  8.60s/it, gpt_loss=0.264, loss_mean=0.257][A[A
+
+Train step of epoch 1:  22%|██▏       | 1397/6434 [3:16:53<12:02:20,  8.60s/it, gpt_loss=0.312, loss_mean=0.262][A[A
+
+Train step of epoch 1:  22%|██▏       | 1398/6434 [3:16:53<12:20:49,  8.83s/it, gpt_loss=0.312, loss_mean=0.262][A[A
+
+Train step of epoch 1:  22%|██▏       | 1398/6434 [3:17:02<12:20:49,  8.83s/it, gpt_loss=0.32, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1399/6434 [3:17:02<12:20:05,  8.82s/it, gpt_loss=0.32, loss_mean=0.268][A[A
+
+Train step of epoch 1:  22%|██▏       | 1399/6434 [3:17:12<12:20:05,  8.82s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1400/6434 [3:17:12<12:54:09,  9.23s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1400/6434 [3:17:20<12:54:09,  9.23s/it, gpt_loss=0.265, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1401/6434 [3:17:20<12:42:02,  9.08s/it, gpt_loss=0.265, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1401/6434 [3:17:29<12:42:02,  9.08s/it, gpt_loss=0.22, loss_mean=0.26]  [A[A
+
+Train step of epoch 1:  22%|██▏       | 1402/6434 [3:17:29<12:23:02,  8.86s/it, gpt_loss=0.22, loss_mean=0.26][A[A
+
+Train step of epoch 1:  22%|██▏       | 1402/6434 [3:17:38<12:23:02,  8.86s/it, gpt_loss=0.239, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1403/6434 [3:17:38<12:37:32,  9.03s/it, gpt_loss=0.239, loss_mean=0.258][A[A
+
+Train step of epoch 1:  22%|██▏       | 1403/6434 [3:17:47<12:37:32,  9.03s/it, gpt_loss=0.295, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1404/6434 [3:17:47<12:20:23,  8.83s/it, gpt_loss=0.295, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1404/6434 [3:17:55<12:20:23,  8.83s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  22%|██▏       | 1405/6434 [3:17:55<12:04:27,  8.64s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+[LID Router Debug] Step: 7840
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [5, 0, 2, 1, 5, 0, 2, 0, 4, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5}
+
+
+Train step of epoch 1:  22%|██▏       | 1405/6434 [3:18:03<12:04:27,  8.64s/it, gpt_loss=0.316, loss_mean=0.267][A[A
+
+Train step of epoch 1:  22%|██▏       | 1406/6434 [3:18:03<11:55:35,  8.54s/it, gpt_loss=0.316, loss_mean=0.267][A[A
+
+Train step of epoch 1:  22%|██▏       | 1406/6434 [3:18:12<11:55:35,  8.54s/it, gpt_loss=0.257, loss_mean=0.266][A[A
+
+Train step of epoch 1:  22%|██▏       | 1407/6434 [3:18:12<12:00:45,  8.60s/it, gpt_loss=0.257, loss_mean=0.266][A[A
+
+Train step of epoch 1:  22%|██▏       | 1407/6434 [3:18:20<12:00:45,  8.60s/it, gpt_loss=0.295, loss_mean=0.269][A[A
+
+Train step of epoch 1:  22%|██▏       | 1408/6434 [3:18:20<11:57:43,  8.57s/it, gpt_loss=0.295, loss_mean=0.269][A[A
+
+Train step of epoch 1:  22%|██▏       | 1408/6434 [3:18:28<11:57:43,  8.57s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:  22%|██▏       | 1409/6434 [3:18:28<11:36:26,  8.32s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:  22%|██▏       | 1409/6434 [3:18:36<11:36:26,  8.32s/it, gpt_loss=0.214, loss_mean=0.265][A[A
+
+Train step of epoch 1:  22%|██▏       | 1410/6434 [3:18:36<11:24:22,  8.17s/it, gpt_loss=0.214, loss_mean=0.265][A[A
+
+Train step of epoch 1:  22%|██▏       | 1410/6434 [3:18:45<11:24:22,  8.17s/it, gpt_loss=0.264, loss_mean=0.265][A[A
+
+Train step of epoch 1:  22%|██▏       | 1411/6434 [3:18:45<11:36:11,  8.32s/it, gpt_loss=0.264, loss_mean=0.265][A[A
+
+Train step of epoch 1:  22%|██▏       | 1411/6434 [3:18:52<11:36:11,  8.32s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1412/6434 [3:18:52<11:15:46,  8.07s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  22%|██▏       | 1412/6434 [3:19:01<11:15:46,  8.07s/it, gpt_loss=0.236, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1413/6434 [3:19:01<11:39:53,  8.36s/it, gpt_loss=0.236, loss_mean=0.261][A[A
+
+Train step of epoch 1:  22%|██▏       | 1413/6434 [3:19:09<11:39:53,  8.36s/it, gpt_loss=0.349, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1414/6434 [3:19:09<11:30:55,  8.26s/it, gpt_loss=0.349, loss_mean=0.27][A[A
+
+Train step of epoch 1:  22%|██▏       | 1414/6434 [3:19:17<11:30:55,  8.26s/it, gpt_loss=0.24, loss_mean=0.267][A[A
+
+Train step of epoch 1:  22%|██▏       | 1415/6434 [3:19:17<11:30:06,  8.25s/it, gpt_loss=0.24, loss_mean=0.267][A[A
+[LID Router Debug] Step: 7850
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [2, 9, 0, 0, 5, 5, 6, 4, 4, 2]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  22%|██▏       | 1415/6434 [3:19:27<11:30:06,  8.25s/it, gpt_loss=0.388, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1416/6434 [3:19:27<12:16:07,  8.80s/it, gpt_loss=0.388, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1416/6434 [3:19:36<12:16:07,  8.80s/it, gpt_loss=0.252, loss_mean=0.276][A[A
+
+Train step of epoch 1:  22%|██▏       | 1417/6434 [3:19:36<12:21:00,  8.86s/it, gpt_loss=0.252, loss_mean=0.276][A[A
+
+Train step of epoch 1:  22%|██▏       | 1417/6434 [3:19:45<12:21:00,  8.86s/it, gpt_loss=0.301, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1418/6434 [3:19:45<12:25:21,  8.92s/it, gpt_loss=0.301, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1418/6434 [3:19:55<12:25:21,  8.92s/it, gpt_loss=0.233, loss_mean=0.274][A[A
+
+Train step of epoch 1:  22%|██▏       | 1419/6434 [3:19:55<12:39:15,  9.08s/it, gpt_loss=0.233, loss_mean=0.274][A[A
+
+Train step of epoch 1:  22%|██▏       | 1419/6434 [3:20:04<12:39:15,  9.08s/it, gpt_loss=0.324, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1420/6434 [3:20:04<12:37:38,  9.07s/it, gpt_loss=0.324, loss_mean=0.279][A[A
+
+Train step of epoch 1:  22%|██▏       | 1420/6434 [3:20:13<12:37:38,  9.07s/it, gpt_loss=0.283, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1421/6434 [3:20:13<12:27:03,  8.94s/it, gpt_loss=0.283, loss_mean=0.28][A[A
+
+Train step of epoch 1:  22%|██▏       | 1421/6434 [3:20:20<12:27:03,  8.94s/it, gpt_loss=0.329, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1422/6434 [3:20:20<11:49:47,  8.50s/it, gpt_loss=0.329, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1422/6434 [3:20:29<11:49:47,  8.50s/it, gpt_loss=0.333, loss_mean=0.289][A[A
+
+Train step of epoch 1:  22%|██▏       | 1423/6434 [3:20:29<11:58:55,  8.61s/it, gpt_loss=0.333, loss_mean=0.289][A[A
+
+Train step of epoch 1:  22%|██▏       | 1423/6434 [3:20:40<11:58:55,  8.61s/it, gpt_loss=0.256, loss_mean=0.286][A[A
+
+Train step of epoch 1:  22%|██▏       | 1424/6434 [3:20:40<12:53:20,  9.26s/it, gpt_loss=0.256, loss_mean=0.286][A[A
+
+Train step of epoch 1:  22%|██▏       | 1424/6434 [3:20:47<12:53:20,  9.26s/it, gpt_loss=0.429, loss_mean=0.3]  [A[A
+
+Train step of epoch 1:  22%|██▏       | 1425/6434 [3:20:47<12:13:04,  8.78s/it, gpt_loss=0.429, loss_mean=0.3][A[A
+[LID Router Debug] Step: 7860
+Batch Size: 10
+Audio Batch Size: 157
+LID Assignments: [3, 1, 9, 3, 6, 1, 0, 3, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 6, 9}
+
+
+Train step of epoch 1:  22%|██▏       | 1425/6434 [3:20:57<12:13:04,  8.78s/it, gpt_loss=0.313, loss_mean=0.301][A[A
+
+Train step of epoch 1:  22%|██▏       | 1426/6434 [3:20:57<12:38:48,  9.09s/it, gpt_loss=0.313, loss_mean=0.301][A[A
+
+Train step of epoch 1:  22%|██▏       | 1426/6434 [3:21:05<12:38:48,  9.09s/it, gpt_loss=0.284, loss_mean=0.3]  [A[A
+
+Train step of epoch 1:  22%|██▏       | 1427/6434 [3:21:05<12:02:55,  8.66s/it, gpt_loss=0.284, loss_mean=0.3][A[A
+
+Train step of epoch 1:  22%|██▏       | 1427/6434 [3:21:13<12:02:55,  8.66s/it, gpt_loss=0.306, loss_mean=0.3][A[A
+
+Train step of epoch 1:  22%|██▏       | 1428/6434 [3:21:13<11:56:37,  8.59s/it, gpt_loss=0.306, loss_mean=0.3][A[A
+
+Train step of epoch 1:  22%|██▏       | 1428/6434 [3:21:21<11:56:37,  8.59s/it, gpt_loss=0.252, loss_mean=0.296][A[A
+
+Train step of epoch 1:  22%|██▏       | 1429/6434 [3:21:21<11:45:18,  8.46s/it, gpt_loss=0.252, loss_mean=0.296][A[A
+
+Train step of epoch 1:  22%|██▏       | 1429/6434 [3:21:31<11:45:18,  8.46s/it, gpt_loss=0.272, loss_mean=0.293][A[A
+
+Train step of epoch 1:  22%|██▏       | 1430/6434 [3:21:31<12:02:01,  8.66s/it, gpt_loss=0.272, loss_mean=0.293][A[A
+
+Train step of epoch 1:  22%|██▏       | 1430/6434 [3:21:39<12:02:01,  8.66s/it, gpt_loss=0.204, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1431/6434 [3:21:39<11:50:16,  8.52s/it, gpt_loss=0.204, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1431/6434 [3:21:47<11:50:16,  8.52s/it, gpt_loss=0.284, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1432/6434 [3:21:47<11:42:57,  8.43s/it, gpt_loss=0.284, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1432/6434 [3:21:54<11:42:57,  8.43s/it, gpt_loss=0.251, loss_mean=0.281][A[A
+
+Train step of epoch 1:  22%|██▏       | 1433/6434 [3:21:54<11:19:00,  8.15s/it, gpt_loss=0.251, loss_mean=0.281][A[A
+
+Train step of epoch 1:  22%|██▏       | 1433/6434 [3:22:04<11:19:00,  8.15s/it, gpt_loss=0.278, loss_mean=0.281][A[A
+
+Train step of epoch 1:  22%|██▏       | 1434/6434 [3:22:04<11:43:53,  8.45s/it, gpt_loss=0.278, loss_mean=0.281][A[A
+
+Train step of epoch 1:  22%|██▏       | 1434/6434 [3:22:11<11:43:53,  8.45s/it, gpt_loss=0.217, loss_mean=0.274][A[A
+
+Train step of epoch 1:  22%|██▏       | 1435/6434 [3:22:11<11:21:35,  8.18s/it, gpt_loss=0.217, loss_mean=0.274][A[A
+[LID Router Debug] Step: 7870
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [5, 9, 4, 1, 1, 3, 0, 1, 4, 6]
+Active Experts in Batch: {0, 1, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  22%|██▏       | 1435/6434 [3:22:19<11:21:35,  8.18s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1436/6434 [3:22:19<11:17:53,  8.14s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1436/6434 [3:22:27<11:17:53,  8.14s/it, gpt_loss=0.231, loss_mean=0.273][A[A
+
+Train step of epoch 1:  22%|██▏       | 1437/6434 [3:22:27<11:05:47,  7.99s/it, gpt_loss=0.231, loss_mean=0.273][A[A
+
+Train step of epoch 1:  22%|██▏       | 1437/6434 [3:22:35<11:05:47,  7.99s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:  22%|██▏       | 1438/6434 [3:22:35<11:07:07,  8.01s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:  22%|██▏       | 1438/6434 [3:22:43<11:07:07,  8.01s/it, gpt_loss=0.25, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1439/6434 [3:22:43<11:13:14,  8.09s/it, gpt_loss=0.25, loss_mean=0.271][A[A
+
+Train step of epoch 1:  22%|██▏       | 1439/6434 [3:22:53<11:13:14,  8.09s/it, gpt_loss=0.227, loss_mean=0.267][A[A
+
+Train step of epoch 1:  22%|██▏       | 1440/6434 [3:22:53<11:46:29,  8.49s/it, gpt_loss=0.227, loss_mean=0.267][A[A
+
+Train step of epoch 1:  22%|██▏       | 1440/6434 [3:23:02<11:46:29,  8.49s/it, gpt_loss=0.319, loss_mean=0.272][A[A
+
+Train step of epoch 1:  22%|██▏       | 1441/6434 [3:23:02<12:13:17,  8.81s/it, gpt_loss=0.319, loss_mean=0.272][A[A
+
+Train step of epoch 1:  22%|██▏       | 1441/6434 [3:23:11<12:13:17,  8.81s/it, gpt_loss=0.323, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1442/6434 [3:23:11<12:19:58,  8.89s/it, gpt_loss=0.323, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1442/6434 [3:23:21<12:19:58,  8.89s/it, gpt_loss=0.274, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1443/6434 [3:23:21<12:36:28,  9.09s/it, gpt_loss=0.274, loss_mean=0.277][A[A
+
+Train step of epoch 1:  22%|██▏       | 1443/6434 [3:23:29<12:36:28,  9.09s/it, gpt_loss=0.27, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  22%|██▏       | 1444/6434 [3:23:29<12:18:56,  8.89s/it, gpt_loss=0.27, loss_mean=0.276][A[A
+
+Train step of epoch 1:  22%|██▏       | 1444/6434 [3:23:38<12:18:56,  8.89s/it, gpt_loss=0.292, loss_mean=0.278][A[A
+
+Train step of epoch 1:  22%|██▏       | 1445/6434 [3:23:38<12:09:21,  8.77s/it, gpt_loss=0.292, loss_mean=0.278][A[A
+[LID Router Debug] Step: 7880
+Batch Size: 10
+Audio Batch Size: 134
+LID Assignments: [3, 5, 0, 9, 5, 3, 1, 6, 3, 9]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  22%|██▏       | 1445/6434 [3:23:47<12:09:21,  8.77s/it, gpt_loss=0.359, loss_mean=0.286][A[A
+
+Train step of epoch 1:  22%|██▏       | 1446/6434 [3:23:47<12:22:07,  8.93s/it, gpt_loss=0.359, loss_mean=0.286][A[A
+
+Train step of epoch 1:  22%|██▏       | 1446/6434 [3:23:54<12:22:07,  8.93s/it, gpt_loss=0.264, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1447/6434 [3:23:54<11:44:28,  8.48s/it, gpt_loss=0.264, loss_mean=0.284][A[A
+
+Train step of epoch 1:  22%|██▏       | 1447/6434 [3:24:02<11:44:28,  8.48s/it, gpt_loss=0.271, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1448/6434 [3:24:02<11:24:59,  8.24s/it, gpt_loss=0.271, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1448/6434 [3:24:11<11:24:59,  8.24s/it, gpt_loss=0.237, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1449/6434 [3:24:11<11:37:22,  8.39s/it, gpt_loss=0.237, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1449/6434 [3:24:20<11:37:22,  8.39s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:  23%|██▎       | 1450/6434 [3:24:20<12:00:03,  8.67s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:  23%|██▎       | 1450/6434 [3:24:29<12:00:03,  8.67s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1451/6434 [3:24:29<11:52:15,  8.58s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1451/6434 [3:24:36<11:52:15,  8.58s/it, gpt_loss=0.219, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1452/6434 [3:24:36<11:30:08,  8.31s/it, gpt_loss=0.219, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1452/6434 [3:24:44<11:30:08,  8.31s/it, gpt_loss=0.412, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1453/6434 [3:24:44<11:11:00,  8.08s/it, gpt_loss=0.412, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1453/6434 [3:24:52<11:11:00,  8.08s/it, gpt_loss=0.264, loss_mean=0.284][A[A
+
+Train step of epoch 1:  23%|██▎       | 1454/6434 [3:24:52<11:14:47,  8.13s/it, gpt_loss=0.264, loss_mean=0.284][A[A
+
+Train step of epoch 1:  23%|██▎       | 1454/6434 [3:25:00<11:14:47,  8.13s/it, gpt_loss=0.28, loss_mean=0.283] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1455/6434 [3:25:00<11:19:23,  8.19s/it, gpt_loss=0.28, loss_mean=0.283][A[A
+[LID Router Debug] Step: 7890
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [1, 1, 6, 4, 0, 2, 5, 1, 4, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6}
+
+
+Train step of epoch 1:  23%|██▎       | 1455/6434 [3:25:08<11:19:23,  8.19s/it, gpt_loss=0.226, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1456/6434 [3:25:08<11:14:04,  8.12s/it, gpt_loss=0.226, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1456/6434 [3:25:16<11:14:04,  8.12s/it, gpt_loss=0.24, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1457/6434 [3:25:16<11:11:57,  8.10s/it, gpt_loss=0.24, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1457/6434 [3:25:25<11:11:57,  8.10s/it, gpt_loss=0.257, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1458/6434 [3:25:25<11:16:57,  8.16s/it, gpt_loss=0.257, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1458/6434 [3:25:33<11:16:57,  8.16s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1459/6434 [3:25:33<11:29:22,  8.31s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1459/6434 [3:25:41<11:29:22,  8.31s/it, gpt_loss=0.22, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1460/6434 [3:25:41<11:22:55,  8.24s/it, gpt_loss=0.22, loss_mean=0.267][A[A
+
+Train step of epoch 1:  23%|██▎       | 1460/6434 [3:25:50<11:22:55,  8.24s/it, gpt_loss=0.304, loss_mean=0.27][A[A
+
+Train step of epoch 1:  23%|██▎       | 1461/6434 [3:25:50<11:35:50,  8.40s/it, gpt_loss=0.304, loss_mean=0.27][A[A
+
+Train step of epoch 1:  23%|██▎       | 1461/6434 [3:25:59<11:35:50,  8.40s/it, gpt_loss=0.319, loss_mean=0.275][A[A
+
+Train step of epoch 1:  23%|██▎       | 1462/6434 [3:25:59<11:44:37,  8.50s/it, gpt_loss=0.319, loss_mean=0.275][A[A
+
+Train step of epoch 1:  23%|██▎       | 1462/6434 [3:26:07<11:44:37,  8.50s/it, gpt_loss=0.302, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1463/6434 [3:26:07<11:24:21,  8.26s/it, gpt_loss=0.302, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1463/6434 [3:26:14<11:24:21,  8.26s/it, gpt_loss=0.221, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1464/6434 [3:26:14<11:00:17,  7.97s/it, gpt_loss=0.221, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1464/6434 [3:26:22<11:00:17,  7.97s/it, gpt_loss=0.294, loss_mean=0.275][A[A
+
+Train step of epoch 1:  23%|██▎       | 1465/6434 [3:26:22<10:55:39,  7.92s/it, gpt_loss=0.294, loss_mean=0.275][A[A
+[LID Router Debug] Step: 7900
+Batch Size: 10
+Audio Batch Size: 162
+LID Assignments: [1, 5, 3, 9, 3, 1, 2, 3, 2, 2]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  23%|██▎       | 1465/6434 [3:26:31<10:55:39,  7.92s/it, gpt_loss=0.268, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1466/6434 [3:26:31<11:36:26,  8.41s/it, gpt_loss=0.268, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1466/6434 [3:26:40<11:36:26,  8.41s/it, gpt_loss=0.308, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1467/6434 [3:26:40<11:34:37,  8.39s/it, gpt_loss=0.308, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1467/6434 [3:26:47<11:34:37,  8.39s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1468/6434 [3:26:47<11:05:42,  8.04s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1468/6434 [3:26:56<11:05:42,  8.04s/it, gpt_loss=0.293, loss_mean=0.279][A[A
+
+Train step of epoch 1:  23%|██▎       | 1469/6434 [3:26:56<11:29:32,  8.33s/it, gpt_loss=0.293, loss_mean=0.279][A[A
+
+Train step of epoch 1:  23%|██▎       | 1469/6434 [3:27:05<11:29:32,  8.33s/it, gpt_loss=0.301, loss_mean=0.281][A[A
+
+Train step of epoch 1:  23%|██▎       | 1470/6434 [3:27:05<11:53:58,  8.63s/it, gpt_loss=0.301, loss_mean=0.281][A[A
+
+Train step of epoch 1:  23%|██▎       | 1470/6434 [3:27:13<11:53:58,  8.63s/it, gpt_loss=0.37, loss_mean=0.29]  [A[A
+
+Train step of epoch 1:  23%|██▎       | 1471/6434 [3:27:13<11:29:19,  8.33s/it, gpt_loss=0.37, loss_mean=0.29][A[A
+
+Train step of epoch 1:  23%|██▎       | 1471/6434 [3:27:21<11:29:19,  8.33s/it, gpt_loss=0.212, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1472/6434 [3:27:21<11:17:11,  8.19s/it, gpt_loss=0.212, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1472/6434 [3:27:28<11:17:11,  8.19s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1473/6434 [3:27:28<11:04:21,  8.04s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1473/6434 [3:27:37<11:04:21,  8.04s/it, gpt_loss=0.319, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1474/6434 [3:27:37<11:24:13,  8.28s/it, gpt_loss=0.319, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1474/6434 [3:27:46<11:24:13,  8.28s/it, gpt_loss=0.256, loss_mean=0.283][A[A
+
+Train step of epoch 1:  23%|██▎       | 1475/6434 [3:27:46<11:47:52,  8.56s/it, gpt_loss=0.256, loss_mean=0.283][A[A
+[LID Router Debug] Step: 7910
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [5, 2, 5, 0, 3, 6, 4, 3, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  23%|██▎       | 1475/6434 [3:27:55<11:47:52,  8.56s/it, gpt_loss=0.223, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1476/6434 [3:27:55<11:55:07,  8.65s/it, gpt_loss=0.223, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1476/6434 [3:28:04<11:55:07,  8.65s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  23%|██▎       | 1477/6434 [3:28:04<12:02:49,  8.75s/it, gpt_loss=0.264, loss_mean=0.275][A[A
+
+Train step of epoch 1:  23%|██▎       | 1477/6434 [3:28:13<12:02:49,  8.75s/it, gpt_loss=0.226, loss_mean=0.271][A[A
+
+Train step of epoch 1:  23%|██▎       | 1478/6434 [3:28:13<11:53:30,  8.64s/it, gpt_loss=0.226, loss_mean=0.271][A[A
+
+Train step of epoch 1:  23%|██▎       | 1478/6434 [3:28:20<11:53:30,  8.64s/it, gpt_loss=0.284, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1479/6434 [3:28:20<11:27:10,  8.32s/it, gpt_loss=0.284, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1479/6434 [3:28:27<11:27:10,  8.32s/it, gpt_loss=0.28, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1480/6434 [3:28:27<10:48:02,  7.85s/it, gpt_loss=0.28, loss_mean=0.273][A[A
+
+Train step of epoch 1:  23%|██▎       | 1480/6434 [3:28:36<10:48:02,  7.85s/it, gpt_loss=0.262, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1481/6434 [3:28:36<11:06:02,  8.07s/it, gpt_loss=0.262, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1481/6434 [3:28:43<11:06:02,  8.07s/it, gpt_loss=0.18, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1482/6434 [3:28:43<10:58:31,  7.98s/it, gpt_loss=0.18, loss_mean=0.262][A[A
+
+Train step of epoch 1:  23%|██▎       | 1482/6434 [3:28:52<10:58:31,  7.98s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  23%|██▎       | 1483/6434 [3:28:52<11:22:40,  8.27s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  23%|██▎       | 1483/6434 [3:29:00<11:22:40,  8.27s/it, gpt_loss=0.324, loss_mean=0.269][A[A
+
+Train step of epoch 1:  23%|██▎       | 1484/6434 [3:29:00<11:06:18,  8.08s/it, gpt_loss=0.324, loss_mean=0.269][A[A
+
+Train step of epoch 1:  23%|██▎       | 1484/6434 [3:29:08<11:06:18,  8.08s/it, gpt_loss=0.299, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1485/6434 [3:29:08<11:14:59,  8.18s/it, gpt_loss=0.299, loss_mean=0.272][A[A
+[LID Router Debug] Step: 7920
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [6, 2, 9, 0, 1, 0, 9, 1, 3, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  23%|██▎       | 1485/6434 [3:29:16<11:14:59,  8.18s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1486/6434 [3:29:16<11:08:09,  8.10s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1486/6434 [3:29:25<11:08:09,  8.10s/it, gpt_loss=0.326, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1487/6434 [3:29:25<11:12:13,  8.15s/it, gpt_loss=0.326, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1487/6434 [3:29:33<11:12:13,  8.15s/it, gpt_loss=0.316, loss_mean=0.281][A[A
+
+Train step of epoch 1:  23%|██▎       | 1488/6434 [3:29:33<11:16:41,  8.21s/it, gpt_loss=0.316, loss_mean=0.281][A[A
+
+Train step of epoch 1:  23%|██▎       | 1488/6434 [3:29:41<11:16:41,  8.21s/it, gpt_loss=0.248, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1489/6434 [3:29:41<11:25:50,  8.32s/it, gpt_loss=0.248, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1489/6434 [3:29:50<11:25:50,  8.32s/it, gpt_loss=0.275, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1490/6434 [3:29:50<11:36:53,  8.46s/it, gpt_loss=0.275, loss_mean=0.278][A[A
+
+Train step of epoch 1:  23%|██▎       | 1490/6434 [3:29:59<11:36:53,  8.46s/it, gpt_loss=0.236, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1491/6434 [3:29:59<11:32:01,  8.40s/it, gpt_loss=0.236, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1491/6434 [3:30:07<11:32:01,  8.40s/it, gpt_loss=0.278, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1492/6434 [3:30:07<11:27:50,  8.35s/it, gpt_loss=0.278, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1492/6434 [3:30:16<11:27:50,  8.35s/it, gpt_loss=0.305, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1493/6434 [3:30:16<11:46:58,  8.59s/it, gpt_loss=0.305, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1493/6434 [3:30:25<11:46:58,  8.59s/it, gpt_loss=0.34, loss_mean=0.283] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1494/6434 [3:30:25<11:59:42,  8.74s/it, gpt_loss=0.34, loss_mean=0.283][A[A
+
+Train step of epoch 1:  23%|██▎       | 1494/6434 [3:30:34<11:59:42,  8.74s/it, gpt_loss=0.24, loss_mean=0.279][A[A
+
+Train step of epoch 1:  23%|██▎       | 1495/6434 [3:30:34<12:09:18,  8.86s/it, gpt_loss=0.24, loss_mean=0.279][A[A
+[LID Router Debug] Step: 7930
+Batch Size: 10
+Audio Batch Size: 73
+LID Assignments: [5, 6, 5, 5, 5, 6, 1, 4, 2, 2]
+Active Experts in Batch: {1, 2, 4, 5, 6}
+
+
+Train step of epoch 1:  23%|██▎       | 1495/6434 [3:30:42<12:09:18,  8.86s/it, gpt_loss=0.314, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1496/6434 [3:30:42<11:53:37,  8.67s/it, gpt_loss=0.314, loss_mean=0.282][A[A
+
+Train step of epoch 1:  23%|██▎       | 1496/6434 [3:30:53<11:53:37,  8.67s/it, gpt_loss=0.225, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1497/6434 [3:30:53<12:34:46,  9.17s/it, gpt_loss=0.225, loss_mean=0.277][A[A
+
+Train step of epoch 1:  23%|██▎       | 1497/6434 [3:31:01<12:34:46,  9.17s/it, gpt_loss=0.25, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1498/6434 [3:31:01<12:12:43,  8.91s/it, gpt_loss=0.25, loss_mean=0.274][A[A
+
+Train step of epoch 1:  23%|██▎       | 1498/6434 [3:31:10<12:12:43,  8.91s/it, gpt_loss=0.255, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1499/6434 [3:31:10<12:14:26,  8.93s/it, gpt_loss=0.255, loss_mean=0.272][A[A
+
+Train step of epoch 1:  23%|██▎       | 1499/6434 [3:31:18<12:14:26,  8.93s/it, gpt_loss=0.36, loss_mean=0.281] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1500/6434 [3:31:18<11:56:38,  8.71s/it, gpt_loss=0.36, loss_mean=0.281][A[A
+
+Train step of epoch 1:  23%|██▎       | 1500/6434 [3:31:27<11:56:38,  8.71s/it, gpt_loss=0.359, loss_mean=0.289][A[A
+
+Train step of epoch 1:  23%|██▎       | 1501/6434 [3:31:27<11:48:02,  8.61s/it, gpt_loss=0.359, loss_mean=0.289][A[A
+
+Train step of epoch 1:  23%|██▎       | 1501/6434 [3:31:36<11:48:02,  8.61s/it, gpt_loss=0.276, loss_mean=0.288][A[A
+
+Train step of epoch 1:  23%|██▎       | 1502/6434 [3:31:36<12:04:35,  8.81s/it, gpt_loss=0.276, loss_mean=0.288][A[A
+
+Train step of epoch 1:  23%|██▎       | 1502/6434 [3:31:44<12:04:35,  8.81s/it, gpt_loss=0.299, loss_mean=0.289][A[A
+
+Train step of epoch 1:  23%|██▎       | 1503/6434 [3:31:44<11:40:42,  8.53s/it, gpt_loss=0.299, loss_mean=0.289][A[A
+
+Train step of epoch 1:  23%|██▎       | 1503/6434 [3:31:52<11:40:42,  8.53s/it, gpt_loss=0.317, loss_mean=0.292][A[A
+
+Train step of epoch 1:  23%|██▎       | 1504/6434 [3:31:52<11:31:28,  8.42s/it, gpt_loss=0.317, loss_mean=0.292][A[A
+
+Train step of epoch 1:  23%|██▎       | 1504/6434 [3:32:00<11:31:28,  8.42s/it, gpt_loss=0.232, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1505/6434 [3:32:00<11:25:25,  8.34s/it, gpt_loss=0.232, loss_mean=0.286][A[A
+[LID Router Debug] Step: 7940
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [5, 9, 2, 0, 9, 5, 9, 1, 2, 9]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:  23%|██▎       | 1505/6434 [3:32:08<11:25:25,  8.34s/it, gpt_loss=0.312, loss_mean=0.288][A[A
+
+Train step of epoch 1:  23%|██▎       | 1506/6434 [3:32:08<11:14:51,  8.22s/it, gpt_loss=0.312, loss_mean=0.288][A[A
+
+Train step of epoch 1:  23%|██▎       | 1506/6434 [3:32:16<11:14:51,  8.22s/it, gpt_loss=0.269, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1507/6434 [3:32:16<11:22:13,  8.31s/it, gpt_loss=0.269, loss_mean=0.286][A[A
+
+Train step of epoch 1:  23%|██▎       | 1507/6434 [3:32:25<11:22:13,  8.31s/it, gpt_loss=0.258, loss_mean=0.283][A[A
+
+Train step of epoch 1:  23%|██▎       | 1508/6434 [3:32:25<11:20:55,  8.29s/it, gpt_loss=0.258, loss_mean=0.283][A[A
+
+Train step of epoch 1:  23%|██▎       | 1508/6434 [3:32:32<11:20:55,  8.29s/it, gpt_loss=0.29, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  23%|██▎       | 1509/6434 [3:32:32<11:06:09,  8.12s/it, gpt_loss=0.29, loss_mean=0.284][A[A
+
+Train step of epoch 1:  23%|██▎       | 1509/6434 [3:32:40<11:06:09,  8.12s/it, gpt_loss=0.468, loss_mean=0.303][A[A
+
+Train step of epoch 1:  23%|██▎       | 1510/6434 [3:32:40<11:02:52,  8.08s/it, gpt_loss=0.468, loss_mean=0.303][A[A
+
+Train step of epoch 1:  23%|██▎       | 1510/6434 [3:32:48<11:02:52,  8.08s/it, gpt_loss=0.319, loss_mean=0.304][A[A
+
+Train step of epoch 1:  23%|██▎       | 1511/6434 [3:32:48<10:59:26,  8.04s/it, gpt_loss=0.319, loss_mean=0.304][A[A
+
+Train step of epoch 1:  23%|██▎       | 1511/6434 [3:32:57<10:59:26,  8.04s/it, gpt_loss=0.334, loss_mean=0.307][A[A
+
+Train step of epoch 1:  24%|██▎       | 1512/6434 [3:32:57<11:08:32,  8.15s/it, gpt_loss=0.334, loss_mean=0.307][A[A
+
+Train step of epoch 1:  24%|██▎       | 1512/6434 [3:33:04<11:08:32,  8.15s/it, gpt_loss=0.274, loss_mean=0.304][A[A
+
+Train step of epoch 1:  24%|██▎       | 1513/6434 [3:33:04<10:50:53,  7.94s/it, gpt_loss=0.274, loss_mean=0.304][A[A
+
+Train step of epoch 1:  24%|██▎       | 1513/6434 [3:33:13<10:50:53,  7.94s/it, gpt_loss=0.304, loss_mean=0.304][A[A
+
+Train step of epoch 1:  24%|██▎       | 1514/6434 [3:33:13<11:20:54,  8.30s/it, gpt_loss=0.304, loss_mean=0.304][A[A
+
+Train step of epoch 1:  24%|██▎       | 1514/6434 [3:33:21<11:20:54,  8.30s/it, gpt_loss=0.296, loss_mean=0.303][A[A
+
+Train step of epoch 1:  24%|██▎       | 1515/6434 [3:33:21<11:12:16,  8.20s/it, gpt_loss=0.296, loss_mean=0.303][A[A
+[LID Router Debug] Step: 7950
+Batch Size: 10
+Audio Batch Size: 74
+LID Assignments: [4, 4, 9, 5, 4, 2, 2, 9, 1, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  24%|██▎       | 1515/6434 [3:33:30<11:12:16,  8.20s/it, gpt_loss=0.28, loss_mean=0.301] [A[A
+
+Train step of epoch 1:  24%|██▎       | 1516/6434 [3:33:30<11:19:39,  8.29s/it, gpt_loss=0.28, loss_mean=0.301][A[A
+
+Train step of epoch 1:  24%|██▎       | 1516/6434 [3:33:38<11:19:39,  8.29s/it, gpt_loss=0.253, loss_mean=0.296][A[A
+
+Train step of epoch 1:  24%|██▎       | 1517/6434 [3:33:38<11:18:15,  8.28s/it, gpt_loss=0.253, loss_mean=0.296][A[A
+
+Train step of epoch 1:  24%|██▎       | 1517/6434 [3:33:47<11:18:15,  8.28s/it, gpt_loss=0.291, loss_mean=0.296][A[A
+
+Train step of epoch 1:  24%|██▎       | 1518/6434 [3:33:47<11:21:56,  8.32s/it, gpt_loss=0.291, loss_mean=0.296][A[A
+
+Train step of epoch 1:  24%|██▎       | 1518/6434 [3:33:55<11:21:56,  8.32s/it, gpt_loss=0.251, loss_mean=0.291][A[A
+
+Train step of epoch 1:  24%|██▎       | 1519/6434 [3:33:55<11:15:55,  8.25s/it, gpt_loss=0.251, loss_mean=0.291][A[A
+
+Train step of epoch 1:  24%|██▎       | 1519/6434 [3:34:03<11:15:55,  8.25s/it, gpt_loss=0.243, loss_mean=0.286][A[A
+
+Train step of epoch 1:  24%|██▎       | 1520/6434 [3:34:03<11:16:11,  8.26s/it, gpt_loss=0.243, loss_mean=0.286][A[A
+
+Train step of epoch 1:  24%|██▎       | 1520/6434 [3:34:11<11:16:11,  8.26s/it, gpt_loss=0.277, loss_mean=0.285][A[A
+
+Train step of epoch 1:  24%|██▎       | 1521/6434 [3:34:11<11:09:38,  8.18s/it, gpt_loss=0.277, loss_mean=0.285][A[A
+
+Train step of epoch 1:  24%|██▎       | 1521/6434 [3:34:19<11:09:38,  8.18s/it, gpt_loss=0.273, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▎       | 1522/6434 [3:34:19<11:05:03,  8.12s/it, gpt_loss=0.273, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▎       | 1522/6434 [3:34:27<11:05:03,  8.12s/it, gpt_loss=0.221, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▎       | 1523/6434 [3:34:27<11:06:28,  8.14s/it, gpt_loss=0.221, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▎       | 1523/6434 [3:34:36<11:06:28,  8.14s/it, gpt_loss=0.204, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  24%|██▎       | 1524/6434 [3:34:36<11:30:22,  8.44s/it, gpt_loss=0.204, loss_mean=0.27][A[A
+
+Train step of epoch 1:  24%|██▎       | 1524/6434 [3:34:45<11:30:22,  8.44s/it, gpt_loss=0.229, loss_mean=0.266][A[A
+
+Train step of epoch 1:  24%|██▎       | 1525/6434 [3:34:45<11:51:51,  8.70s/it, gpt_loss=0.229, loss_mean=0.266][A[A
+[LID Router Debug] Step: 7960
+Batch Size: 10
+Audio Batch Size: 140
+LID Assignments: [3, 3, 3, 1, 4, 5, 5, 3, 4, 5]
+Active Experts in Batch: {1, 3, 4, 5}
+
+
+Train step of epoch 1:  24%|██▎       | 1525/6434 [3:34:55<11:51:51,  8.70s/it, gpt_loss=0.272, loss_mean=0.267][A[A
+
+Train step of epoch 1:  24%|██▎       | 1526/6434 [3:34:55<12:04:49,  8.86s/it, gpt_loss=0.272, loss_mean=0.267][A[A
+
+Train step of epoch 1:  24%|██▎       | 1526/6434 [3:35:05<12:04:49,  8.86s/it, gpt_loss=0.205, loss_mean=0.261][A[A
+
+Train step of epoch 1:  24%|██▎       | 1527/6434 [3:35:05<12:29:10,  9.16s/it, gpt_loss=0.205, loss_mean=0.261][A[A
+
+Train step of epoch 1:  24%|██▎       | 1527/6434 [3:35:14<12:29:10,  9.16s/it, gpt_loss=0.24, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  24%|██▎       | 1528/6434 [3:35:14<12:25:58,  9.12s/it, gpt_loss=0.24, loss_mean=0.259][A[A
+
+Train step of epoch 1:  24%|██▎       | 1528/6434 [3:35:22<12:25:58,  9.12s/it, gpt_loss=0.25, loss_mean=0.258][A[A
+
+Train step of epoch 1:  24%|██▍       | 1529/6434 [3:35:22<12:05:25,  8.87s/it, gpt_loss=0.25, loss_mean=0.258][A[A
+
+Train step of epoch 1:  24%|██▍       | 1529/6434 [3:35:31<12:05:25,  8.87s/it, gpt_loss=0.264, loss_mean=0.258][A[A
+
+Train step of epoch 1:  24%|██▍       | 1530/6434 [3:35:31<11:59:17,  8.80s/it, gpt_loss=0.264, loss_mean=0.258][A[A
+
+Train step of epoch 1:  24%|██▍       | 1530/6434 [3:35:39<11:59:17,  8.80s/it, gpt_loss=0.372, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1531/6434 [3:35:39<11:52:01,  8.71s/it, gpt_loss=0.372, loss_mean=0.27][A[A
+
+Train step of epoch 1:  24%|██▍       | 1531/6434 [3:35:47<11:52:01,  8.71s/it, gpt_loss=0.381, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1532/6434 [3:35:47<11:34:59,  8.51s/it, gpt_loss=0.381, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1532/6434 [3:35:55<11:34:59,  8.51s/it, gpt_loss=0.233, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1533/6434 [3:35:55<11:28:16,  8.43s/it, gpt_loss=0.233, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1533/6434 [3:36:05<11:28:16,  8.43s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  24%|██▍       | 1534/6434 [3:36:05<11:48:07,  8.67s/it, gpt_loss=0.268, loss_mean=0.275][A[A
+
+Train step of epoch 1:  24%|██▍       | 1534/6434 [3:36:13<11:48:07,  8.67s/it, gpt_loss=0.249, loss_mean=0.273][A[A
+
+Train step of epoch 1:  24%|██▍       | 1535/6434 [3:36:13<11:32:56,  8.49s/it, gpt_loss=0.249, loss_mean=0.273][A[A
+[LID Router Debug] Step: 7970
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [3, 6, 1, 2, 1, 3, 3, 3, 5, 10]
+Active Experts in Batch: {1, 2, 3, 5, 6, 10}
+
+
+Train step of epoch 1:  24%|██▍       | 1535/6434 [3:36:23<11:32:56,  8.49s/it, gpt_loss=0.24, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1536/6434 [3:36:23<12:09:41,  8.94s/it, gpt_loss=0.24, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1536/6434 [3:36:30<12:09:41,  8.94s/it, gpt_loss=0.297, loss_mean=0.272][A[A
+
+Train step of epoch 1:  24%|██▍       | 1537/6434 [3:36:30<11:34:05,  8.50s/it, gpt_loss=0.297, loss_mean=0.272][A[A
+
+Train step of epoch 1:  24%|██▍       | 1537/6434 [3:36:38<11:34:05,  8.50s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  24%|██▍       | 1538/6434 [3:36:38<11:17:48,  8.31s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  24%|██▍       | 1538/6434 [3:36:47<11:17:48,  8.31s/it, gpt_loss=0.359, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▍       | 1539/6434 [3:36:47<11:26:56,  8.42s/it, gpt_loss=0.359, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▍       | 1539/6434 [3:36:55<11:26:56,  8.42s/it, gpt_loss=0.206, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1540/6434 [3:36:55<11:22:52,  8.37s/it, gpt_loss=0.206, loss_mean=0.27][A[A
+
+Train step of epoch 1:  24%|██▍       | 1540/6434 [3:37:03<11:22:52,  8.37s/it, gpt_loss=0.252, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1541/6434 [3:37:03<11:18:43,  8.32s/it, gpt_loss=0.252, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1541/6434 [3:37:13<11:18:43,  8.32s/it, gpt_loss=0.274, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1542/6434 [3:37:13<11:45:27,  8.65s/it, gpt_loss=0.274, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1542/6434 [3:37:20<11:45:27,  8.65s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  24%|██▍       | 1543/6434 [3:37:20<11:16:28,  8.30s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  24%|██▍       | 1543/6434 [3:37:29<11:16:28,  8.30s/it, gpt_loss=0.259, loss_mean=0.263][A[A
+
+Train step of epoch 1:  24%|██▍       | 1544/6434 [3:37:29<11:29:04,  8.45s/it, gpt_loss=0.259, loss_mean=0.263][A[A
+
+Train step of epoch 1:  24%|██▍       | 1544/6434 [3:37:37<11:29:04,  8.45s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  24%|██▍       | 1545/6434 [3:37:37<11:33:39,  8.51s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+[LID Router Debug] Step: 7980
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 5, 0, 1, 2, 3, 6, 2, 1, 5]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6}
+
+
+Train step of epoch 1:  24%|██▍       | 1545/6434 [3:37:46<11:33:39,  8.51s/it, gpt_loss=0.287, loss_mean=0.265][A[A
+
+Train step of epoch 1:  24%|██▍       | 1546/6434 [3:37:46<11:39:30,  8.59s/it, gpt_loss=0.287, loss_mean=0.265][A[A
+
+Train step of epoch 1:  24%|██▍       | 1546/6434 [3:37:54<11:39:30,  8.59s/it, gpt_loss=0.277, loss_mean=0.266][A[A
+
+Train step of epoch 1:  24%|██▍       | 1547/6434 [3:37:54<11:19:37,  8.34s/it, gpt_loss=0.277, loss_mean=0.266][A[A
+
+Train step of epoch 1:  24%|██▍       | 1547/6434 [3:38:02<11:19:37,  8.34s/it, gpt_loss=0.295, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1548/6434 [3:38:02<11:19:10,  8.34s/it, gpt_loss=0.295, loss_mean=0.269][A[A
+
+Train step of epoch 1:  24%|██▍       | 1548/6434 [3:38:11<11:19:10,  8.34s/it, gpt_loss=0.337, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1549/6434 [3:38:11<11:26:29,  8.43s/it, gpt_loss=0.337, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1549/6434 [3:38:21<11:26:29,  8.43s/it, gpt_loss=0.369, loss_mean=0.285][A[A
+
+Train step of epoch 1:  24%|██▍       | 1550/6434 [3:38:21<12:13:35,  9.01s/it, gpt_loss=0.369, loss_mean=0.285][A[A
+
+Train step of epoch 1:  24%|██▍       | 1550/6434 [3:38:30<12:13:35,  9.01s/it, gpt_loss=0.238, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1551/6434 [3:38:30<11:54:37,  8.78s/it, gpt_loss=0.238, loss_mean=0.28][A[A
+
+Train step of epoch 1:  24%|██▍       | 1551/6434 [3:38:38<11:54:37,  8.78s/it, gpt_loss=0.314, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1552/6434 [3:38:38<11:36:10,  8.56s/it, gpt_loss=0.314, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1552/6434 [3:38:46<11:36:10,  8.56s/it, gpt_loss=0.304, loss_mean=0.286][A[A
+
+Train step of epoch 1:  24%|██▍       | 1553/6434 [3:38:46<11:24:16,  8.41s/it, gpt_loss=0.304, loss_mean=0.286][A[A
+
+Train step of epoch 1:  24%|██▍       | 1553/6434 [3:38:55<11:24:16,  8.41s/it, gpt_loss=0.345, loss_mean=0.292][A[A
+
+Train step of epoch 1:  24%|██▍       | 1554/6434 [3:38:55<11:41:26,  8.62s/it, gpt_loss=0.345, loss_mean=0.292][A[A
+
+Train step of epoch 1:  24%|██▍       | 1554/6434 [3:39:03<11:41:26,  8.62s/it, gpt_loss=0.23, loss_mean=0.285] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1555/6434 [3:39:03<11:31:20,  8.50s/it, gpt_loss=0.23, loss_mean=0.285][A[A
+[LID Router Debug] Step: 7990
+Batch Size: 10
+Audio Batch Size: 133
+LID Assignments: [0, 3, 5, 9, 5, 0, 3, 3, 2, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  24%|██▍       | 1555/6434 [3:39:12<11:31:20,  8.50s/it, gpt_loss=0.255, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1556/6434 [3:39:12<11:36:29,  8.57s/it, gpt_loss=0.255, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1556/6434 [3:39:20<11:36:29,  8.57s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1557/6434 [3:39:20<11:26:45,  8.45s/it, gpt_loss=0.222, loss_mean=0.276][A[A
+
+Train step of epoch 1:  24%|██▍       | 1557/6434 [3:39:28<11:26:45,  8.45s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  24%|██▍       | 1558/6434 [3:39:28<11:21:41,  8.39s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  24%|██▍       | 1558/6434 [3:39:37<11:21:41,  8.39s/it, gpt_loss=0.383, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1559/6434 [3:39:37<11:22:25,  8.40s/it, gpt_loss=0.383, loss_mean=0.29][A[A
+
+Train step of epoch 1:  24%|██▍       | 1559/6434 [3:39:45<11:22:25,  8.40s/it, gpt_loss=0.26, loss_mean=0.287][A[A
+
+Train step of epoch 1:  24%|██▍       | 1560/6434 [3:39:45<11:32:42,  8.53s/it, gpt_loss=0.26, loss_mean=0.287][A[A
+
+Train step of epoch 1:  24%|██▍       | 1560/6434 [3:39:56<11:32:42,  8.53s/it, gpt_loss=0.242, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1561/6434 [3:39:56<12:12:18,  9.02s/it, gpt_loss=0.242, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1561/6434 [3:40:05<12:12:18,  9.02s/it, gpt_loss=0.281, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1562/6434 [3:40:05<12:22:04,  9.14s/it, gpt_loss=0.281, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1562/6434 [3:40:13<12:22:04,  9.14s/it, gpt_loss=0.258, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1563/6434 [3:40:13<11:56:38,  8.83s/it, gpt_loss=0.258, loss_mean=0.28][A[A
+
+Train step of epoch 1:  24%|██▍       | 1563/6434 [3:40:23<11:56:38,  8.83s/it, gpt_loss=0.293, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1564/6434 [3:40:23<12:25:47,  9.19s/it, gpt_loss=0.293, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1564/6434 [3:40:33<12:25:47,  9.19s/it, gpt_loss=0.302, loss_mean=0.283][A[A
+
+Train step of epoch 1:  24%|██▍       | 1565/6434 [3:40:33<12:35:17,  9.31s/it, gpt_loss=0.302, loss_mean=0.283][A[A
+[LID Router Debug] Step: 8000
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [3, 5, 0, 1, 0, 5, 3, 5, 4, 1]
+Active Experts in Batch: {0, 1, 3, 4, 5}
+[2026-02-07 10:42:30,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=0, lr=[1.2834530398667662e-05, 1.2834530398667662e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 10:42:30,105] [INFO] [timer.py:260:stop] epoch=0/micro_step=8000/global_step=4000, RunningAvgSamplesPerSec=4.745545759122653, CurrSamplesPerSec=4.024175781660057, MemAllocated=12.55GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  24%|██▍       | 1565/6434 [3:40:43<12:35:17,  9.31s/it, gpt_loss=0.258, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1566/6434 [3:40:43<13:00:00,  9.61s/it, gpt_loss=0.258, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1566/6434 [3:40:51<13:00:00,  9.61s/it, gpt_loss=0.317, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1567/6434 [3:40:51<12:12:52,  9.03s/it, gpt_loss=0.317, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1567/6434 [3:40:59<12:12:52,  9.03s/it, gpt_loss=0.281, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1568/6434 [3:40:59<11:54:58,  8.82s/it, gpt_loss=0.281, loss_mean=0.284][A[A
+
+Train step of epoch 1:  24%|██▍       | 1568/6434 [3:41:08<11:54:58,  8.82s/it, gpt_loss=0.265, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1569/6434 [3:41:08<11:56:00,  8.83s/it, gpt_loss=0.265, loss_mean=0.282][A[A
+
+Train step of epoch 1:  24%|██▍       | 1569/6434 [3:41:17<11:56:00,  8.83s/it, gpt_loss=0.267, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1570/6434 [3:41:17<12:05:45,  8.95s/it, gpt_loss=0.267, loss_mean=0.281][A[A
+
+Train step of epoch 1:  24%|██▍       | 1570/6434 [3:41:26<12:05:45,  8.95s/it, gpt_loss=0.246, loss_mean=0.277][A[A
+
+Train step of epoch 1:  24%|██▍       | 1571/6434 [3:41:26<12:04:13,  8.94s/it, gpt_loss=0.246, loss_mean=0.277][A[A
+
+Train step of epoch 1:  24%|██▍       | 1571/6434 [3:41:34<12:04:13,  8.94s/it, gpt_loss=0.257, loss_mean=0.275][A[A
+
+Train step of epoch 1:  24%|██▍       | 1572/6434 [3:41:34<11:38:27,  8.62s/it, gpt_loss=0.257, loss_mean=0.275][A[A
+
+Train step of epoch 1:  24%|██▍       | 1572/6434 [3:41:42<11:38:27,  8.62s/it, gpt_loss=0.299, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▍       | 1573/6434 [3:41:42<11:24:22,  8.45s/it, gpt_loss=0.299, loss_mean=0.278][A[A
+
+Train step of epoch 1:  24%|██▍       | 1573/6434 [3:41:51<11:24:22,  8.45s/it, gpt_loss=0.274, loss_mean=0.277][A[A
+
+Train step of epoch 1:  24%|██▍       | 1574/6434 [3:41:51<11:41:30,  8.66s/it, gpt_loss=0.274, loss_mean=0.277][A[A
+
+Train step of epoch 1:  24%|██▍       | 1574/6434 [3:41:59<11:41:30,  8.66s/it, gpt_loss=0.203, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  24%|██▍       | 1575/6434 [3:41:59<11:20:11,  8.40s/it, gpt_loss=0.203, loss_mean=0.27][A[A
+[LID Router Debug] Step: 8010
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [9, 1, 5, 9, 9, 4, 5, 5, 1, 2]
+Active Experts in Batch: {1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  24%|██▍       | 1575/6434 [3:42:08<11:20:11,  8.40s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  24%|██▍       | 1576/6434 [3:42:08<11:40:18,  8.65s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  24%|██▍       | 1576/6434 [3:42:16<11:40:18,  8.65s/it, gpt_loss=0.265, loss_mean=0.264][A[A
+
+Train step of epoch 1:  25%|██▍       | 1577/6434 [3:42:16<11:14:56,  8.34s/it, gpt_loss=0.265, loss_mean=0.264][A[A
+
+Train step of epoch 1:  25%|██▍       | 1577/6434 [3:42:25<11:14:56,  8.34s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▍       | 1578/6434 [3:42:25<11:27:06,  8.49s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▍       | 1578/6434 [3:42:33<11:27:06,  8.49s/it, gpt_loss=0.293, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▍       | 1579/6434 [3:42:33<11:18:06,  8.38s/it, gpt_loss=0.293, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▍       | 1579/6434 [3:42:40<11:18:06,  8.38s/it, gpt_loss=0.197, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1580/6434 [3:42:40<10:50:41,  8.04s/it, gpt_loss=0.197, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1580/6434 [3:42:48<10:50:41,  8.04s/it, gpt_loss=0.234, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  25%|██▍       | 1581/6434 [3:42:48<10:46:45,  8.00s/it, gpt_loss=0.234, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1581/6434 [3:42:57<10:46:45,  8.00s/it, gpt_loss=0.255, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1582/6434 [3:42:57<11:25:46,  8.48s/it, gpt_loss=0.255, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1582/6434 [3:43:05<11:25:46,  8.48s/it, gpt_loss=0.263, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1583/6434 [3:43:05<10:59:10,  8.15s/it, gpt_loss=0.263, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1583/6434 [3:43:12<10:59:10,  8.15s/it, gpt_loss=0.288, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1584/6434 [3:43:12<10:43:53,  7.97s/it, gpt_loss=0.288, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1584/6434 [3:43:20<10:43:53,  7.97s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▍       | 1585/6434 [3:43:20<10:45:07,  7.98s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+[LID Router Debug] Step: 8020
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [4, 4, 1, 1, 4, 0, 0, 9, 4, 9]
+Active Experts in Batch: {0, 1, 4, 9}
+
+
+Train step of epoch 1:  25%|██▍       | 1585/6434 [3:43:29<10:45:07,  7.98s/it, gpt_loss=0.218, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▍       | 1586/6434 [3:43:29<10:52:38,  8.08s/it, gpt_loss=0.218, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▍       | 1586/6434 [3:43:39<10:52:38,  8.08s/it, gpt_loss=0.222, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1587/6434 [3:43:39<11:39:12,  8.66s/it, gpt_loss=0.222, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1587/6434 [3:43:48<11:39:12,  8.66s/it, gpt_loss=0.344, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1588/6434 [3:43:48<11:51:54,  8.81s/it, gpt_loss=0.344, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1588/6434 [3:43:56<11:51:54,  8.81s/it, gpt_loss=0.221, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1589/6434 [3:43:56<11:44:13,  8.72s/it, gpt_loss=0.221, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1589/6434 [3:44:04<11:44:13,  8.72s/it, gpt_loss=0.265, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1590/6434 [3:44:04<11:26:02,  8.50s/it, gpt_loss=0.265, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1590/6434 [3:44:13<11:26:02,  8.50s/it, gpt_loss=0.266, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  25%|██▍       | 1591/6434 [3:44:13<11:36:41,  8.63s/it, gpt_loss=0.266, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1591/6434 [3:44:21<11:36:41,  8.63s/it, gpt_loss=0.275, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▍       | 1592/6434 [3:44:21<11:08:14,  8.28s/it, gpt_loss=0.275, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▍       | 1592/6434 [3:44:30<11:08:14,  8.28s/it, gpt_loss=0.328, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▍       | 1593/6434 [3:44:30<11:26:23,  8.51s/it, gpt_loss=0.328, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▍       | 1593/6434 [3:44:38<11:26:23,  8.51s/it, gpt_loss=0.255, loss_mean=0.267][A[A
+
+Train step of epoch 1:  25%|██▍       | 1594/6434 [3:44:38<11:23:59,  8.48s/it, gpt_loss=0.255, loss_mean=0.267][A[A
+
+Train step of epoch 1:  25%|██▍       | 1594/6434 [3:44:48<11:23:59,  8.48s/it, gpt_loss=0.233, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1595/6434 [3:44:48<11:51:01,  8.82s/it, gpt_loss=0.233, loss_mean=0.263][A[A
+[LID Router Debug] Step: 8030
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 3, 0, 4, 5, 4, 5, 6, 4, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  25%|██▍       | 1595/6434 [3:44:55<11:51:01,  8.82s/it, gpt_loss=0.231, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  25%|██▍       | 1596/6434 [3:44:55<11:18:06,  8.41s/it, gpt_loss=0.231, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▍       | 1596/6434 [3:45:03<11:18:06,  8.41s/it, gpt_loss=0.238, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1597/6434 [3:45:03<11:11:49,  8.33s/it, gpt_loss=0.238, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1597/6434 [3:45:11<11:11:49,  8.33s/it, gpt_loss=0.256, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1598/6434 [3:45:11<10:54:24,  8.12s/it, gpt_loss=0.256, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▍       | 1598/6434 [3:45:19<10:54:24,  8.12s/it, gpt_loss=0.217, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1599/6434 [3:45:19<10:56:42,  8.15s/it, gpt_loss=0.217, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1599/6434 [3:45:28<10:56:42,  8.15s/it, gpt_loss=0.255, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1600/6434 [3:45:28<11:06:08,  8.27s/it, gpt_loss=0.255, loss_mean=0.254][A[A
+
+Train step of epoch 1:  25%|██▍       | 1600/6434 [3:45:36<11:06:08,  8.27s/it, gpt_loss=0.285, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▍       | 1601/6434 [3:45:36<11:01:42,  8.21s/it, gpt_loss=0.285, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▍       | 1601/6434 [3:45:44<11:01:42,  8.21s/it, gpt_loss=0.281, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1602/6434 [3:45:44<10:49:07,  8.06s/it, gpt_loss=0.281, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1602/6434 [3:45:52<10:49:07,  8.06s/it, gpt_loss=0.256, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1603/6434 [3:45:52<10:53:17,  8.11s/it, gpt_loss=0.256, loss_mean=0.259][A[A
+
+Train step of epoch 1:  25%|██▍       | 1603/6434 [3:46:02<10:53:17,  8.11s/it, gpt_loss=0.295, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1604/6434 [3:46:02<11:33:37,  8.62s/it, gpt_loss=0.295, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▍       | 1604/6434 [3:46:10<11:33:37,  8.62s/it, gpt_loss=0.345, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▍       | 1605/6434 [3:46:10<11:36:25,  8.65s/it, gpt_loss=0.345, loss_mean=0.271][A[A
+[LID Router Debug] Step: 8040
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [4, 1, 5, 9, 5, 0, 4, 2, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  25%|██▍       | 1605/6434 [3:46:20<11:36:25,  8.65s/it, gpt_loss=0.27, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  25%|██▍       | 1606/6434 [3:46:20<11:56:05,  8.90s/it, gpt_loss=0.27, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▍       | 1606/6434 [3:46:29<11:56:05,  8.90s/it, gpt_loss=0.227, loss_mean=0.266][A[A
+
+Train step of epoch 1:  25%|██▍       | 1607/6434 [3:46:29<12:04:37,  9.01s/it, gpt_loss=0.227, loss_mean=0.266][A[A
+
+Train step of epoch 1:  25%|██▍       | 1607/6434 [3:46:36<12:04:37,  9.01s/it, gpt_loss=0.252, loss_mean=0.265][A[A
+
+Train step of epoch 1:  25%|██▍       | 1608/6434 [3:46:36<11:22:00,  8.48s/it, gpt_loss=0.252, loss_mean=0.265][A[A
+
+Train step of epoch 1:  25%|██▍       | 1608/6434 [3:46:44<11:22:00,  8.48s/it, gpt_loss=0.223, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▌       | 1609/6434 [3:46:44<11:11:45,  8.35s/it, gpt_loss=0.223, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▌       | 1609/6434 [3:46:53<11:11:45,  8.35s/it, gpt_loss=0.227, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▌       | 1610/6434 [3:46:53<11:05:47,  8.28s/it, gpt_loss=0.227, loss_mean=0.257][A[A
+
+Train step of epoch 1:  25%|██▌       | 1610/6434 [3:47:00<11:05:47,  8.28s/it, gpt_loss=0.167, loss_mean=0.248][A[A
+
+Train step of epoch 1:  25%|██▌       | 1611/6434 [3:47:00<10:57:18,  8.18s/it, gpt_loss=0.167, loss_mean=0.248][A[A
+
+Train step of epoch 1:  25%|██▌       | 1611/6434 [3:47:08<10:57:18,  8.18s/it, gpt_loss=0.258, loss_mean=0.249][A[A
+
+Train step of epoch 1:  25%|██▌       | 1612/6434 [3:47:08<10:46:49,  8.05s/it, gpt_loss=0.258, loss_mean=0.249][A[A
+
+Train step of epoch 1:  25%|██▌       | 1612/6434 [3:47:17<10:46:49,  8.05s/it, gpt_loss=0.352, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  25%|██▌       | 1613/6434 [3:47:17<11:11:09,  8.35s/it, gpt_loss=0.352, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▌       | 1613/6434 [3:47:25<11:11:09,  8.35s/it, gpt_loss=0.34, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1614/6434 [3:47:25<11:00:46,  8.23s/it, gpt_loss=0.34, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1614/6434 [3:47:33<11:00:46,  8.23s/it, gpt_loss=0.17, loss_mean=0.258][A[A
+
+Train step of epoch 1:  25%|██▌       | 1615/6434 [3:47:33<10:44:23,  8.02s/it, gpt_loss=0.17, loss_mean=0.258][A[A
+[LID Router Debug] Step: 8050
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [2, 1, 2, 4, 2, 2, 1, 5, 1, 5]
+Active Experts in Batch: {1, 2, 4, 5}
+
+
+Train step of epoch 1:  25%|██▌       | 1615/6434 [3:47:41<10:44:23,  8.02s/it, gpt_loss=0.283, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▌       | 1616/6434 [3:47:41<10:49:00,  8.08s/it, gpt_loss=0.283, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▌       | 1616/6434 [3:47:49<10:49:00,  8.08s/it, gpt_loss=0.272, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▌       | 1617/6434 [3:47:49<10:52:45,  8.13s/it, gpt_loss=0.272, loss_mean=0.261][A[A
+
+Train step of epoch 1:  25%|██▌       | 1617/6434 [3:47:59<10:52:45,  8.13s/it, gpt_loss=0.248, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  25%|██▌       | 1618/6434 [3:47:59<11:23:25,  8.51s/it, gpt_loss=0.248, loss_mean=0.26][A[A
+
+Train step of epoch 1:  25%|██▌       | 1618/6434 [3:48:07<11:23:25,  8.51s/it, gpt_loss=0.31, loss_mean=0.265][A[A
+
+Train step of epoch 1:  25%|██▌       | 1619/6434 [3:48:07<11:18:51,  8.46s/it, gpt_loss=0.31, loss_mean=0.265][A[A
+
+Train step of epoch 1:  25%|██▌       | 1619/6434 [3:48:15<11:18:51,  8.46s/it, gpt_loss=0.294, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1620/6434 [3:48:15<11:06:55,  8.31s/it, gpt_loss=0.294, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1620/6434 [3:48:23<11:06:55,  8.31s/it, gpt_loss=0.297, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▌       | 1621/6434 [3:48:23<11:03:15,  8.27s/it, gpt_loss=0.297, loss_mean=0.271][A[A
+
+Train step of epoch 1:  25%|██▌       | 1621/6434 [3:48:32<11:03:15,  8.27s/it, gpt_loss=0.21, loss_mean=0.265] [A[A
+
+Train step of epoch 1:  25%|██▌       | 1622/6434 [3:48:32<11:22:13,  8.51s/it, gpt_loss=0.21, loss_mean=0.265][A[A
+
+Train step of epoch 1:  25%|██▌       | 1622/6434 [3:48:41<11:22:13,  8.51s/it, gpt_loss=0.357, loss_mean=0.274][A[A
+
+Train step of epoch 1:  25%|██▌       | 1623/6434 [3:48:41<11:28:15,  8.58s/it, gpt_loss=0.357, loss_mean=0.274][A[A
+
+Train step of epoch 1:  25%|██▌       | 1623/6434 [3:48:49<11:28:15,  8.58s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  25%|██▌       | 1624/6434 [3:48:49<11:18:15,  8.46s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  25%|██▌       | 1624/6434 [3:48:57<11:18:15,  8.46s/it, gpt_loss=0.292, loss_mean=0.275][A[A
+
+Train step of epoch 1:  25%|██▌       | 1625/6434 [3:48:57<10:54:58,  8.17s/it, gpt_loss=0.292, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8060
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [3, 1, 2, 3, 0, 3, 6, 4, 1, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  25%|██▌       | 1625/6434 [3:49:05<10:54:58,  8.17s/it, gpt_loss=0.397, loss_mean=0.287][A[A
+
+Train step of epoch 1:  25%|██▌       | 1626/6434 [3:49:05<11:01:44,  8.26s/it, gpt_loss=0.397, loss_mean=0.287][A[A
+
+Train step of epoch 1:  25%|██▌       | 1626/6434 [3:49:14<11:01:44,  8.26s/it, gpt_loss=0.306, loss_mean=0.289][A[A
+
+Train step of epoch 1:  25%|██▌       | 1627/6434 [3:49:14<11:07:14,  8.33s/it, gpt_loss=0.306, loss_mean=0.289][A[A
+
+Train step of epoch 1:  25%|██▌       | 1627/6434 [3:49:22<11:07:14,  8.33s/it, gpt_loss=0.239, loss_mean=0.284][A[A
+
+Train step of epoch 1:  25%|██▌       | 1628/6434 [3:49:22<11:18:33,  8.47s/it, gpt_loss=0.239, loss_mean=0.284][A[A
+
+Train step of epoch 1:  25%|██▌       | 1628/6434 [3:49:31<11:18:33,  8.47s/it, gpt_loss=0.2, loss_mean=0.276]  [A[A
+
+Train step of epoch 1:  25%|██▌       | 1629/6434 [3:49:31<11:28:48,  8.60s/it, gpt_loss=0.2, loss_mean=0.276][A[A
+
+Train step of epoch 1:  25%|██▌       | 1629/6434 [3:49:39<11:28:48,  8.60s/it, gpt_loss=0.28, loss_mean=0.276][A[A
+
+Train step of epoch 1:  25%|██▌       | 1630/6434 [3:49:39<11:07:12,  8.33s/it, gpt_loss=0.28, loss_mean=0.276][A[A
+
+Train step of epoch 1:  25%|██▌       | 1630/6434 [3:49:47<11:07:12,  8.33s/it, gpt_loss=0.208, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1631/6434 [3:49:47<11:05:02,  8.31s/it, gpt_loss=0.208, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1631/6434 [3:49:55<11:05:02,  8.31s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:  25%|██▌       | 1632/6434 [3:49:55<10:56:22,  8.20s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:  25%|██▌       | 1632/6434 [3:50:03<10:56:22,  8.20s/it, gpt_loss=0.275, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1633/6434 [3:50:03<10:47:42,  8.09s/it, gpt_loss=0.275, loss_mean=0.268][A[A
+
+Train step of epoch 1:  25%|██▌       | 1633/6434 [3:50:11<10:47:42,  8.09s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▌       | 1634/6434 [3:50:11<10:37:25,  7.97s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  25%|██▌       | 1634/6434 [3:50:19<10:37:25,  7.97s/it, gpt_loss=0.254, loss_mean=0.262][A[A
+
+Train step of epoch 1:  25%|██▌       | 1635/6434 [3:50:19<10:42:46,  8.04s/it, gpt_loss=0.254, loss_mean=0.262][A[A
+[LID Router Debug] Step: 8070
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [3, 6, 4, 6, 9, 4, 1, 3, 2, 1]
+Active Experts in Batch: {1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  25%|██▌       | 1635/6434 [3:50:28<10:42:46,  8.04s/it, gpt_loss=0.331, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1636/6434 [3:50:28<11:12:15,  8.41s/it, gpt_loss=0.331, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1636/6434 [3:50:37<11:12:15,  8.41s/it, gpt_loss=0.25, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  25%|██▌       | 1637/6434 [3:50:37<11:22:23,  8.54s/it, gpt_loss=0.25, loss_mean=0.267][A[A
+
+Train step of epoch 1:  25%|██▌       | 1637/6434 [3:50:45<11:22:23,  8.54s/it, gpt_loss=0.32, loss_mean=0.272][A[A
+
+Train step of epoch 1:  25%|██▌       | 1638/6434 [3:50:45<11:14:29,  8.44s/it, gpt_loss=0.32, loss_mean=0.272][A[A
+
+Train step of epoch 1:  25%|██▌       | 1638/6434 [3:50:54<11:14:29,  8.44s/it, gpt_loss=0.233, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1639/6434 [3:50:54<11:30:32,  8.64s/it, gpt_loss=0.233, loss_mean=0.269][A[A
+
+Train step of epoch 1:  25%|██▌       | 1639/6434 [3:51:03<11:30:32,  8.64s/it, gpt_loss=0.331, loss_mean=0.275][A[A
+
+Train step of epoch 1:  25%|██▌       | 1640/6434 [3:51:03<11:21:18,  8.53s/it, gpt_loss=0.331, loss_mean=0.275][A[A
+
+Train step of epoch 1:  25%|██▌       | 1640/6434 [3:51:11<11:21:18,  8.53s/it, gpt_loss=0.234, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1641/6434 [3:51:11<11:28:06,  8.61s/it, gpt_loss=0.234, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1641/6434 [3:51:20<11:28:06,  8.61s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+
+Train step of epoch 1:  26%|██▌       | 1642/6434 [3:51:20<11:17:27,  8.48s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+
+Train step of epoch 1:  26%|██▌       | 1642/6434 [3:51:28<11:17:27,  8.48s/it, gpt_loss=0.28, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1643/6434 [3:51:28<11:11:07,  8.40s/it, gpt_loss=0.28, loss_mean=0.269][A[A
+
+Train step of epoch 1:  26%|██▌       | 1643/6434 [3:51:36<11:11:07,  8.40s/it, gpt_loss=0.275, loss_mean=0.27][A[A
+
+Train step of epoch 1:  26%|██▌       | 1644/6434 [3:51:36<11:05:13,  8.33s/it, gpt_loss=0.275, loss_mean=0.27][A[A
+
+Train step of epoch 1:  26%|██▌       | 1644/6434 [3:51:44<11:05:13,  8.33s/it, gpt_loss=0.246, loss_mean=0.267][A[A
+
+Train step of epoch 1:  26%|██▌       | 1645/6434 [3:51:44<11:08:01,  8.37s/it, gpt_loss=0.246, loss_mean=0.267][A[A
+[LID Router Debug] Step: 8080
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [4, 2, 9, 4, 1, 4, 5, 1, 9, 3]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  26%|██▌       | 1645/6434 [3:51:52<11:08:01,  8.37s/it, gpt_loss=0.254, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1646/6434 [3:51:52<10:44:51,  8.08s/it, gpt_loss=0.254, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1646/6434 [3:52:00<10:44:51,  8.08s/it, gpt_loss=0.339, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▌       | 1647/6434 [3:52:00<10:55:53,  8.22s/it, gpt_loss=0.339, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▌       | 1647/6434 [3:52:09<10:55:53,  8.22s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+
+Train step of epoch 1:  26%|██▌       | 1648/6434 [3:52:09<11:09:12,  8.39s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+
+Train step of epoch 1:  26%|██▌       | 1648/6434 [3:52:19<11:09:12,  8.39s/it, gpt_loss=0.247, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1649/6434 [3:52:19<11:32:24,  8.68s/it, gpt_loss=0.247, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1649/6434 [3:52:26<11:32:24,  8.68s/it, gpt_loss=0.231, loss_mean=0.263][A[A
+
+Train step of epoch 1:  26%|██▌       | 1650/6434 [3:52:26<11:09:36,  8.40s/it, gpt_loss=0.231, loss_mean=0.263][A[A
+
+Train step of epoch 1:  26%|██▌       | 1650/6434 [3:52:35<11:09:36,  8.40s/it, gpt_loss=0.408, loss_mean=0.277][A[A
+
+Train step of epoch 1:  26%|██▌       | 1651/6434 [3:52:35<11:25:21,  8.60s/it, gpt_loss=0.408, loss_mean=0.277][A[A
+
+Train step of epoch 1:  26%|██▌       | 1651/6434 [3:52:44<11:25:21,  8.60s/it, gpt_loss=0.24, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1652/6434 [3:52:44<11:30:36,  8.67s/it, gpt_loss=0.24, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▌       | 1652/6434 [3:52:53<11:30:36,  8.67s/it, gpt_loss=0.296, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▌       | 1653/6434 [3:52:53<11:45:20,  8.85s/it, gpt_loss=0.296, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▌       | 1653/6434 [3:53:03<11:45:20,  8.85s/it, gpt_loss=0.191, loss_mean=0.267][A[A
+
+Train step of epoch 1:  26%|██▌       | 1654/6434 [3:53:03<11:54:12,  8.97s/it, gpt_loss=0.191, loss_mean=0.267][A[A
+
+Train step of epoch 1:  26%|██▌       | 1654/6434 [3:53:11<11:54:12,  8.97s/it, gpt_loss=0.342, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▌       | 1655/6434 [3:53:11<11:38:58,  8.78s/it, gpt_loss=0.342, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8090
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [2, 3, 4, 1, 5, 4, 1, 9, 3, 7]
+Active Experts in Batch: {1, 2, 3, 4, 5, 7, 9}
+
+
+Train step of epoch 1:  26%|██▌       | 1655/6434 [3:53:20<11:38:58,  8.78s/it, gpt_loss=0.289, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▌       | 1656/6434 [3:53:20<11:33:03,  8.70s/it, gpt_loss=0.289, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▌       | 1656/6434 [3:53:28<11:33:03,  8.70s/it, gpt_loss=0.31, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:  26%|██▌       | 1657/6434 [3:53:28<11:36:04,  8.74s/it, gpt_loss=0.31, loss_mean=0.28][A[A
+
+Train step of epoch 1:  26%|██▌       | 1657/6434 [3:53:37<11:36:04,  8.74s/it, gpt_loss=0.325, loss_mean=0.284][A[A
+
+Train step of epoch 1:  26%|██▌       | 1658/6434 [3:53:37<11:35:08,  8.73s/it, gpt_loss=0.325, loss_mean=0.284][A[A
+
+Train step of epoch 1:  26%|██▌       | 1658/6434 [3:53:44<11:35:08,  8.73s/it, gpt_loss=0.306, loss_mean=0.286][A[A
+
+Train step of epoch 1:  26%|██▌       | 1659/6434 [3:53:44<11:03:21,  8.34s/it, gpt_loss=0.306, loss_mean=0.286][A[A
+
+Train step of epoch 1:  26%|██▌       | 1659/6434 [3:53:53<11:03:21,  8.34s/it, gpt_loss=0.288, loss_mean=0.286][A[A
+
+Train step of epoch 1:  26%|██▌       | 1660/6434 [3:53:53<11:03:35,  8.34s/it, gpt_loss=0.288, loss_mean=0.286][A[A
+
+Train step of epoch 1:  26%|██▌       | 1660/6434 [3:54:00<11:03:35,  8.34s/it, gpt_loss=0.33, loss_mean=0.291] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1661/6434 [3:54:00<10:45:31,  8.11s/it, gpt_loss=0.33, loss_mean=0.291][A[A
+
+Train step of epoch 1:  26%|██▌       | 1661/6434 [3:54:09<10:45:31,  8.11s/it, gpt_loss=0.276, loss_mean=0.289][A[A
+
+Train step of epoch 1:  26%|██▌       | 1662/6434 [3:54:09<11:02:36,  8.33s/it, gpt_loss=0.276, loss_mean=0.289][A[A
+
+Train step of epoch 1:  26%|██▌       | 1662/6434 [3:54:17<11:02:36,  8.33s/it, gpt_loss=0.215, loss_mean=0.282][A[A
+
+Train step of epoch 1:  26%|██▌       | 1663/6434 [3:54:17<10:47:01,  8.14s/it, gpt_loss=0.215, loss_mean=0.282][A[A
+
+Train step of epoch 1:  26%|██▌       | 1663/6434 [3:54:25<10:47:01,  8.14s/it, gpt_loss=0.271, loss_mean=0.281][A[A
+
+Train step of epoch 1:  26%|██▌       | 1664/6434 [3:54:25<10:53:00,  8.21s/it, gpt_loss=0.271, loss_mean=0.281][A[A
+
+Train step of epoch 1:  26%|██▌       | 1664/6434 [3:54:34<10:53:00,  8.21s/it, gpt_loss=0.32, loss_mean=0.285] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1665/6434 [3:54:34<10:53:00,  8.22s/it, gpt_loss=0.32, loss_mean=0.285][A[A
+[LID Router Debug] Step: 8100
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [4, 4, 9, 0, 0, 5, 2, 9, 2, 9]
+Active Experts in Batch: {0, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  26%|██▌       | 1665/6434 [3:54:42<10:53:00,  8.22s/it, gpt_loss=0.324, loss_mean=0.289][A[A
+
+Train step of epoch 1:  26%|██▌       | 1666/6434 [3:54:42<10:59:13,  8.30s/it, gpt_loss=0.324, loss_mean=0.289][A[A
+
+Train step of epoch 1:  26%|██▌       | 1666/6434 [3:54:52<10:59:13,  8.30s/it, gpt_loss=0.347, loss_mean=0.294][A[A
+
+Train step of epoch 1:  26%|██▌       | 1667/6434 [3:54:52<11:28:31,  8.67s/it, gpt_loss=0.347, loss_mean=0.294][A[A
+
+Train step of epoch 1:  26%|██▌       | 1667/6434 [3:55:00<11:28:31,  8.67s/it, gpt_loss=0.257, loss_mean=0.291][A[A
+
+Train step of epoch 1:  26%|██▌       | 1668/6434 [3:55:00<11:19:12,  8.55s/it, gpt_loss=0.257, loss_mean=0.291][A[A
+
+Train step of epoch 1:  26%|██▌       | 1668/6434 [3:55:08<11:19:12,  8.55s/it, gpt_loss=0.29, loss_mean=0.291] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1669/6434 [3:55:08<10:58:19,  8.29s/it, gpt_loss=0.29, loss_mean=0.291][A[A
+
+Train step of epoch 1:  26%|██▌       | 1669/6434 [3:55:16<10:58:19,  8.29s/it, gpt_loss=0.327, loss_mean=0.294][A[A
+
+Train step of epoch 1:  26%|██▌       | 1670/6434 [3:55:16<10:53:20,  8.23s/it, gpt_loss=0.327, loss_mean=0.294][A[A
+
+Train step of epoch 1:  26%|██▌       | 1670/6434 [3:55:23<10:53:20,  8.23s/it, gpt_loss=0.249, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1671/6434 [3:55:23<10:42:14,  8.09s/it, gpt_loss=0.249, loss_mean=0.29][A[A
+
+Train step of epoch 1:  26%|██▌       | 1671/6434 [3:55:33<10:42:14,  8.09s/it, gpt_loss=0.233, loss_mean=0.284][A[A
+
+Train step of epoch 1:  26%|██▌       | 1672/6434 [3:55:33<11:10:28,  8.45s/it, gpt_loss=0.233, loss_mean=0.284][A[A
+
+Train step of epoch 1:  26%|██▌       | 1672/6434 [3:55:41<11:10:28,  8.45s/it, gpt_loss=0.252, loss_mean=0.281][A[A
+
+Train step of epoch 1:  26%|██▌       | 1673/6434 [3:55:41<11:17:03,  8.53s/it, gpt_loss=0.252, loss_mean=0.281][A[A
+
+Train step of epoch 1:  26%|██▌       | 1673/6434 [3:55:48<11:17:03,  8.53s/it, gpt_loss=0.222, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▌       | 1674/6434 [3:55:48<10:37:17,  8.03s/it, gpt_loss=0.222, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▌       | 1674/6434 [3:55:55<10:37:17,  8.03s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▌       | 1675/6434 [3:55:55<10:13:00,  7.73s/it, gpt_loss=0.274, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8110
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [3, 0, 1, 0, 2, 3, 9, 1, 4, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  26%|██▌       | 1675/6434 [3:56:03<10:13:00,  7.73s/it, gpt_loss=0.239, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1676/6434 [3:56:03<10:11:55,  7.72s/it, gpt_loss=0.239, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1676/6434 [3:56:11<10:11:55,  7.72s/it, gpt_loss=0.24, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1677/6434 [3:56:11<10:30:53,  7.96s/it, gpt_loss=0.24, loss_mean=0.268][A[A
+
+Train step of epoch 1:  26%|██▌       | 1677/6434 [3:56:20<10:30:53,  7.96s/it, gpt_loss=0.237, loss_mean=0.265][A[A
+
+Train step of epoch 1:  26%|██▌       | 1678/6434 [3:56:20<10:51:03,  8.21s/it, gpt_loss=0.237, loss_mean=0.265][A[A
+
+Train step of epoch 1:  26%|██▌       | 1678/6434 [3:56:28<10:51:03,  8.21s/it, gpt_loss=0.256, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▌       | 1679/6434 [3:56:28<10:35:56,  8.02s/it, gpt_loss=0.256, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▌       | 1679/6434 [3:56:37<10:35:56,  8.02s/it, gpt_loss=0.312, loss_mean=0.269][A[A
+
+Train step of epoch 1:  26%|██▌       | 1680/6434 [3:56:37<11:11:55,  8.48s/it, gpt_loss=0.312, loss_mean=0.269][A[A
+
+Train step of epoch 1:  26%|██▌       | 1680/6434 [3:56:45<11:11:55,  8.48s/it, gpt_loss=0.235, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1681/6434 [3:56:45<10:58:35,  8.31s/it, gpt_loss=0.235, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▌       | 1681/6434 [3:56:54<10:58:35,  8.31s/it, gpt_loss=0.247, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▌       | 1682/6434 [3:56:54<11:04:43,  8.39s/it, gpt_loss=0.247, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▌       | 1682/6434 [3:57:03<11:04:43,  8.39s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  26%|██▌       | 1683/6434 [3:57:03<11:14:52,  8.52s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  26%|██▌       | 1683/6434 [3:57:11<11:14:52,  8.52s/it, gpt_loss=0.221, loss_mean=0.261][A[A
+
+Train step of epoch 1:  26%|██▌       | 1684/6434 [3:57:11<11:02:12,  8.36s/it, gpt_loss=0.221, loss_mean=0.261][A[A
+
+Train step of epoch 1:  26%|██▌       | 1684/6434 [3:57:18<11:02:12,  8.36s/it, gpt_loss=0.323, loss_mean=0.267][A[A
+
+Train step of epoch 1:  26%|██▌       | 1685/6434 [3:57:18<10:42:34,  8.12s/it, gpt_loss=0.323, loss_mean=0.267][A[A
+[LID Router Debug] Step: 8120
+Batch Size: 10
+Audio Batch Size: 106
+LID Assignments: [3, 1, 1, 2, 2, 5, 0, 6, 3, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  26%|██▌       | 1685/6434 [3:57:28<10:42:34,  8.12s/it, gpt_loss=0.24, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  26%|██▌       | 1686/6434 [3:57:28<11:17:23,  8.56s/it, gpt_loss=0.24, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▌       | 1686/6434 [3:57:37<11:17:23,  8.56s/it, gpt_loss=0.244, loss_mean=0.262][A[A
+
+Train step of epoch 1:  26%|██▌       | 1687/6434 [3:57:37<11:40:15,  8.85s/it, gpt_loss=0.244, loss_mean=0.262][A[A
+
+Train step of epoch 1:  26%|██▌       | 1687/6434 [3:57:46<11:40:15,  8.85s/it, gpt_loss=0.348, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1688/6434 [3:57:46<11:39:38,  8.85s/it, gpt_loss=0.348, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▌       | 1688/6434 [3:57:56<11:39:38,  8.85s/it, gpt_loss=0.297, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▋       | 1689/6434 [3:57:56<12:05:48,  9.18s/it, gpt_loss=0.297, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▋       | 1689/6434 [3:58:06<12:05:48,  9.18s/it, gpt_loss=0.259, loss_mean=0.272][A[A
+
+Train step of epoch 1:  26%|██▋       | 1690/6434 [3:58:06<12:17:16,  9.32s/it, gpt_loss=0.259, loss_mean=0.272][A[A
+
+Train step of epoch 1:  26%|██▋       | 1690/6434 [3:58:16<12:17:16,  9.32s/it, gpt_loss=0.308, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▋       | 1691/6434 [3:58:16<12:35:20,  9.56s/it, gpt_loss=0.308, loss_mean=0.275][A[A
+
+Train step of epoch 1:  26%|██▋       | 1691/6434 [3:58:25<12:35:20,  9.56s/it, gpt_loss=0.284, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▋       | 1692/6434 [3:58:25<12:26:14,  9.44s/it, gpt_loss=0.284, loss_mean=0.276][A[A
+
+Train step of epoch 1:  26%|██▋       | 1692/6434 [3:58:34<12:26:14,  9.44s/it, gpt_loss=0.247, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▋       | 1693/6434 [3:58:34<12:08:14,  9.22s/it, gpt_loss=0.247, loss_mean=0.273][A[A
+
+Train step of epoch 1:  26%|██▋       | 1693/6434 [3:58:42<12:08:14,  9.22s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▋       | 1694/6434 [3:58:42<11:46:57,  8.95s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  26%|██▋       | 1694/6434 [3:58:50<11:46:57,  8.95s/it, gpt_loss=0.275, loss_mean=0.272][A[A
+
+Train step of epoch 1:  26%|██▋       | 1695/6434 [3:58:50<11:28:12,  8.71s/it, gpt_loss=0.275, loss_mean=0.272][A[A
+[LID Router Debug] Step: 8130
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [2, 1, 3, 2, 0, 0, 4, 6, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  26%|██▋       | 1695/6434 [3:58:59<11:28:12,  8.71s/it, gpt_loss=0.214, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▋       | 1696/6434 [3:58:59<11:27:01,  8.70s/it, gpt_loss=0.214, loss_mean=0.266][A[A
+
+Train step of epoch 1:  26%|██▋       | 1696/6434 [3:59:07<11:27:01,  8.70s/it, gpt_loss=0.225, loss_mean=0.262][A[A
+
+Train step of epoch 1:  26%|██▋       | 1697/6434 [3:59:07<11:13:41,  8.53s/it, gpt_loss=0.225, loss_mean=0.262][A[A
+
+Train step of epoch 1:  26%|██▋       | 1697/6434 [3:59:15<11:13:41,  8.53s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1698/6434 [3:59:15<11:08:49,  8.47s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1698/6434 [3:59:24<11:08:49,  8.47s/it, gpt_loss=0.259, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1699/6434 [3:59:24<11:03:07,  8.40s/it, gpt_loss=0.259, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1699/6434 [3:59:32<11:03:07,  8.40s/it, gpt_loss=0.239, loss_mean=0.257][A[A
+
+Train step of epoch 1:  26%|██▋       | 1700/6434 [3:59:32<11:01:00,  8.38s/it, gpt_loss=0.239, loss_mean=0.257][A[A
+
+Train step of epoch 1:  26%|██▋       | 1700/6434 [3:59:40<11:01:00,  8.38s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1701/6434 [3:59:40<11:03:13,  8.41s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1701/6434 [3:59:50<11:03:13,  8.41s/it, gpt_loss=0.26, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  26%|██▋       | 1702/6434 [3:59:50<11:26:47,  8.71s/it, gpt_loss=0.26, loss_mean=0.259][A[A
+
+Train step of epoch 1:  26%|██▋       | 1702/6434 [3:59:59<11:26:47,  8.71s/it, gpt_loss=0.307, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▋       | 1703/6434 [3:59:59<11:30:50,  8.76s/it, gpt_loss=0.307, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▋       | 1703/6434 [4:00:07<11:30:50,  8.76s/it, gpt_loss=0.268, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▋       | 1704/6434 [4:00:07<11:17:54,  8.60s/it, gpt_loss=0.268, loss_mean=0.264][A[A
+
+Train step of epoch 1:  26%|██▋       | 1704/6434 [4:00:15<11:17:54,  8.60s/it, gpt_loss=0.242, loss_mean=0.262][A[A
+
+Train step of epoch 1:  26%|██▋       | 1705/6434 [4:00:15<11:05:13,  8.44s/it, gpt_loss=0.242, loss_mean=0.262][A[A
+[LID Router Debug] Step: 8140
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [2, 5, 0, 0, 0, 9, 9, 2, 5, 0]
+Active Experts in Batch: {0, 9, 2, 5}
+
+
+Train step of epoch 1:  26%|██▋       | 1705/6434 [4:00:24<11:05:13,  8.44s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1706/6434 [4:00:24<11:07:46,  8.47s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1706/6434 [4:00:33<11:07:46,  8.47s/it, gpt_loss=0.225, loss_mean=0.259][A[A
+
+Train step of epoch 1:  27%|██▋       | 1707/6434 [4:00:33<11:31:21,  8.78s/it, gpt_loss=0.225, loss_mean=0.259][A[A
+
+Train step of epoch 1:  27%|██▋       | 1707/6434 [4:00:41<11:31:21,  8.78s/it, gpt_loss=0.217, loss_mean=0.254][A[A
+
+Train step of epoch 1:  27%|██▋       | 1708/6434 [4:00:41<11:10:28,  8.51s/it, gpt_loss=0.217, loss_mean=0.254][A[A
+
+Train step of epoch 1:  27%|██▋       | 1708/6434 [4:00:51<11:10:28,  8.51s/it, gpt_loss=0.232, loss_mean=0.252][A[A
+
+Train step of epoch 1:  27%|██▋       | 1709/6434 [4:00:51<11:36:45,  8.85s/it, gpt_loss=0.232, loss_mean=0.252][A[A
+
+Train step of epoch 1:  27%|██▋       | 1709/6434 [4:00:59<11:36:45,  8.85s/it, gpt_loss=0.358, loss_mean=0.263][A[A
+
+Train step of epoch 1:  27%|██▋       | 1710/6434 [4:00:59<11:26:44,  8.72s/it, gpt_loss=0.358, loss_mean=0.263][A[A
+
+Train step of epoch 1:  27%|██▋       | 1710/6434 [4:01:09<11:26:44,  8.72s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1711/6434 [4:01:09<11:45:31,  8.96s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1711/6434 [4:01:17<11:45:31,  8.96s/it, gpt_loss=0.278, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1712/6434 [4:01:17<11:33:55,  8.82s/it, gpt_loss=0.278, loss_mean=0.27][A[A
+
+Train step of epoch 1:  27%|██▋       | 1712/6434 [4:01:25<11:33:55,  8.82s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1713/6434 [4:01:25<11:19:26,  8.64s/it, gpt_loss=0.248, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1713/6434 [4:01:34<11:19:26,  8.64s/it, gpt_loss=0.314, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1714/6434 [4:01:34<11:12:03,  8.54s/it, gpt_loss=0.314, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1714/6434 [4:01:43<11:12:03,  8.54s/it, gpt_loss=0.262, loss_mean=0.271][A[A
+
+Train step of epoch 1:  27%|██▋       | 1715/6434 [4:01:43<11:32:48,  8.81s/it, gpt_loss=0.262, loss_mean=0.271][A[A
+[LID Router Debug] Step: 8150
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [4, 1, 4, 5, 3, 5, 9, 3, 1, 4]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  27%|██▋       | 1715/6434 [4:01:50<11:32:48,  8.81s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1716/6434 [4:01:50<10:57:49,  8.37s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1716/6434 [4:01:59<10:57:49,  8.37s/it, gpt_loss=0.28, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1717/6434 [4:01:59<11:06:40,  8.48s/it, gpt_loss=0.28, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1717/6434 [4:02:07<11:06:40,  8.48s/it, gpt_loss=0.281, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1718/6434 [4:02:07<10:52:37,  8.30s/it, gpt_loss=0.281, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1718/6434 [4:02:16<10:52:37,  8.30s/it, gpt_loss=0.22, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1719/6434 [4:02:16<10:58:58,  8.39s/it, gpt_loss=0.22, loss_mean=0.264][A[A
+
+Train step of epoch 1:  27%|██▋       | 1719/6434 [4:02:25<10:58:58,  8.39s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1720/6434 [4:02:25<11:21:54,  8.68s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1720/6434 [4:02:33<11:21:54,  8.68s/it, gpt_loss=0.326, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1721/6434 [4:02:33<11:00:10,  8.40s/it, gpt_loss=0.326, loss_mean=0.269][A[A
+
+Train step of epoch 1:  27%|██▋       | 1721/6434 [4:02:42<11:00:10,  8.40s/it, gpt_loss=0.208, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1722/6434 [4:02:42<11:25:31,  8.73s/it, gpt_loss=0.208, loss_mean=0.262][A[A
+
+Train step of epoch 1:  27%|██▋       | 1722/6434 [4:02:51<11:25:31,  8.73s/it, gpt_loss=0.312, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1723/6434 [4:02:51<11:24:42,  8.72s/it, gpt_loss=0.312, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1723/6434 [4:02:59<11:24:42,  8.72s/it, gpt_loss=0.302, loss_mean=0.271][A[A
+
+Train step of epoch 1:  27%|██▋       | 1724/6434 [4:02:59<11:17:07,  8.63s/it, gpt_loss=0.302, loss_mean=0.271][A[A
+
+Train step of epoch 1:  27%|██▋       | 1724/6434 [4:03:08<11:17:07,  8.63s/it, gpt_loss=0.32, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1725/6434 [4:03:08<11:09:37,  8.53s/it, gpt_loss=0.32, loss_mean=0.276][A[A
+[LID Router Debug] Step: 8160
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [1, 4, 6, 5, 1, 5, 5, 2, 3, 5]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  27%|██▋       | 1725/6434 [4:03:16<11:09:37,  8.53s/it, gpt_loss=0.235, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1726/6434 [4:03:16<11:06:08,  8.49s/it, gpt_loss=0.235, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1726/6434 [4:03:24<11:06:08,  8.49s/it, gpt_loss=0.231, loss_mean=0.268][A[A
+
+Train step of epoch 1:  27%|██▋       | 1727/6434 [4:03:24<10:55:11,  8.35s/it, gpt_loss=0.231, loss_mean=0.268][A[A
+
+Train step of epoch 1:  27%|██▋       | 1727/6434 [4:03:32<10:55:11,  8.35s/it, gpt_loss=0.249, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1728/6434 [4:03:32<10:54:48,  8.35s/it, gpt_loss=0.249, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1728/6434 [4:03:40<10:54:48,  8.35s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1729/6434 [4:03:40<10:47:51,  8.26s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  27%|██▋       | 1729/6434 [4:03:49<10:47:51,  8.26s/it, gpt_loss=0.355, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1730/6434 [4:03:49<10:52:27,  8.32s/it, gpt_loss=0.355, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1730/6434 [4:03:57<10:52:27,  8.32s/it, gpt_loss=0.311, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1731/6434 [4:03:57<10:41:41,  8.19s/it, gpt_loss=0.311, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1731/6434 [4:04:04<10:41:41,  8.19s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  27%|██▋       | 1732/6434 [4:04:04<10:21:18,  7.93s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  27%|██▋       | 1732/6434 [4:04:14<10:21:18,  7.93s/it, gpt_loss=0.315, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1733/6434 [4:04:14<11:06:20,  8.50s/it, gpt_loss=0.315, loss_mean=0.28][A[A
+
+Train step of epoch 1:  27%|██▋       | 1733/6434 [4:04:23<11:06:20,  8.50s/it, gpt_loss=0.221, loss_mean=0.274][A[A
+
+Train step of epoch 1:  27%|██▋       | 1734/6434 [4:04:23<11:23:55,  8.73s/it, gpt_loss=0.221, loss_mean=0.274][A[A
+
+Train step of epoch 1:  27%|██▋       | 1734/6434 [4:04:32<11:23:55,  8.73s/it, gpt_loss=0.288, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1735/6434 [4:04:32<11:35:17,  8.88s/it, gpt_loss=0.288, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8170
+Batch Size: 10
+Audio Batch Size: 146
+LID Assignments: [3, 3, 2, 3, 0, 5, 2, 2, 2, 9]
+Active Experts in Batch: {0, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  27%|██▋       | 1735/6434 [4:04:42<11:35:17,  8.88s/it, gpt_loss=0.233, loss_mean=0.271][A[A
+
+Train step of epoch 1:  27%|██▋       | 1736/6434 [4:04:42<11:43:22,  8.98s/it, gpt_loss=0.233, loss_mean=0.271][A[A
+
+Train step of epoch 1:  27%|██▋       | 1736/6434 [4:04:52<11:43:22,  8.98s/it, gpt_loss=0.259, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1737/6434 [4:04:52<12:07:24,  9.29s/it, gpt_loss=0.259, loss_mean=0.27][A[A
+
+Train step of epoch 1:  27%|██▋       | 1737/6434 [4:04:59<12:07:24,  9.29s/it, gpt_loss=0.254, loss_mean=0.268][A[A
+
+Train step of epoch 1:  27%|██▋       | 1738/6434 [4:04:59<11:32:27,  8.85s/it, gpt_loss=0.254, loss_mean=0.268][A[A
+
+Train step of epoch 1:  27%|██▋       | 1738/6434 [4:05:07<11:32:27,  8.85s/it, gpt_loss=0.306, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1739/6434 [4:05:07<11:09:07,  8.55s/it, gpt_loss=0.306, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1739/6434 [4:05:15<11:09:07,  8.55s/it, gpt_loss=0.23, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1740/6434 [4:05:15<10:55:13,  8.38s/it, gpt_loss=0.23, loss_mean=0.268][A[A
+
+Train step of epoch 1:  27%|██▋       | 1740/6434 [4:05:23<10:55:13,  8.38s/it, gpt_loss=0.199, loss_mean=0.261][A[A
+
+Train step of epoch 1:  27%|██▋       | 1741/6434 [4:05:23<10:46:19,  8.26s/it, gpt_loss=0.199, loss_mean=0.261][A[A
+
+Train step of epoch 1:  27%|██▋       | 1741/6434 [4:05:31<10:46:19,  8.26s/it, gpt_loss=0.254, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1742/6434 [4:05:31<10:43:25,  8.23s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  27%|██▋       | 1742/6434 [4:05:41<10:43:25,  8.23s/it, gpt_loss=0.308, loss_mean=0.265][A[A
+
+Train step of epoch 1:  27%|██▋       | 1743/6434 [4:05:41<11:23:24,  8.74s/it, gpt_loss=0.308, loss_mean=0.265][A[A
+
+Train step of epoch 1:  27%|██▋       | 1743/6434 [4:05:49<11:23:24,  8.74s/it, gpt_loss=0.283, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1744/6434 [4:05:49<11:03:23,  8.49s/it, gpt_loss=0.283, loss_mean=0.267][A[A
+
+Train step of epoch 1:  27%|██▋       | 1744/6434 [4:05:58<11:03:23,  8.49s/it, gpt_loss=0.24, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1745/6434 [4:05:58<11:04:32,  8.50s/it, gpt_loss=0.24, loss_mean=0.264][A[A
+[LID Router Debug] Step: 8180
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 3, 4, 2, 2, 5, 9, 7, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 7, 9}
+
+
+Train step of epoch 1:  27%|██▋       | 1745/6434 [4:06:05<11:04:32,  8.50s/it, gpt_loss=0.324, loss_mean=0.27][A[A
+
+Train step of epoch 1:  27%|██▋       | 1746/6434 [4:06:05<10:34:43,  8.12s/it, gpt_loss=0.324, loss_mean=0.27][A[A
+
+Train step of epoch 1:  27%|██▋       | 1746/6434 [4:06:13<10:34:43,  8.12s/it, gpt_loss=0.309, loss_mean=0.274][A[A
+
+Train step of epoch 1:  27%|██▋       | 1747/6434 [4:06:13<10:39:50,  8.19s/it, gpt_loss=0.309, loss_mean=0.274][A[A
+
+Train step of epoch 1:  27%|██▋       | 1747/6434 [4:06:23<10:39:50,  8.19s/it, gpt_loss=0.263, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1748/6434 [4:06:23<11:00:58,  8.46s/it, gpt_loss=0.263, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1748/6434 [4:06:30<11:00:58,  8.46s/it, gpt_loss=0.425, loss_mean=0.288][A[A
+
+Train step of epoch 1:  27%|██▋       | 1749/6434 [4:06:30<10:35:07,  8.13s/it, gpt_loss=0.425, loss_mean=0.288][A[A
+
+Train step of epoch 1:  27%|██▋       | 1749/6434 [4:06:38<10:35:07,  8.13s/it, gpt_loss=0.313, loss_mean=0.291][A[A
+
+Train step of epoch 1:  27%|██▋       | 1750/6434 [4:06:38<10:46:24,  8.28s/it, gpt_loss=0.313, loss_mean=0.291][A[A
+
+Train step of epoch 1:  27%|██▋       | 1750/6434 [4:06:48<10:46:24,  8.28s/it, gpt_loss=0.242, loss_mean=0.286][A[A
+
+Train step of epoch 1:  27%|██▋       | 1751/6434 [4:06:48<11:04:16,  8.51s/it, gpt_loss=0.242, loss_mean=0.286][A[A
+
+Train step of epoch 1:  27%|██▋       | 1751/6434 [4:06:55<11:04:16,  8.51s/it, gpt_loss=0.26, loss_mean=0.283] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1752/6434 [4:06:55<10:40:05,  8.20s/it, gpt_loss=0.26, loss_mean=0.283][A[A
+
+Train step of epoch 1:  27%|██▋       | 1752/6434 [4:07:03<10:40:05,  8.20s/it, gpt_loss=0.251, loss_mean=0.28][A[A
+
+Train step of epoch 1:  27%|██▋       | 1753/6434 [4:07:03<10:27:27,  8.04s/it, gpt_loss=0.251, loss_mean=0.28][A[A
+
+Train step of epoch 1:  27%|██▋       | 1753/6434 [4:07:12<10:27:27,  8.04s/it, gpt_loss=0.336, loss_mean=0.286][A[A
+
+Train step of epoch 1:  27%|██▋       | 1754/6434 [4:07:12<10:57:26,  8.43s/it, gpt_loss=0.336, loss_mean=0.286][A[A
+
+Train step of epoch 1:  27%|██▋       | 1754/6434 [4:07:21<10:57:26,  8.43s/it, gpt_loss=0.319, loss_mean=0.289][A[A
+
+Train step of epoch 1:  27%|██▋       | 1755/6434 [4:07:21<11:02:03,  8.49s/it, gpt_loss=0.319, loss_mean=0.289][A[A
+[LID Router Debug] Step: 8190
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [10, 4, 5, 1, 4, 4, 2, 5, 1, 4]
+Active Experts in Batch: {1, 2, 4, 5, 10}
+
+
+Train step of epoch 1:  27%|██▋       | 1755/6434 [4:07:29<11:02:03,  8.49s/it, gpt_loss=0.229, loss_mean=0.283][A[A
+
+Train step of epoch 1:  27%|██▋       | 1756/6434 [4:07:29<11:06:46,  8.55s/it, gpt_loss=0.229, loss_mean=0.283][A[A
+
+Train step of epoch 1:  27%|██▋       | 1756/6434 [4:07:38<11:06:46,  8.55s/it, gpt_loss=0.23, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1757/6434 [4:07:38<11:08:24,  8.57s/it, gpt_loss=0.23, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1757/6434 [4:07:46<11:08:24,  8.57s/it, gpt_loss=0.258, loss_mean=0.276][A[A
+
+Train step of epoch 1:  27%|██▋       | 1758/6434 [4:07:46<10:52:43,  8.38s/it, gpt_loss=0.258, loss_mean=0.276][A[A
+
+Train step of epoch 1:  27%|██▋       | 1758/6434 [4:07:54<10:52:43,  8.38s/it, gpt_loss=0.265, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1759/6434 [4:07:54<10:44:27,  8.27s/it, gpt_loss=0.265, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1759/6434 [4:08:02<10:44:27,  8.27s/it, gpt_loss=0.256, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1760/6434 [4:08:02<10:43:43,  8.26s/it, gpt_loss=0.256, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1760/6434 [4:08:10<10:43:43,  8.26s/it, gpt_loss=0.323, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1761/6434 [4:08:10<10:34:53,  8.15s/it, gpt_loss=0.323, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1761/6434 [4:08:19<10:34:53,  8.15s/it, gpt_loss=0.343, loss_mean=0.284][A[A
+
+Train step of epoch 1:  27%|██▋       | 1762/6434 [4:08:19<10:49:35,  8.34s/it, gpt_loss=0.343, loss_mean=0.284][A[A
+
+Train step of epoch 1:  27%|██▋       | 1762/6434 [4:08:29<10:49:35,  8.34s/it, gpt_loss=0.255, loss_mean=0.281][A[A
+
+Train step of epoch 1:  27%|██▋       | 1763/6434 [4:08:29<11:36:10,  8.94s/it, gpt_loss=0.255, loss_mean=0.281][A[A
+
+Train step of epoch 1:  27%|██▋       | 1763/6434 [4:08:38<11:36:10,  8.94s/it, gpt_loss=0.242, loss_mean=0.277][A[A
+
+Train step of epoch 1:  27%|██▋       | 1764/6434 [4:08:38<11:25:23,  8.81s/it, gpt_loss=0.242, loss_mean=0.277][A[A
+
+Train step of epoch 1:  27%|██▋       | 1764/6434 [4:08:46<11:25:23,  8.81s/it, gpt_loss=0.249, loss_mean=0.275][A[A
+
+Train step of epoch 1:  27%|██▋       | 1765/6434 [4:08:46<11:15:24,  8.68s/it, gpt_loss=0.249, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8200
+Batch Size: 10
+Audio Batch Size: 109
+LID Assignments: [1, 3, 9, 1, 4, 5, 4, 5, 3, 6]
+Active Experts in Batch: {1, 3, 4, 5, 6, 9}
+[2026-02-07 11:10:41,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=0, lr=[1.2517716461556719e-05, 1.2517716461556719e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 11:10:41,468] [INFO] [timer.py:260:stop] epoch=0/micro_step=8200/global_step=4100, RunningAvgSamplesPerSec=4.745372638078939, CurrSamplesPerSec=4.7963520634325905, MemAllocated=12.58GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  27%|██▋       | 1765/6434 [4:08:54<11:15:24,  8.68s/it, gpt_loss=0.232, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  27%|██▋       | 1766/6434 [4:08:54<11:07:14,  8.58s/it, gpt_loss=0.232, loss_mean=0.27][A[A
+
+Train step of epoch 1:  27%|██▋       | 1766/6434 [4:09:03<11:07:14,  8.58s/it, gpt_loss=0.293, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1767/6434 [4:09:03<11:14:32,  8.67s/it, gpt_loss=0.293, loss_mean=0.273][A[A
+
+Train step of epoch 1:  27%|██▋       | 1767/6434 [4:09:11<11:14:32,  8.67s/it, gpt_loss=0.326, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1768/6434 [4:09:11<11:02:24,  8.52s/it, gpt_loss=0.326, loss_mean=0.278][A[A
+
+Train step of epoch 1:  27%|██▋       | 1768/6434 [4:09:19<11:02:24,  8.52s/it, gpt_loss=0.224, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1769/6434 [4:09:19<10:45:45,  8.31s/it, gpt_loss=0.224, loss_mean=0.272][A[A
+
+Train step of epoch 1:  27%|██▋       | 1769/6434 [4:09:28<10:45:45,  8.31s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1770/6434 [4:09:28<10:50:30,  8.37s/it, gpt_loss=0.229, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1770/6434 [4:09:36<10:50:30,  8.37s/it, gpt_loss=0.284, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1771/6434 [4:09:36<10:44:39,  8.29s/it, gpt_loss=0.284, loss_mean=0.27][A[A
+
+Train step of epoch 1:  28%|██▊       | 1771/6434 [4:09:45<10:44:39,  8.29s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  28%|██▊       | 1772/6434 [4:09:45<11:10:01,  8.62s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  28%|██▊       | 1772/6434 [4:09:53<11:10:01,  8.62s/it, gpt_loss=0.27, loss_mean=0.272] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1773/6434 [4:09:53<10:45:05,  8.30s/it, gpt_loss=0.27, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1773/6434 [4:10:01<10:45:05,  8.30s/it, gpt_loss=0.281, loss_mean=0.273][A[A
+
+Train step of epoch 1:  28%|██▊       | 1774/6434 [4:10:01<10:44:50,  8.30s/it, gpt_loss=0.281, loss_mean=0.273][A[A
+
+Train step of epoch 1:  28%|██▊       | 1774/6434 [4:10:09<10:44:50,  8.30s/it, gpt_loss=0.261, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1775/6434 [4:10:09<10:40:02,  8.24s/it, gpt_loss=0.261, loss_mean=0.272][A[A
+[LID Router Debug] Step: 8210
+Batch Size: 10
+Audio Batch Size: 104
+LID Assignments: [6, 5, 5, 1, 9, 0, 3, 9, 1, 9]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  28%|██▊       | 1775/6434 [4:10:17<10:40:02,  8.24s/it, gpt_loss=0.231, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1776/6434 [4:10:17<10:17:12,  7.95s/it, gpt_loss=0.231, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1776/6434 [4:10:24<10:17:12,  7.95s/it, gpt_loss=0.296, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1777/6434 [4:10:24<10:00:44,  7.74s/it, gpt_loss=0.296, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1777/6434 [4:10:33<10:00:44,  7.74s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1778/6434 [4:10:33<10:29:04,  8.11s/it, gpt_loss=0.278, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1778/6434 [4:10:42<10:29:04,  8.11s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1779/6434 [4:10:42<10:49:56,  8.38s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1779/6434 [4:10:50<10:49:56,  8.38s/it, gpt_loss=0.209, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1780/6434 [4:10:50<10:47:53,  8.35s/it, gpt_loss=0.209, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1780/6434 [4:11:00<10:47:53,  8.35s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+
+Train step of epoch 1:  28%|██▊       | 1781/6434 [4:11:00<11:16:01,  8.72s/it, gpt_loss=0.243, loss_mean=0.263][A[A
+
+Train step of epoch 1:  28%|██▊       | 1781/6434 [4:11:08<11:16:01,  8.72s/it, gpt_loss=0.194, loss_mean=0.256][A[A
+
+Train step of epoch 1:  28%|██▊       | 1782/6434 [4:11:08<11:05:55,  8.59s/it, gpt_loss=0.194, loss_mean=0.256][A[A
+
+Train step of epoch 1:  28%|██▊       | 1782/6434 [4:11:17<11:05:55,  8.59s/it, gpt_loss=0.351, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1783/6434 [4:11:17<11:09:03,  8.63s/it, gpt_loss=0.351, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1783/6434 [4:11:25<11:09:03,  8.63s/it, gpt_loss=0.321, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1784/6434 [4:11:25<11:00:43,  8.53s/it, gpt_loss=0.321, loss_mean=0.271][A[A
+
+Train step of epoch 1:  28%|██▊       | 1784/6434 [4:11:32<11:00:43,  8.53s/it, gpt_loss=0.307, loss_mean=0.275][A[A
+
+Train step of epoch 1:  28%|██▊       | 1785/6434 [4:11:32<10:30:11,  8.13s/it, gpt_loss=0.307, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8220
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [0, 4, 4, 5, 1, 1, 5, 0, 6, 1]
+Active Experts in Batch: {0, 1, 4, 5, 6}
+
+
+Train step of epoch 1:  28%|██▊       | 1785/6434 [4:11:41<10:30:11,  8.13s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1786/6434 [4:11:41<10:58:06,  8.50s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1786/6434 [4:11:50<10:58:06,  8.50s/it, gpt_loss=0.298, loss_mean=0.276][A[A
+
+Train step of epoch 1:  28%|██▊       | 1787/6434 [4:11:50<10:54:59,  8.46s/it, gpt_loss=0.298, loss_mean=0.276][A[A
+
+Train step of epoch 1:  28%|██▊       | 1787/6434 [4:11:58<10:54:59,  8.46s/it, gpt_loss=0.183, loss_mean=0.267][A[A
+
+Train step of epoch 1:  28%|██▊       | 1788/6434 [4:11:58<10:43:55,  8.32s/it, gpt_loss=0.183, loss_mean=0.267][A[A
+
+Train step of epoch 1:  28%|██▊       | 1788/6434 [4:12:07<10:43:55,  8.32s/it, gpt_loss=0.252, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1789/6434 [4:12:07<11:12:16,  8.68s/it, gpt_loss=0.252, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1789/6434 [4:12:16<11:12:16,  8.68s/it, gpt_loss=0.345, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1790/6434 [4:12:16<11:04:10,  8.58s/it, gpt_loss=0.345, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1790/6434 [4:12:25<11:04:10,  8.58s/it, gpt_loss=0.328, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1791/6434 [4:12:25<11:17:10,  8.75s/it, gpt_loss=0.328, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1791/6434 [4:12:33<11:17:10,  8.75s/it, gpt_loss=0.303, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1792/6434 [4:12:33<11:12:13,  8.69s/it, gpt_loss=0.303, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1792/6434 [4:12:42<11:12:13,  8.69s/it, gpt_loss=0.203, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1793/6434 [4:12:42<11:02:47,  8.57s/it, gpt_loss=0.203, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1793/6434 [4:12:51<11:02:47,  8.57s/it, gpt_loss=0.325, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1794/6434 [4:12:51<11:14:47,  8.73s/it, gpt_loss=0.325, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1794/6434 [4:12:58<11:14:47,  8.73s/it, gpt_loss=0.422, loss_mean=0.293][A[A
+
+Train step of epoch 1:  28%|██▊       | 1795/6434 [4:12:58<10:38:29,  8.26s/it, gpt_loss=0.422, loss_mean=0.293][A[A
+[LID Router Debug] Step: 8230
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [5, 5, 3, 2, 4, 2, 3, 4, 4, 6]
+Active Experts in Batch: {2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  28%|██▊       | 1795/6434 [4:13:05<10:38:29,  8.26s/it, gpt_loss=0.344, loss_mean=0.298][A[A
+
+Train step of epoch 1:  28%|██▊       | 1796/6434 [4:13:05<10:17:41,  7.99s/it, gpt_loss=0.344, loss_mean=0.298][A[A
+
+Train step of epoch 1:  28%|██▊       | 1796/6434 [4:13:15<10:17:41,  7.99s/it, gpt_loss=0.235, loss_mean=0.292][A[A
+
+Train step of epoch 1:  28%|██▊       | 1797/6434 [4:13:15<10:45:56,  8.36s/it, gpt_loss=0.235, loss_mean=0.292][A[A
+
+Train step of epoch 1:  28%|██▊       | 1797/6434 [4:13:23<10:45:56,  8.36s/it, gpt_loss=0.313, loss_mean=0.294][A[A
+
+Train step of epoch 1:  28%|██▊       | 1798/6434 [4:13:23<10:40:18,  8.29s/it, gpt_loss=0.313, loss_mean=0.294][A[A
+
+Train step of epoch 1:  28%|██▊       | 1798/6434 [4:13:31<10:40:18,  8.29s/it, gpt_loss=0.263, loss_mean=0.291][A[A
+
+Train step of epoch 1:  28%|██▊       | 1799/6434 [4:13:31<10:36:56,  8.25s/it, gpt_loss=0.263, loss_mean=0.291][A[A
+
+Train step of epoch 1:  28%|██▊       | 1799/6434 [4:13:38<10:36:56,  8.25s/it, gpt_loss=0.295, loss_mean=0.291][A[A
+
+Train step of epoch 1:  28%|██▊       | 1800/6434 [4:13:38<10:22:25,  8.06s/it, gpt_loss=0.295, loss_mean=0.291][A[A
+
+Train step of epoch 1:  28%|██▊       | 1800/6434 [4:13:46<10:22:25,  8.06s/it, gpt_loss=0.262, loss_mean=0.288][A[A
+
+Train step of epoch 1:  28%|██▊       | 1801/6434 [4:13:46<10:19:04,  8.02s/it, gpt_loss=0.262, loss_mean=0.288][A[A
+
+Train step of epoch 1:  28%|██▊       | 1801/6434 [4:13:54<10:19:04,  8.02s/it, gpt_loss=0.264, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1802/6434 [4:13:54<10:13:41,  7.95s/it, gpt_loss=0.264, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1802/6434 [4:14:02<10:13:41,  7.95s/it, gpt_loss=0.282, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1803/6434 [4:14:02<10:15:10,  7.97s/it, gpt_loss=0.282, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1803/6434 [4:14:10<10:15:10,  7.97s/it, gpt_loss=0.295, loss_mean=0.287][A[A
+
+Train step of epoch 1:  28%|██▊       | 1804/6434 [4:14:10<10:23:36,  8.08s/it, gpt_loss=0.295, loss_mean=0.287][A[A
+
+Train step of epoch 1:  28%|██▊       | 1804/6434 [4:14:18<10:23:36,  8.08s/it, gpt_loss=0.226, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1805/6434 [4:14:18<10:16:42,  7.99s/it, gpt_loss=0.226, loss_mean=0.281][A[A
+[LID Router Debug] Step: 8240
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [3, 5, 6, 2, 1, 0, 3, 2, 2, 9]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  28%|██▊       | 1805/6434 [4:14:27<10:16:42,  7.99s/it, gpt_loss=0.338, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1806/6434 [4:14:27<10:37:36,  8.27s/it, gpt_loss=0.338, loss_mean=0.286][A[A
+
+Train step of epoch 1:  28%|██▊       | 1806/6434 [4:14:37<10:37:36,  8.27s/it, gpt_loss=0.223, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1807/6434 [4:14:37<11:07:48,  8.66s/it, gpt_loss=0.223, loss_mean=0.28][A[A
+
+Train step of epoch 1:  28%|██▊       | 1807/6434 [4:14:44<11:07:48,  8.66s/it, gpt_loss=0.29, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1808/6434 [4:14:44<10:44:15,  8.36s/it, gpt_loss=0.29, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1808/6434 [4:14:52<10:44:15,  8.36s/it, gpt_loss=0.319, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1809/6434 [4:14:52<10:37:54,  8.28s/it, gpt_loss=0.319, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1809/6434 [4:15:00<10:37:54,  8.28s/it, gpt_loss=0.289, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1810/6434 [4:15:00<10:22:46,  8.08s/it, gpt_loss=0.289, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1810/6434 [4:15:08<10:22:46,  8.08s/it, gpt_loss=0.262, loss_mean=0.283][A[A
+
+Train step of epoch 1:  28%|██▊       | 1811/6434 [4:15:08<10:23:47,  8.10s/it, gpt_loss=0.262, loss_mean=0.283][A[A
+
+Train step of epoch 1:  28%|██▊       | 1811/6434 [4:15:17<10:23:47,  8.10s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1812/6434 [4:15:17<10:46:07,  8.39s/it, gpt_loss=0.279, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1812/6434 [4:15:26<10:46:07,  8.39s/it, gpt_loss=0.313, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1813/6434 [4:15:26<10:45:43,  8.38s/it, gpt_loss=0.313, loss_mean=0.285][A[A
+
+Train step of epoch 1:  28%|██▊       | 1813/6434 [4:15:34<10:45:43,  8.38s/it, gpt_loss=0.249, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1814/6434 [4:15:34<10:32:42,  8.22s/it, gpt_loss=0.249, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1814/6434 [4:15:41<10:32:42,  8.22s/it, gpt_loss=0.271, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1815/6434 [4:15:41<10:21:41,  8.08s/it, gpt_loss=0.271, loss_mean=0.281][A[A
+[LID Router Debug] Step: 8250
+Batch Size: 10
+Audio Batch Size: 143
+LID Assignments: [9, 0, 9, 0, 4, 1, 3, 2, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  28%|██▊       | 1815/6434 [4:15:50<10:21:41,  8.08s/it, gpt_loss=0.26, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1816/6434 [4:15:50<10:46:58,  8.41s/it, gpt_loss=0.26, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1816/6434 [4:16:00<10:46:58,  8.41s/it, gpt_loss=0.328, loss_mean=0.284][A[A
+
+Train step of epoch 1:  28%|██▊       | 1817/6434 [4:16:00<11:04:56,  8.64s/it, gpt_loss=0.328, loss_mean=0.284][A[A
+
+Train step of epoch 1:  28%|██▊       | 1817/6434 [4:16:08<11:04:56,  8.64s/it, gpt_loss=0.244, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1818/6434 [4:16:08<10:49:00,  8.44s/it, gpt_loss=0.244, loss_mean=0.28][A[A
+
+Train step of epoch 1:  28%|██▊       | 1818/6434 [4:16:16<10:49:00,  8.44s/it, gpt_loss=0.291, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1819/6434 [4:16:16<10:58:42,  8.56s/it, gpt_loss=0.291, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1819/6434 [4:16:25<10:58:42,  8.56s/it, gpt_loss=0.291, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1820/6434 [4:16:25<10:54:09,  8.51s/it, gpt_loss=0.291, loss_mean=0.282][A[A
+
+Train step of epoch 1:  28%|██▊       | 1820/6434 [4:16:34<10:54:09,  8.51s/it, gpt_loss=0.258, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1821/6434 [4:16:34<11:00:31,  8.59s/it, gpt_loss=0.258, loss_mean=0.279][A[A
+
+Train step of epoch 1:  28%|██▊       | 1821/6434 [4:16:43<11:00:31,  8.59s/it, gpt_loss=0.282, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1822/6434 [4:16:43<11:09:23,  8.71s/it, gpt_loss=0.282, loss_mean=0.28][A[A
+
+Train step of epoch 1:  28%|██▊       | 1822/6434 [4:16:51<11:09:23,  8.71s/it, gpt_loss=0.296, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1823/6434 [4:16:51<10:58:57,  8.57s/it, gpt_loss=0.296, loss_mean=0.281][A[A
+
+Train step of epoch 1:  28%|██▊       | 1823/6434 [4:17:00<10:58:57,  8.57s/it, gpt_loss=0.205, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1824/6434 [4:17:00<11:20:12,  8.85s/it, gpt_loss=0.205, loss_mean=0.274][A[A
+
+Train step of epoch 1:  28%|██▊       | 1824/6434 [4:17:08<11:20:12,  8.85s/it, gpt_loss=0.282, loss_mean=0.275][A[A
+
+Train step of epoch 1:  28%|██▊       | 1825/6434 [4:17:08<10:51:58,  8.49s/it, gpt_loss=0.282, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8260
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 4, 0, 3, 4, 5, 1, 0, 0, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  28%|██▊       | 1825/6434 [4:17:16<10:51:58,  8.49s/it, gpt_loss=0.187, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1826/6434 [4:17:16<10:39:01,  8.32s/it, gpt_loss=0.187, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1826/6434 [4:17:26<10:39:01,  8.32s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1827/6434 [4:17:26<11:10:41,  8.73s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  28%|██▊       | 1827/6434 [4:17:33<11:10:41,  8.73s/it, gpt_loss=0.274, loss_mean=0.267][A[A
+
+Train step of epoch 1:  28%|██▊       | 1828/6434 [4:17:33<10:42:03,  8.36s/it, gpt_loss=0.274, loss_mean=0.267][A[A
+
+Train step of epoch 1:  28%|██▊       | 1828/6434 [4:17:42<10:42:03,  8.36s/it, gpt_loss=0.27, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1829/6434 [4:17:42<10:54:43,  8.53s/it, gpt_loss=0.27, loss_mean=0.267][A[A
+
+Train step of epoch 1:  28%|██▊       | 1829/6434 [4:17:51<10:54:43,  8.53s/it, gpt_loss=0.243, loss_mean=0.264][A[A
+
+Train step of epoch 1:  28%|██▊       | 1830/6434 [4:17:51<11:15:35,  8.80s/it, gpt_loss=0.243, loss_mean=0.264][A[A
+
+Train step of epoch 1:  28%|██▊       | 1830/6434 [4:18:00<11:15:35,  8.80s/it, gpt_loss=0.321, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  28%|██▊       | 1831/6434 [4:18:00<11:04:04,  8.66s/it, gpt_loss=0.321, loss_mean=0.27][A[A
+
+Train step of epoch 1:  28%|██▊       | 1831/6434 [4:18:08<11:04:04,  8.66s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1832/6434 [4:18:08<10:54:15,  8.53s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:  28%|██▊       | 1832/6434 [4:18:16<10:54:15,  8.53s/it, gpt_loss=0.307, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1833/6434 [4:18:16<10:46:13,  8.43s/it, gpt_loss=0.307, loss_mean=0.272][A[A
+
+Train step of epoch 1:  28%|██▊       | 1833/6434 [4:18:24<10:46:13,  8.43s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  29%|██▊       | 1834/6434 [4:18:24<10:38:53,  8.33s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  29%|██▊       | 1834/6434 [4:18:32<10:38:53,  8.33s/it, gpt_loss=0.325, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▊       | 1835/6434 [4:18:32<10:26:22,  8.17s/it, gpt_loss=0.325, loss_mean=0.277][A[A
+[LID Router Debug] Step: 8270
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [2, 9, 6, 6, 3, 2, 2, 9, 9, 4]
+Active Experts in Batch: {2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  29%|██▊       | 1835/6434 [4:18:40<10:26:22,  8.17s/it, gpt_loss=0.268, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▊       | 1836/6434 [4:18:40<10:24:34,  8.15s/it, gpt_loss=0.268, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▊       | 1836/6434 [4:18:49<10:24:34,  8.15s/it, gpt_loss=0.254, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▊       | 1837/6434 [4:18:49<10:47:08,  8.45s/it, gpt_loss=0.254, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▊       | 1837/6434 [4:18:59<10:47:08,  8.45s/it, gpt_loss=0.34, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:  29%|██▊       | 1838/6434 [4:18:59<11:08:47,  8.73s/it, gpt_loss=0.34, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▊       | 1838/6434 [4:19:06<11:08:47,  8.73s/it, gpt_loss=0.262, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▊       | 1839/6434 [4:19:06<10:33:51,  8.28s/it, gpt_loss=0.262, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▊       | 1839/6434 [4:19:13<10:33:51,  8.28s/it, gpt_loss=0.263, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▊       | 1840/6434 [4:19:13<10:11:12,  7.98s/it, gpt_loss=0.263, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▊       | 1840/6434 [4:19:22<10:11:12,  7.98s/it, gpt_loss=0.313, loss_mean=0.281][A[A
+
+Train step of epoch 1:  29%|██▊       | 1841/6434 [4:19:22<10:38:22,  8.34s/it, gpt_loss=0.313, loss_mean=0.281][A[A
+
+Train step of epoch 1:  29%|██▊       | 1841/6434 [4:19:31<10:38:22,  8.34s/it, gpt_loss=0.27, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:  29%|██▊       | 1842/6434 [4:19:31<10:32:18,  8.26s/it, gpt_loss=0.27, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▊       | 1842/6434 [4:19:38<10:32:18,  8.26s/it, gpt_loss=0.276, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▊       | 1843/6434 [4:19:38<10:04:52,  7.91s/it, gpt_loss=0.276, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▊       | 1843/6434 [4:19:46<10:04:52,  7.91s/it, gpt_loss=0.251, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▊       | 1844/6434 [4:19:46<10:13:46,  8.02s/it, gpt_loss=0.251, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▊       | 1844/6434 [4:19:55<10:13:46,  8.02s/it, gpt_loss=0.269, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▊       | 1845/6434 [4:19:55<10:48:28,  8.48s/it, gpt_loss=0.269, loss_mean=0.276][A[A
+[LID Router Debug] Step: 8280
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [2, 3, 2, 2, 5, 1, 0, 1, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  29%|██▊       | 1845/6434 [4:20:03<10:48:28,  8.48s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▊       | 1846/6434 [4:20:03<10:30:01,  8.24s/it, gpt_loss=0.271, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▊       | 1846/6434 [4:20:12<10:30:01,  8.24s/it, gpt_loss=0.258, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▊       | 1847/6434 [4:20:12<10:42:26,  8.40s/it, gpt_loss=0.258, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▊       | 1847/6434 [4:20:21<10:42:26,  8.40s/it, gpt_loss=0.267, loss_mean=0.273][A[A
+
+Train step of epoch 1:  29%|██▊       | 1848/6434 [4:20:21<10:58:13,  8.61s/it, gpt_loss=0.267, loss_mean=0.273][A[A
+
+Train step of epoch 1:  29%|██▊       | 1848/6434 [4:20:30<10:58:13,  8.61s/it, gpt_loss=0.351, loss_mean=0.281][A[A
+
+Train step of epoch 1:  29%|██▊       | 1849/6434 [4:20:30<11:01:35,  8.66s/it, gpt_loss=0.351, loss_mean=0.281][A[A
+
+Train step of epoch 1:  29%|██▊       | 1849/6434 [4:20:38<11:01:35,  8.66s/it, gpt_loss=0.269, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1850/6434 [4:20:38<10:54:15,  8.56s/it, gpt_loss=0.269, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1850/6434 [4:20:47<10:54:15,  8.56s/it, gpt_loss=0.217, loss_mean=0.273][A[A
+
+Train step of epoch 1:  29%|██▉       | 1851/6434 [4:20:47<11:00:15,  8.64s/it, gpt_loss=0.217, loss_mean=0.273][A[A
+
+Train step of epoch 1:  29%|██▉       | 1851/6434 [4:20:54<11:00:15,  8.64s/it, gpt_loss=0.344, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1852/6434 [4:20:54<10:33:39,  8.30s/it, gpt_loss=0.344, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▉       | 1852/6434 [4:21:03<10:33:39,  8.30s/it, gpt_loss=0.28, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1853/6434 [4:21:03<10:35:52,  8.33s/it, gpt_loss=0.28, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▉       | 1853/6434 [4:21:12<10:35:52,  8.33s/it, gpt_loss=0.236, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▉       | 1854/6434 [4:21:12<11:02:39,  8.68s/it, gpt_loss=0.236, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▉       | 1854/6434 [4:21:20<11:02:39,  8.68s/it, gpt_loss=0.309, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1855/6434 [4:21:20<10:40:27,  8.39s/it, gpt_loss=0.309, loss_mean=0.279][A[A
+[LID Router Debug] Step: 8290
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [5, 0, 6, 6, 1, 0, 9, 4, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  29%|██▉       | 1855/6434 [4:21:29<10:40:27,  8.39s/it, gpt_loss=0.277, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1856/6434 [4:21:29<10:45:37,  8.46s/it, gpt_loss=0.277, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1856/6434 [4:21:36<10:45:37,  8.46s/it, gpt_loss=0.355, loss_mean=0.287][A[A
+
+Train step of epoch 1:  29%|██▉       | 1857/6434 [4:21:36<10:15:06,  8.06s/it, gpt_loss=0.355, loss_mean=0.287][A[A
+
+Train step of epoch 1:  29%|██▉       | 1857/6434 [4:21:44<10:15:06,  8.06s/it, gpt_loss=0.314, loss_mean=0.289][A[A
+
+Train step of epoch 1:  29%|██▉       | 1858/6434 [4:21:44<10:08:09,  7.97s/it, gpt_loss=0.314, loss_mean=0.289][A[A
+
+Train step of epoch 1:  29%|██▉       | 1858/6434 [4:21:53<10:08:09,  7.97s/it, gpt_loss=0.257, loss_mean=0.286][A[A
+
+Train step of epoch 1:  29%|██▉       | 1859/6434 [4:21:53<10:45:39,  8.47s/it, gpt_loss=0.257, loss_mean=0.286][A[A
+
+Train step of epoch 1:  29%|██▉       | 1859/6434 [4:22:03<10:45:39,  8.47s/it, gpt_loss=0.215, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1860/6434 [4:22:03<11:17:23,  8.89s/it, gpt_loss=0.215, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1860/6434 [4:22:12<11:17:23,  8.89s/it, gpt_loss=0.266, loss_mean=0.278][A[A
+
+Train step of epoch 1:  29%|██▉       | 1861/6434 [4:22:12<11:26:31,  9.01s/it, gpt_loss=0.266, loss_mean=0.278][A[A
+
+Train step of epoch 1:  29%|██▉       | 1861/6434 [4:22:20<11:26:31,  9.01s/it, gpt_loss=0.248, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1862/6434 [4:22:20<10:45:27,  8.47s/it, gpt_loss=0.248, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1862/6434 [4:22:28<10:45:27,  8.47s/it, gpt_loss=0.222, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1863/6434 [4:22:28<10:43:41,  8.45s/it, gpt_loss=0.222, loss_mean=0.27][A[A
+
+Train step of epoch 1:  29%|██▉       | 1863/6434 [4:22:37<10:43:41,  8.45s/it, gpt_loss=0.378, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▉       | 1864/6434 [4:22:37<10:48:51,  8.52s/it, gpt_loss=0.378, loss_mean=0.28][A[A
+
+Train step of epoch 1:  29%|██▉       | 1864/6434 [4:22:46<10:48:51,  8.52s/it, gpt_loss=0.223, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1865/6434 [4:22:46<11:01:12,  8.68s/it, gpt_loss=0.223, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8300
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [0, 5, 1, 0, 5, 1, 9, 2, 5, 4]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  29%|██▉       | 1865/6434 [4:22:54<11:01:12,  8.68s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1866/6434 [4:22:54<10:44:10,  8.46s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1866/6434 [4:23:03<10:44:10,  8.46s/it, gpt_loss=0.271, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▉       | 1867/6434 [4:23:03<11:10:57,  8.81s/it, gpt_loss=0.271, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▉       | 1867/6434 [4:23:12<11:10:57,  8.81s/it, gpt_loss=0.178, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1868/6434 [4:23:12<11:17:46,  8.91s/it, gpt_loss=0.178, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1868/6434 [4:23:21<11:17:46,  8.91s/it, gpt_loss=0.288, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1869/6434 [4:23:21<11:16:13,  8.89s/it, gpt_loss=0.288, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1869/6434 [4:23:29<11:16:13,  8.89s/it, gpt_loss=0.289, loss_mean=0.269][A[A
+
+Train step of epoch 1:  29%|██▉       | 1870/6434 [4:23:29<10:48:28,  8.53s/it, gpt_loss=0.289, loss_mean=0.269][A[A
+
+Train step of epoch 1:  29%|██▉       | 1870/6434 [4:23:37<10:48:28,  8.53s/it, gpt_loss=0.237, loss_mean=0.266][A[A
+
+Train step of epoch 1:  29%|██▉       | 1871/6434 [4:23:37<10:44:32,  8.48s/it, gpt_loss=0.237, loss_mean=0.266][A[A
+
+Train step of epoch 1:  29%|██▉       | 1871/6434 [4:23:46<10:44:32,  8.48s/it, gpt_loss=0.254, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1872/6434 [4:23:46<10:55:25,  8.62s/it, gpt_loss=0.254, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1872/6434 [4:23:54<10:55:25,  8.62s/it, gpt_loss=0.356, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▉       | 1873/6434 [4:23:54<10:42:26,  8.45s/it, gpt_loss=0.356, loss_mean=0.274][A[A
+
+Train step of epoch 1:  29%|██▉       | 1873/6434 [4:24:02<10:42:26,  8.45s/it, gpt_loss=0.3, loss_mean=0.277]  [A[A
+
+Train step of epoch 1:  29%|██▉       | 1874/6434 [4:24:02<10:25:14,  8.23s/it, gpt_loss=0.3, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▉       | 1874/6434 [4:24:09<10:25:14,  8.23s/it, gpt_loss=0.305, loss_mean=0.279][A[A
+
+Train step of epoch 1:  29%|██▉       | 1875/6434 [4:24:09<9:54:45,  7.83s/it, gpt_loss=0.305, loss_mean=0.279] [A[A
+[LID Router Debug] Step: 8310
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [3, 1, 2, 0, 5, 1, 2, 9, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  29%|██▉       | 1875/6434 [4:24:18<9:54:45,  7.83s/it, gpt_loss=0.252, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▉       | 1876/6434 [4:24:18<10:24:54,  8.23s/it, gpt_loss=0.252, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▉       | 1876/6434 [4:24:25<10:24:54,  8.23s/it, gpt_loss=0.214, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1877/6434 [4:24:25<9:56:48,  7.86s/it, gpt_loss=0.214, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1877/6434 [4:24:33<9:56:48,  7.86s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:  29%|██▉       | 1878/6434 [4:24:33<9:57:26,  7.87s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:  29%|██▉       | 1878/6434 [4:24:41<9:57:26,  7.87s/it, gpt_loss=0.329, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▉       | 1879/6434 [4:24:41<10:11:16,  8.05s/it, gpt_loss=0.329, loss_mean=0.277][A[A
+
+Train step of epoch 1:  29%|██▉       | 1879/6434 [4:24:50<10:11:16,  8.05s/it, gpt_loss=0.267, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▉       | 1880/6434 [4:24:50<10:18:43,  8.15s/it, gpt_loss=0.267, loss_mean=0.276][A[A
+
+Train step of epoch 1:  29%|██▉       | 1880/6434 [4:24:58<10:18:43,  8.15s/it, gpt_loss=0.263, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1881/6434 [4:24:58<10:16:01,  8.12s/it, gpt_loss=0.263, loss_mean=0.275][A[A
+
+Train step of epoch 1:  29%|██▉       | 1881/6434 [4:25:06<10:16:01,  8.12s/it, gpt_loss=0.226, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1882/6434 [4:25:06<10:17:06,  8.13s/it, gpt_loss=0.226, loss_mean=0.27][A[A
+
+Train step of epoch 1:  29%|██▉       | 1882/6434 [4:25:14<10:17:06,  8.13s/it, gpt_loss=0.243, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1883/6434 [4:25:14<10:08:59,  8.03s/it, gpt_loss=0.243, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1883/6434 [4:25:24<10:08:59,  8.03s/it, gpt_loss=0.28, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1884/6434 [4:25:24<10:52:41,  8.61s/it, gpt_loss=0.28, loss_mean=0.269][A[A
+
+Train step of epoch 1:  29%|██▉       | 1884/6434 [4:25:34<10:52:41,  8.61s/it, gpt_loss=0.187, loss_mean=0.26][A[A
+
+Train step of epoch 1:  29%|██▉       | 1885/6434 [4:25:34<11:26:13,  9.05s/it, gpt_loss=0.187, loss_mean=0.26][A[A
+[LID Router Debug] Step: 8320
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [9, 5, 0, 3, 1, 9, 9, 3, 4, 5]
+Active Experts in Batch: {0, 1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  29%|██▉       | 1885/6434 [4:25:42<11:26:13,  9.05s/it, gpt_loss=0.291, loss_mean=0.264][A[A
+
+Train step of epoch 1:  29%|██▉       | 1886/6434 [4:25:42<11:05:08,  8.77s/it, gpt_loss=0.291, loss_mean=0.264][A[A
+
+Train step of epoch 1:  29%|██▉       | 1886/6434 [4:25:52<11:05:08,  8.77s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1887/6434 [4:25:52<11:38:12,  9.21s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1887/6434 [4:26:01<11:38:12,  9.21s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  29%|██▉       | 1888/6434 [4:26:01<11:18:15,  8.95s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  29%|██▉       | 1888/6434 [4:26:10<11:18:15,  8.95s/it, gpt_loss=0.246, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1889/6434 [4:26:10<11:18:18,  8.95s/it, gpt_loss=0.246, loss_mean=0.26][A[A
+
+Train step of epoch 1:  29%|██▉       | 1889/6434 [4:26:18<11:18:18,  8.95s/it, gpt_loss=0.325, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1890/6434 [4:26:18<11:12:12,  8.88s/it, gpt_loss=0.325, loss_mean=0.267][A[A
+
+Train step of epoch 1:  29%|██▉       | 1890/6434 [4:26:26<11:12:12,  8.88s/it, gpt_loss=0.22, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1891/6434 [4:26:26<10:47:44,  8.55s/it, gpt_loss=0.22, loss_mean=0.262][A[A
+
+Train step of epoch 1:  29%|██▉       | 1891/6434 [4:26:36<10:47:44,  8.55s/it, gpt_loss=0.352, loss_mean=0.271][A[A
+
+Train step of epoch 1:  29%|██▉       | 1892/6434 [4:26:36<11:10:31,  8.86s/it, gpt_loss=0.352, loss_mean=0.271][A[A
+
+Train step of epoch 1:  29%|██▉       | 1892/6434 [4:26:45<11:10:31,  8.86s/it, gpt_loss=0.265, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  29%|██▉       | 1893/6434 [4:26:45<11:23:18,  9.03s/it, gpt_loss=0.265, loss_mean=0.27][A[A
+
+Train step of epoch 1:  29%|██▉       | 1893/6434 [4:26:53<11:23:18,  9.03s/it, gpt_loss=0.247, loss_mean=0.268][A[A
+
+Train step of epoch 1:  29%|██▉       | 1894/6434 [4:26:53<11:02:46,  8.76s/it, gpt_loss=0.247, loss_mean=0.268][A[A
+
+Train step of epoch 1:  29%|██▉       | 1894/6434 [4:27:01<11:02:46,  8.76s/it, gpt_loss=0.273, loss_mean=0.269][A[A
+
+Train step of epoch 1:  29%|██▉       | 1895/6434 [4:27:01<10:49:24,  8.58s/it, gpt_loss=0.273, loss_mean=0.269][A[A
+[LID Router Debug] Step: 8330
+Batch Size: 10
+Audio Batch Size: 135
+LID Assignments: [4, 2, 5, 3, 9, 3, 3, 3, 0, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  29%|██▉       | 1895/6434 [4:27:10<10:49:24,  8.58s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  29%|██▉       | 1896/6434 [4:27:10<10:55:25,  8.67s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  29%|██▉       | 1896/6434 [4:27:18<10:55:25,  8.67s/it, gpt_loss=0.286, loss_mean=0.266][A[A
+
+Train step of epoch 1:  29%|██▉       | 1897/6434 [4:27:18<10:36:57,  8.42s/it, gpt_loss=0.286, loss_mean=0.266][A[A
+
+Train step of epoch 1:  29%|██▉       | 1897/6434 [4:27:26<10:36:57,  8.42s/it, gpt_loss=0.262, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1898/6434 [4:27:26<10:34:09,  8.39s/it, gpt_loss=0.262, loss_mean=0.265][A[A
+
+Train step of epoch 1:  29%|██▉       | 1898/6434 [4:27:35<10:34:09,  8.39s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1899/6434 [4:27:35<10:48:02,  8.57s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1899/6434 [4:27:43<10:48:02,  8.57s/it, gpt_loss=0.208, loss_mean=0.258][A[A
+
+Train step of epoch 1:  30%|██▉       | 1900/6434 [4:27:43<10:30:06,  8.34s/it, gpt_loss=0.208, loss_mean=0.258][A[A
+
+Train step of epoch 1:  30%|██▉       | 1900/6434 [4:27:52<10:30:06,  8.34s/it, gpt_loss=0.255, loss_mean=0.258][A[A
+
+Train step of epoch 1:  30%|██▉       | 1901/6434 [4:27:52<10:46:38,  8.56s/it, gpt_loss=0.255, loss_mean=0.258][A[A
+
+Train step of epoch 1:  30%|██▉       | 1901/6434 [4:28:02<10:46:38,  8.56s/it, gpt_loss=0.226, loss_mean=0.255][A[A
+
+Train step of epoch 1:  30%|██▉       | 1902/6434 [4:28:02<11:03:38,  8.79s/it, gpt_loss=0.226, loss_mean=0.255][A[A
+
+Train step of epoch 1:  30%|██▉       | 1902/6434 [4:28:10<11:03:38,  8.79s/it, gpt_loss=0.268, loss_mean=0.256][A[A
+
+Train step of epoch 1:  30%|██▉       | 1903/6434 [4:28:10<10:50:02,  8.61s/it, gpt_loss=0.268, loss_mean=0.256][A[A
+
+Train step of epoch 1:  30%|██▉       | 1903/6434 [4:28:18<10:50:02,  8.61s/it, gpt_loss=0.329, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1904/6434 [4:28:18<10:32:07,  8.37s/it, gpt_loss=0.329, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1904/6434 [4:28:26<10:32:07,  8.37s/it, gpt_loss=0.242, loss_mean=0.261][A[A
+
+Train step of epoch 1:  30%|██▉       | 1905/6434 [4:28:26<10:33:04,  8.39s/it, gpt_loss=0.242, loss_mean=0.261][A[A
+[LID Router Debug] Step: 8340
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [5, 9, 2, 1, 9, 1, 2, 2, 3, 9]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  30%|██▉       | 1905/6434 [4:28:34<10:33:04,  8.39s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1906/6434 [4:28:34<10:21:47,  8.24s/it, gpt_loss=0.283, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|██▉       | 1906/6434 [4:28:42<10:21:47,  8.24s/it, gpt_loss=0.37, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1907/6434 [4:28:42<10:19:23,  8.21s/it, gpt_loss=0.37, loss_mean=0.274][A[A
+
+Train step of epoch 1:  30%|██▉       | 1907/6434 [4:28:51<10:19:23,  8.21s/it, gpt_loss=0.345, loss_mean=0.281][A[A
+
+Train step of epoch 1:  30%|██▉       | 1908/6434 [4:28:51<10:31:26,  8.37s/it, gpt_loss=0.345, loss_mean=0.281][A[A
+
+Train step of epoch 1:  30%|██▉       | 1908/6434 [4:28:59<10:31:26,  8.37s/it, gpt_loss=0.346, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|██▉       | 1909/6434 [4:28:59<10:36:14,  8.44s/it, gpt_loss=0.346, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|██▉       | 1909/6434 [4:29:08<10:36:14,  8.44s/it, gpt_loss=0.288, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|██▉       | 1910/6434 [4:29:08<10:48:14,  8.60s/it, gpt_loss=0.288, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|██▉       | 1910/6434 [4:29:17<10:48:14,  8.60s/it, gpt_loss=0.277, loss_mean=0.287][A[A
+
+Train step of epoch 1:  30%|██▉       | 1911/6434 [4:29:17<10:42:51,  8.53s/it, gpt_loss=0.277, loss_mean=0.287][A[A
+
+Train step of epoch 1:  30%|██▉       | 1911/6434 [4:29:25<10:42:51,  8.53s/it, gpt_loss=0.342, loss_mean=0.292][A[A
+
+Train step of epoch 1:  30%|██▉       | 1912/6434 [4:29:25<10:36:20,  8.44s/it, gpt_loss=0.342, loss_mean=0.292][A[A
+
+Train step of epoch 1:  30%|██▉       | 1912/6434 [4:29:33<10:36:20,  8.44s/it, gpt_loss=0.275, loss_mean=0.291][A[A
+
+Train step of epoch 1:  30%|██▉       | 1913/6434 [4:29:33<10:18:32,  8.21s/it, gpt_loss=0.275, loss_mean=0.291][A[A
+
+Train step of epoch 1:  30%|██▉       | 1913/6434 [4:29:40<10:18:32,  8.21s/it, gpt_loss=0.294, loss_mean=0.291][A[A
+
+Train step of epoch 1:  30%|██▉       | 1914/6434 [4:29:40<9:50:21,  7.84s/it, gpt_loss=0.294, loss_mean=0.291] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1914/6434 [4:29:48<9:50:21,  7.84s/it, gpt_loss=0.282, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1915/6434 [4:29:48<10:04:32,  8.03s/it, gpt_loss=0.282, loss_mean=0.29][A[A
+[LID Router Debug] Step: 8350
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [0, 1, 2, 4, 4, 3, 3, 5, 5, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  30%|██▉       | 1915/6434 [4:29:57<10:04:32,  8.03s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  30%|██▉       | 1916/6434 [4:29:57<10:26:24,  8.32s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  30%|██▉       | 1916/6434 [4:30:05<10:26:24,  8.32s/it, gpt_loss=0.298, loss_mean=0.286][A[A
+
+Train step of epoch 1:  30%|██▉       | 1917/6434 [4:30:05<10:18:40,  8.22s/it, gpt_loss=0.298, loss_mean=0.286][A[A
+
+Train step of epoch 1:  30%|██▉       | 1917/6434 [4:30:13<10:18:40,  8.22s/it, gpt_loss=0.293, loss_mean=0.287][A[A
+
+Train step of epoch 1:  30%|██▉       | 1918/6434 [4:30:13<10:15:21,  8.18s/it, gpt_loss=0.293, loss_mean=0.287][A[A
+
+Train step of epoch 1:  30%|██▉       | 1918/6434 [4:30:20<10:15:21,  8.18s/it, gpt_loss=0.387, loss_mean=0.297][A[A
+
+Train step of epoch 1:  30%|██▉       | 1919/6434 [4:30:20<9:55:08,  7.91s/it, gpt_loss=0.387, loss_mean=0.297] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1919/6434 [4:30:29<9:55:08,  7.91s/it, gpt_loss=0.354, loss_mean=0.302][A[A
+
+Train step of epoch 1:  30%|██▉       | 1920/6434 [4:30:29<10:06:36,  8.06s/it, gpt_loss=0.354, loss_mean=0.302][A[A
+
+Train step of epoch 1:  30%|██▉       | 1920/6434 [4:30:37<10:06:36,  8.06s/it, gpt_loss=0.297, loss_mean=0.302][A[A
+
+Train step of epoch 1:  30%|██▉       | 1921/6434 [4:30:37<10:19:28,  8.24s/it, gpt_loss=0.297, loss_mean=0.302][A[A
+
+Train step of epoch 1:  30%|██▉       | 1921/6434 [4:30:46<10:19:28,  8.24s/it, gpt_loss=0.274, loss_mean=0.299][A[A
+
+Train step of epoch 1:  30%|██▉       | 1922/6434 [4:30:46<10:27:12,  8.34s/it, gpt_loss=0.274, loss_mean=0.299][A[A
+
+Train step of epoch 1:  30%|██▉       | 1922/6434 [4:30:54<10:27:12,  8.34s/it, gpt_loss=0.248, loss_mean=0.294][A[A
+
+Train step of epoch 1:  30%|██▉       | 1923/6434 [4:30:54<10:15:04,  8.18s/it, gpt_loss=0.248, loss_mean=0.294][A[A
+
+Train step of epoch 1:  30%|██▉       | 1923/6434 [4:31:02<10:15:04,  8.18s/it, gpt_loss=0.277, loss_mean=0.292][A[A
+
+Train step of epoch 1:  30%|██▉       | 1924/6434 [4:31:02<10:03:56,  8.03s/it, gpt_loss=0.277, loss_mean=0.292][A[A
+
+Train step of epoch 1:  30%|██▉       | 1924/6434 [4:31:10<10:03:56,  8.03s/it, gpt_loss=0.34, loss_mean=0.297] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1925/6434 [4:31:10<10:17:16,  8.21s/it, gpt_loss=0.34, loss_mean=0.297][A[A
+[LID Router Debug] Step: 8360
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [4, 9, 3, 4, 0, 1, 9, 2, 5, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  30%|██▉       | 1925/6434 [4:31:18<10:17:16,  8.21s/it, gpt_loss=0.251, loss_mean=0.293][A[A
+
+Train step of epoch 1:  30%|██▉       | 1926/6434 [4:31:18<10:12:28,  8.15s/it, gpt_loss=0.251, loss_mean=0.293][A[A
+
+Train step of epoch 1:  30%|██▉       | 1926/6434 [4:31:25<10:12:28,  8.15s/it, gpt_loss=0.259, loss_mean=0.289][A[A
+
+Train step of epoch 1:  30%|██▉       | 1927/6434 [4:31:25<9:53:07,  7.90s/it, gpt_loss=0.259, loss_mean=0.289] [A[A
+
+Train step of epoch 1:  30%|██▉       | 1927/6434 [4:31:34<9:53:07,  7.90s/it, gpt_loss=0.3, loss_mean=0.29]   [A[A
+
+Train step of epoch 1:  30%|██▉       | 1928/6434 [4:31:34<10:17:26,  8.22s/it, gpt_loss=0.3, loss_mean=0.29][A[A
+
+Train step of epoch 1:  30%|██▉       | 1928/6434 [4:31:42<10:17:26,  8.22s/it, gpt_loss=0.297, loss_mean=0.291][A[A
+
+Train step of epoch 1:  30%|██▉       | 1929/6434 [4:31:42<10:09:56,  8.12s/it, gpt_loss=0.297, loss_mean=0.291][A[A
+
+Train step of epoch 1:  30%|██▉       | 1929/6434 [4:31:51<10:09:56,  8.12s/it, gpt_loss=0.233, loss_mean=0.285][A[A
+
+Train step of epoch 1:  30%|██▉       | 1930/6434 [4:31:51<10:30:48,  8.40s/it, gpt_loss=0.233, loss_mean=0.285][A[A
+
+Train step of epoch 1:  30%|██▉       | 1930/6434 [4:31:59<10:30:48,  8.40s/it, gpt_loss=0.256, loss_mean=0.282][A[A
+
+Train step of epoch 1:  30%|███       | 1931/6434 [4:31:59<10:12:35,  8.16s/it, gpt_loss=0.256, loss_mean=0.282][A[A
+
+Train step of epoch 1:  30%|███       | 1931/6434 [4:32:08<10:12:35,  8.16s/it, gpt_loss=0.344, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|███       | 1932/6434 [4:32:08<10:28:26,  8.38s/it, gpt_loss=0.344, loss_mean=0.288][A[A
+
+Train step of epoch 1:  30%|███       | 1932/6434 [4:32:16<10:28:26,  8.38s/it, gpt_loss=0.26, loss_mean=0.286] [A[A
+
+Train step of epoch 1:  30%|███       | 1933/6434 [4:32:16<10:22:24,  8.30s/it, gpt_loss=0.26, loss_mean=0.286][A[A
+
+Train step of epoch 1:  30%|███       | 1933/6434 [4:32:24<10:22:24,  8.30s/it, gpt_loss=0.269, loss_mean=0.284][A[A
+
+Train step of epoch 1:  30%|███       | 1934/6434 [4:32:24<10:16:41,  8.22s/it, gpt_loss=0.269, loss_mean=0.284][A[A
+
+Train step of epoch 1:  30%|███       | 1934/6434 [4:32:32<10:16:41,  8.22s/it, gpt_loss=0.263, loss_mean=0.282][A[A
+
+Train step of epoch 1:  30%|███       | 1935/6434 [4:32:32<10:11:09,  8.15s/it, gpt_loss=0.263, loss_mean=0.282][A[A
+[LID Router Debug] Step: 8370
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [6, 4, 2, 5, 0, 9, 5, 3, 3, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  30%|███       | 1935/6434 [4:32:40<10:11:09,  8.15s/it, gpt_loss=0.311, loss_mean=0.285][A[A
+
+Train step of epoch 1:  30%|███       | 1936/6434 [4:32:40<9:56:15,  7.95s/it, gpt_loss=0.311, loss_mean=0.285] [A[A
+
+Train step of epoch 1:  30%|███       | 1936/6434 [4:32:48<9:56:15,  7.95s/it, gpt_loss=0.211, loss_mean=0.277][A[A
+
+Train step of epoch 1:  30%|███       | 1937/6434 [4:32:48<10:19:00,  8.26s/it, gpt_loss=0.211, loss_mean=0.277][A[A
+
+Train step of epoch 1:  30%|███       | 1937/6434 [4:32:58<10:19:00,  8.26s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  30%|███       | 1938/6434 [4:32:58<10:38:48,  8.53s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  30%|███       | 1938/6434 [4:33:07<10:38:48,  8.53s/it, gpt_loss=0.24, loss_mean=0.275] [A[A
+
+Train step of epoch 1:  30%|███       | 1939/6434 [4:33:07<10:48:14,  8.65s/it, gpt_loss=0.24, loss_mean=0.275][A[A
+
+Train step of epoch 1:  30%|███       | 1939/6434 [4:33:15<10:48:14,  8.65s/it, gpt_loss=0.297, loss_mean=0.277][A[A
+
+Train step of epoch 1:  30%|███       | 1940/6434 [4:33:15<10:38:10,  8.52s/it, gpt_loss=0.297, loss_mean=0.277][A[A
+
+Train step of epoch 1:  30%|███       | 1940/6434 [4:33:23<10:38:10,  8.52s/it, gpt_loss=0.209, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  30%|███       | 1941/6434 [4:33:23<10:40:48,  8.56s/it, gpt_loss=0.209, loss_mean=0.27][A[A
+
+Train step of epoch 1:  30%|███       | 1941/6434 [4:33:32<10:40:48,  8.56s/it, gpt_loss=0.367, loss_mean=0.28][A[A
+
+Train step of epoch 1:  30%|███       | 1942/6434 [4:33:32<10:35:41,  8.49s/it, gpt_loss=0.367, loss_mean=0.28][A[A
+
+Train step of epoch 1:  30%|███       | 1942/6434 [4:33:39<10:35:41,  8.49s/it, gpt_loss=0.298, loss_mean=0.282][A[A
+
+Train step of epoch 1:  30%|███       | 1943/6434 [4:33:39<10:12:06,  8.18s/it, gpt_loss=0.298, loss_mean=0.282][A[A
+
+Train step of epoch 1:  30%|███       | 1943/6434 [4:33:48<10:12:06,  8.18s/it, gpt_loss=0.244, loss_mean=0.278][A[A
+
+Train step of epoch 1:  30%|███       | 1944/6434 [4:33:48<10:20:25,  8.29s/it, gpt_loss=0.244, loss_mean=0.278][A[A
+
+Train step of epoch 1:  30%|███       | 1944/6434 [4:33:57<10:20:25,  8.29s/it, gpt_loss=0.29, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  30%|███       | 1945/6434 [4:33:57<10:40:49,  8.57s/it, gpt_loss=0.29, loss_mean=0.279][A[A
+[LID Router Debug] Step: 8380
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [5, 2, 4, 0, 9, 0, 4, 4, 6, 4]
+Active Experts in Batch: {0, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  30%|███       | 1945/6434 [4:34:06<10:40:49,  8.57s/it, gpt_loss=0.235, loss_mean=0.275][A[A
+
+Train step of epoch 1:  30%|███       | 1946/6434 [4:34:06<11:01:01,  8.84s/it, gpt_loss=0.235, loss_mean=0.275][A[A
+
+Train step of epoch 1:  30%|███       | 1946/6434 [4:34:15<11:01:01,  8.84s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  30%|███       | 1947/6434 [4:34:15<11:04:58,  8.89s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  30%|███       | 1947/6434 [4:34:23<11:04:58,  8.89s/it, gpt_loss=0.248, loss_mean=0.271][A[A
+
+Train step of epoch 1:  30%|███       | 1948/6434 [4:34:23<10:34:09,  8.48s/it, gpt_loss=0.248, loss_mean=0.271][A[A
+
+Train step of epoch 1:  30%|███       | 1948/6434 [4:34:31<10:34:09,  8.48s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:  30%|███       | 1949/6434 [4:34:31<10:17:41,  8.26s/it, gpt_loss=0.282, loss_mean=0.272][A[A
+
+Train step of epoch 1:  30%|███       | 1949/6434 [4:34:40<10:17:41,  8.26s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  30%|███       | 1950/6434 [4:34:40<10:31:43,  8.45s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  30%|███       | 1950/6434 [4:34:48<10:31:43,  8.45s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+
+Train step of epoch 1:  30%|███       | 1951/6434 [4:34:48<10:20:43,  8.31s/it, gpt_loss=0.216, loss_mean=0.266][A[A
+
+Train step of epoch 1:  30%|███       | 1951/6434 [4:34:56<10:20:43,  8.31s/it, gpt_loss=0.303, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  30%|███       | 1952/6434 [4:34:56<10:30:01,  8.43s/it, gpt_loss=0.303, loss_mean=0.27][A[A
+
+Train step of epoch 1:  30%|███       | 1952/6434 [4:35:04<10:30:01,  8.43s/it, gpt_loss=0.414, loss_mean=0.284][A[A
+
+Train step of epoch 1:  30%|███       | 1953/6434 [4:35:04<10:03:45,  8.08s/it, gpt_loss=0.414, loss_mean=0.284][A[A
+
+Train step of epoch 1:  30%|███       | 1953/6434 [4:35:11<10:03:45,  8.08s/it, gpt_loss=0.201, loss_mean=0.276][A[A
+
+Train step of epoch 1:  30%|███       | 1954/6434 [4:35:11<9:59:03,  8.02s/it, gpt_loss=0.201, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  30%|███       | 1954/6434 [4:35:22<9:59:03,  8.02s/it, gpt_loss=0.216, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  30%|███       | 1955/6434 [4:35:22<10:51:09,  8.72s/it, gpt_loss=0.216, loss_mean=0.27][A[A
+[LID Router Debug] Step: 8390
+Batch Size: 10
+Audio Batch Size: 85
+LID Assignments: [5, 1, 9, 9, 4, 9, 0, 2, 0, 1]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  30%|███       | 1955/6434 [4:35:31<10:51:09,  8.72s/it, gpt_loss=0.176, loss_mean=0.261][A[A
+
+Train step of epoch 1:  30%|███       | 1956/6434 [4:35:31<11:09:49,  8.97s/it, gpt_loss=0.176, loss_mean=0.261][A[A
+
+Train step of epoch 1:  30%|███       | 1956/6434 [4:35:39<11:09:49,  8.97s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|███       | 1957/6434 [4:35:39<10:47:03,  8.67s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  30%|███       | 1957/6434 [4:35:47<10:47:03,  8.67s/it, gpt_loss=0.215, loss_mean=0.259][A[A
+
+Train step of epoch 1:  30%|███       | 1958/6434 [4:35:47<10:33:53,  8.50s/it, gpt_loss=0.215, loss_mean=0.259][A[A
+
+Train step of epoch 1:  30%|███       | 1958/6434 [4:35:56<10:33:53,  8.50s/it, gpt_loss=0.218, loss_mean=0.255][A[A
+
+Train step of epoch 1:  30%|███       | 1959/6434 [4:35:56<10:24:03,  8.37s/it, gpt_loss=0.218, loss_mean=0.255][A[A
+
+Train step of epoch 1:  30%|███       | 1959/6434 [4:36:03<10:24:03,  8.37s/it, gpt_loss=0.295, loss_mean=0.259][A[A
+
+Train step of epoch 1:  30%|███       | 1960/6434 [4:36:03<10:02:20,  8.08s/it, gpt_loss=0.295, loss_mean=0.259][A[A
+
+Train step of epoch 1:  30%|███       | 1960/6434 [4:36:11<10:02:20,  8.08s/it, gpt_loss=0.291, loss_mean=0.262][A[A
+
+Train step of epoch 1:  30%|███       | 1961/6434 [4:36:11<10:12:13,  8.21s/it, gpt_loss=0.291, loss_mean=0.262][A[A
+
+Train step of epoch 1:  30%|███       | 1961/6434 [4:36:20<10:12:13,  8.21s/it, gpt_loss=0.273, loss_mean=0.263][A[A
+
+Train step of epoch 1:  30%|███       | 1962/6434 [4:36:20<10:13:34,  8.23s/it, gpt_loss=0.273, loss_mean=0.263][A[A
+
+Train step of epoch 1:  30%|███       | 1962/6434 [4:36:29<10:13:34,  8.23s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 1963/6434 [4:36:29<10:37:05,  8.55s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 1963/6434 [4:36:37<10:37:05,  8.55s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:  31%|███       | 1964/6434 [4:36:37<10:29:31,  8.45s/it, gpt_loss=0.282, loss_mean=0.265][A[A
+
+Train step of epoch 1:  31%|███       | 1964/6434 [4:36:45<10:29:31,  8.45s/it, gpt_loss=0.256, loss_mean=0.264][A[A
+
+Train step of epoch 1:  31%|███       | 1965/6434 [4:36:45<10:23:21,  8.37s/it, gpt_loss=0.256, loss_mean=0.264][A[A
+[LID Router Debug] Step: 8400
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 4, 2, 6, 0, 3, 9, 4, 4, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+[2026-02-07 11:38:40,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=0, lr=[1.2198180469053727e-05, 1.2198180469053727e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 11:38:40,964] [INFO] [timer.py:260:stop] epoch=0/micro_step=8400/global_step=4200, RunningAvgSamplesPerSec=4.746018082984515, CurrSamplesPerSec=4.811841687542234, MemAllocated=12.73GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  31%|███       | 1965/6434 [4:36:54<10:23:21,  8.37s/it, gpt_loss=0.286, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███       | 1966/6434 [4:36:54<10:25:38,  8.40s/it, gpt_loss=0.286, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███       | 1966/6434 [4:37:03<10:25:38,  8.40s/it, gpt_loss=0.376, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1967/6434 [4:37:03<10:33:20,  8.51s/it, gpt_loss=0.376, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1967/6434 [4:37:11<10:33:20,  8.51s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1968/6434 [4:37:11<10:20:45,  8.34s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1968/6434 [4:37:18<10:20:45,  8.34s/it, gpt_loss=0.203, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  31%|███       | 1969/6434 [4:37:18<10:02:20,  8.09s/it, gpt_loss=0.203, loss_mean=0.27][A[A
+
+Train step of epoch 1:  31%|███       | 1969/6434 [4:37:27<10:02:20,  8.09s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  31%|███       | 1970/6434 [4:37:27<10:16:15,  8.28s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  31%|███       | 1970/6434 [4:37:36<10:16:15,  8.28s/it, gpt_loss=0.251, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███       | 1971/6434 [4:37:36<10:34:38,  8.53s/it, gpt_loss=0.251, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███       | 1971/6434 [4:37:45<10:34:38,  8.53s/it, gpt_loss=0.28, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  31%|███       | 1972/6434 [4:37:45<10:42:14,  8.64s/it, gpt_loss=0.28, loss_mean=0.268][A[A
+
+Train step of epoch 1:  31%|███       | 1972/6434 [4:37:53<10:42:14,  8.64s/it, gpt_loss=0.337, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███       | 1973/6434 [4:37:53<10:41:20,  8.63s/it, gpt_loss=0.337, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███       | 1973/6434 [4:38:02<10:41:20,  8.63s/it, gpt_loss=0.331, loss_mean=0.281][A[A
+
+Train step of epoch 1:  31%|███       | 1974/6434 [4:38:02<10:39:41,  8.61s/it, gpt_loss=0.331, loss_mean=0.281][A[A
+
+Train step of epoch 1:  31%|███       | 1974/6434 [4:38:10<10:39:41,  8.61s/it, gpt_loss=0.336, loss_mean=0.286][A[A
+
+Train step of epoch 1:  31%|███       | 1975/6434 [4:38:10<10:25:56,  8.42s/it, gpt_loss=0.336, loss_mean=0.286][A[A
+[LID Router Debug] Step: 8410
+Batch Size: 10
+Audio Batch Size: 141
+LID Assignments: [9, 9, 3, 5, 4, 2, 2, 3, 9, 5]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  31%|███       | 1975/6434 [4:38:19<10:25:56,  8.42s/it, gpt_loss=0.231, loss_mean=0.281][A[A
+
+Train step of epoch 1:  31%|███       | 1976/6434 [4:38:19<10:41:04,  8.63s/it, gpt_loss=0.231, loss_mean=0.281][A[A
+
+Train step of epoch 1:  31%|███       | 1976/6434 [4:38:27<10:41:04,  8.63s/it, gpt_loss=0.228, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███       | 1977/6434 [4:38:27<10:32:36,  8.52s/it, gpt_loss=0.228, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███       | 1977/6434 [4:38:36<10:32:36,  8.52s/it, gpt_loss=0.301, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1978/6434 [4:38:36<10:26:06,  8.43s/it, gpt_loss=0.301, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1978/6434 [4:38:44<10:26:06,  8.43s/it, gpt_loss=0.241, loss_mean=0.274][A[A
+
+Train step of epoch 1:  31%|███       | 1979/6434 [4:38:44<10:26:18,  8.44s/it, gpt_loss=0.241, loss_mean=0.274][A[A
+
+Train step of epoch 1:  31%|███       | 1979/6434 [4:38:52<10:26:18,  8.44s/it, gpt_loss=0.297, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███       | 1980/6434 [4:38:52<10:07:05,  8.18s/it, gpt_loss=0.297, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███       | 1980/6434 [4:39:00<10:07:05,  8.18s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1981/6434 [4:39:00<10:18:14,  8.33s/it, gpt_loss=0.287, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1981/6434 [4:39:09<10:18:14,  8.33s/it, gpt_loss=0.257, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███       | 1982/6434 [4:39:09<10:17:40,  8.32s/it, gpt_loss=0.257, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███       | 1982/6434 [4:39:17<10:17:40,  8.32s/it, gpt_loss=0.297, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1983/6434 [4:39:17<10:19:45,  8.35s/it, gpt_loss=0.297, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███       | 1983/6434 [4:39:26<10:19:45,  8.35s/it, gpt_loss=0.25, loss_mean=0.275] [A[A
+
+Train step of epoch 1:  31%|███       | 1984/6434 [4:39:26<10:40:19,  8.63s/it, gpt_loss=0.25, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███       | 1984/6434 [4:39:35<10:40:19,  8.63s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+
+Train step of epoch 1:  31%|███       | 1985/6434 [4:39:35<10:32:50,  8.53s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+[LID Router Debug] Step: 8420
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [0, 9, 0, 1, 5, 5, 1, 2, 0, 1]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:  31%|███       | 1985/6434 [4:39:43<10:32:50,  8.53s/it, gpt_loss=0.212, loss_mean=0.265][A[A
+
+Train step of epoch 1:  31%|███       | 1986/6434 [4:39:43<10:27:57,  8.47s/it, gpt_loss=0.212, loss_mean=0.265][A[A
+
+Train step of epoch 1:  31%|███       | 1986/6434 [4:39:52<10:27:57,  8.47s/it, gpt_loss=0.18, loss_mean=0.257] [A[A
+
+Train step of epoch 1:  31%|███       | 1987/6434 [4:39:52<10:52:09,  8.80s/it, gpt_loss=0.18, loss_mean=0.257][A[A
+
+Train step of epoch 1:  31%|███       | 1987/6434 [4:40:02<10:52:09,  8.80s/it, gpt_loss=0.239, loss_mean=0.255][A[A
+
+Train step of epoch 1:  31%|███       | 1988/6434 [4:40:02<11:08:32,  9.02s/it, gpt_loss=0.239, loss_mean=0.255][A[A
+
+Train step of epoch 1:  31%|███       | 1988/6434 [4:40:10<11:08:32,  9.02s/it, gpt_loss=0.301, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  31%|███       | 1989/6434 [4:40:10<10:44:58,  8.71s/it, gpt_loss=0.301, loss_mean=0.26][A[A
+
+Train step of epoch 1:  31%|███       | 1989/6434 [4:40:19<10:44:58,  8.71s/it, gpt_loss=0.205, loss_mean=0.254][A[A
+
+Train step of epoch 1:  31%|███       | 1990/6434 [4:40:19<10:46:18,  8.73s/it, gpt_loss=0.205, loss_mean=0.254][A[A
+
+Train step of epoch 1:  31%|███       | 1990/6434 [4:40:28<10:46:18,  8.73s/it, gpt_loss=0.289, loss_mean=0.258][A[A
+
+Train step of epoch 1:  31%|███       | 1991/6434 [4:40:28<10:53:54,  8.83s/it, gpt_loss=0.289, loss_mean=0.258][A[A
+
+Train step of epoch 1:  31%|███       | 1991/6434 [4:40:35<10:53:54,  8.83s/it, gpt_loss=0.255, loss_mean=0.257][A[A
+
+Train step of epoch 1:  31%|███       | 1992/6434 [4:40:35<10:17:55,  8.35s/it, gpt_loss=0.255, loss_mean=0.257][A[A
+
+Train step of epoch 1:  31%|███       | 1992/6434 [4:40:45<10:17:55,  8.35s/it, gpt_loss=0.246, loss_mean=0.256][A[A
+
+Train step of epoch 1:  31%|███       | 1993/6434 [4:40:45<11:01:47,  8.94s/it, gpt_loss=0.246, loss_mean=0.256][A[A
+
+Train step of epoch 1:  31%|███       | 1993/6434 [4:40:56<11:01:47,  8.94s/it, gpt_loss=0.204, loss_mean=0.251][A[A
+
+Train step of epoch 1:  31%|███       | 1994/6434 [4:40:56<11:39:41,  9.46s/it, gpt_loss=0.204, loss_mean=0.251][A[A
+
+Train step of epoch 1:  31%|███       | 1994/6434 [4:41:04<11:39:41,  9.46s/it, gpt_loss=0.263, loss_mean=0.252][A[A
+
+Train step of epoch 1:  31%|███       | 1995/6434 [4:41:04<11:09:11,  9.05s/it, gpt_loss=0.263, loss_mean=0.252][A[A
+[LID Router Debug] Step: 8430
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [9, 0, 5, 2, 5, 4, 2, 3, 0, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  31%|███       | 1995/6434 [4:41:12<11:09:11,  9.05s/it, gpt_loss=0.266, loss_mean=0.254][A[A
+
+Train step of epoch 1:  31%|███       | 1996/6434 [4:41:12<10:47:08,  8.75s/it, gpt_loss=0.266, loss_mean=0.254][A[A
+
+Train step of epoch 1:  31%|███       | 1996/6434 [4:41:21<10:47:08,  8.75s/it, gpt_loss=0.28, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  31%|███       | 1997/6434 [4:41:21<10:41:43,  8.68s/it, gpt_loss=0.28, loss_mean=0.256][A[A
+
+Train step of epoch 1:  31%|███       | 1997/6434 [4:41:30<10:41:43,  8.68s/it, gpt_loss=0.321, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 1998/6434 [4:41:30<10:54:22,  8.85s/it, gpt_loss=0.321, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 1998/6434 [4:41:38<10:54:22,  8.85s/it, gpt_loss=0.186, loss_mean=0.255][A[A
+
+Train step of epoch 1:  31%|███       | 1999/6434 [4:41:38<10:30:45,  8.53s/it, gpt_loss=0.186, loss_mean=0.255][A[A
+
+Train step of epoch 1:  31%|███       | 1999/6434 [4:41:45<10:30:45,  8.53s/it, gpt_loss=0.278, loss_mean=0.257][A[A
+
+Train step of epoch 1:  31%|███       | 2000/6434 [4:41:45<10:03:53,  8.17s/it, gpt_loss=0.278, loss_mean=0.257][A[A
+
+Train step of epoch 1:  31%|███       | 2000/6434 [4:41:53<10:03:53,  8.17s/it, gpt_loss=0.205, loss_mean=0.252][A[A
+
+Train step of epoch 1:  31%|███       | 2001/6434 [4:41:53<9:58:25,  8.10s/it, gpt_loss=0.205, loss_mean=0.252] [A[A
+
+Train step of epoch 1:  31%|███       | 2001/6434 [4:42:01<9:58:25,  8.10s/it, gpt_loss=0.321, loss_mean=0.259][A[A
+
+Train step of epoch 1:  31%|███       | 2002/6434 [4:42:01<10:02:59,  8.16s/it, gpt_loss=0.321, loss_mean=0.259][A[A
+
+Train step of epoch 1:  31%|███       | 2002/6434 [4:42:10<10:02:59,  8.16s/it, gpt_loss=0.226, loss_mean=0.256][A[A
+
+Train step of epoch 1:  31%|███       | 2003/6434 [4:42:10<10:07:04,  8.22s/it, gpt_loss=0.226, loss_mean=0.256][A[A
+
+Train step of epoch 1:  31%|███       | 2003/6434 [4:42:18<10:07:04,  8.22s/it, gpt_loss=0.331, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 2004/6434 [4:42:18<10:08:03,  8.24s/it, gpt_loss=0.331, loss_mean=0.263][A[A
+
+Train step of epoch 1:  31%|███       | 2004/6434 [4:42:26<10:08:03,  8.24s/it, gpt_loss=0.301, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███       | 2005/6434 [4:42:26<10:05:25,  8.20s/it, gpt_loss=0.301, loss_mean=0.267][A[A
+[LID Router Debug] Step: 8440
+Batch Size: 10
+Audio Batch Size: 99
+LID Assignments: [2, 4, 4, 0, 5, 2, 5, 3, 2, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  31%|███       | 2005/6434 [4:42:35<10:05:25,  8.20s/it, gpt_loss=0.235, loss_mean=0.264][A[A
+
+Train step of epoch 1:  31%|███       | 2006/6434 [4:42:35<10:27:47,  8.51s/it, gpt_loss=0.235, loss_mean=0.264][A[A
+
+Train step of epoch 1:  31%|███       | 2006/6434 [4:42:43<10:27:47,  8.51s/it, gpt_loss=0.283, loss_mean=0.266][A[A
+
+Train step of epoch 1:  31%|███       | 2007/6434 [4:42:43<10:12:38,  8.30s/it, gpt_loss=0.283, loss_mean=0.266][A[A
+
+Train step of epoch 1:  31%|███       | 2007/6434 [4:42:51<10:12:38,  8.30s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  31%|███       | 2008/6434 [4:42:51<10:06:14,  8.22s/it, gpt_loss=0.251, loss_mean=0.264][A[A
+
+Train step of epoch 1:  31%|███       | 2008/6434 [4:42:59<10:06:14,  8.22s/it, gpt_loss=0.322, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  31%|███       | 2009/6434 [4:42:59<10:05:49,  8.21s/it, gpt_loss=0.322, loss_mean=0.27][A[A
+
+Train step of epoch 1:  31%|███       | 2009/6434 [4:43:07<10:05:49,  8.21s/it, gpt_loss=0.283, loss_mean=0.271][A[A
+
+Train step of epoch 1:  31%|███       | 2010/6434 [4:43:07<9:59:13,  8.13s/it, gpt_loss=0.283, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  31%|███       | 2010/6434 [4:43:15<9:59:13,  8.13s/it, gpt_loss=0.231, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███▏      | 2011/6434 [4:43:15<9:54:25,  8.06s/it, gpt_loss=0.231, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███▏      | 2011/6434 [4:43:23<9:54:25,  8.06s/it, gpt_loss=0.297, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  31%|███▏      | 2012/6434 [4:43:23<9:49:42,  8.00s/it, gpt_loss=0.297, loss_mean=0.27][A[A
+
+Train step of epoch 1:  31%|███▏      | 2012/6434 [4:43:32<9:49:42,  8.00s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:  31%|███▏      | 2013/6434 [4:43:32<10:03:16,  8.19s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:  31%|███▏      | 2013/6434 [4:43:40<10:03:16,  8.19s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  31%|███▏      | 2014/6434 [4:43:40<10:07:28,  8.25s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  31%|███▏      | 2014/6434 [4:43:48<10:07:28,  8.25s/it, gpt_loss=0.331, loss_mean=0.274][A[A
+
+Train step of epoch 1:  31%|███▏      | 2015/6434 [4:43:48<9:54:01,  8.07s/it, gpt_loss=0.331, loss_mean=0.274] [A[A
+[LID Router Debug] Step: 8450
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [3, 9, 5, 9, 3, 9, 2, 4, 2, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  31%|███▏      | 2015/6434 [4:43:56<9:54:01,  8.07s/it, gpt_loss=0.335, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  31%|███▏      | 2016/6434 [4:43:56<10:07:53,  8.26s/it, gpt_loss=0.335, loss_mean=0.28][A[A
+
+Train step of epoch 1:  31%|███▏      | 2016/6434 [4:44:06<10:07:53,  8.26s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███▏      | 2017/6434 [4:44:06<10:28:11,  8.53s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███▏      | 2017/6434 [4:44:14<10:28:11,  8.53s/it, gpt_loss=0.266, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███▏      | 2018/6434 [4:44:14<10:18:09,  8.40s/it, gpt_loss=0.266, loss_mean=0.276][A[A
+
+Train step of epoch 1:  31%|███▏      | 2018/6434 [4:44:22<10:18:09,  8.40s/it, gpt_loss=0.18, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  31%|███▏      | 2019/6434 [4:44:22<10:06:23,  8.24s/it, gpt_loss=0.18, loss_mean=0.267][A[A
+
+Train step of epoch 1:  31%|███▏      | 2019/6434 [4:44:31<10:06:23,  8.24s/it, gpt_loss=0.373, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███▏      | 2020/6434 [4:44:31<10:38:00,  8.67s/it, gpt_loss=0.373, loss_mean=0.277][A[A
+
+Train step of epoch 1:  31%|███▏      | 2020/6434 [4:44:39<10:38:00,  8.67s/it, gpt_loss=0.251, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███▏      | 2021/6434 [4:44:39<10:14:22,  8.35s/it, gpt_loss=0.251, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███▏      | 2021/6434 [4:44:49<10:14:22,  8.35s/it, gpt_loss=0.278, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███▏      | 2022/6434 [4:44:49<11:00:22,  8.98s/it, gpt_loss=0.278, loss_mean=0.275][A[A
+
+Train step of epoch 1:  31%|███▏      | 2022/6434 [4:44:58<11:00:22,  8.98s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:  31%|███▏      | 2023/6434 [4:44:58<10:52:48,  8.88s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:  31%|███▏      | 2023/6434 [4:45:07<10:52:48,  8.88s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███▏      | 2024/6434 [4:45:07<10:47:25,  8.81s/it, gpt_loss=0.268, loss_mean=0.278][A[A
+
+Train step of epoch 1:  31%|███▏      | 2024/6434 [4:45:15<10:47:25,  8.81s/it, gpt_loss=0.204, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  31%|███▏      | 2025/6434 [4:45:15<10:32:58,  8.61s/it, gpt_loss=0.204, loss_mean=0.27][A[A
+[LID Router Debug] Step: 8460
+Batch Size: 10
+Audio Batch Size: 98
+LID Assignments: [5, 2, 1, 3, 4, 0, 4, 2, 6, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  31%|███▏      | 2025/6434 [4:45:22<10:32:58,  8.61s/it, gpt_loss=0.287, loss_mean=0.272][A[A
+
+Train step of epoch 1:  31%|███▏      | 2026/6434 [4:45:22<10:02:50,  8.21s/it, gpt_loss=0.287, loss_mean=0.272][A[A
+
+Train step of epoch 1:  31%|███▏      | 2026/6434 [4:45:31<10:02:50,  8.21s/it, gpt_loss=0.312, loss_mean=0.276][A[A
+
+Train step of epoch 1:  32%|███▏      | 2027/6434 [4:45:31<10:16:14,  8.39s/it, gpt_loss=0.312, loss_mean=0.276][A[A
+
+Train step of epoch 1:  32%|███▏      | 2027/6434 [4:45:39<10:16:14,  8.39s/it, gpt_loss=0.195, loss_mean=0.268][A[A
+
+Train step of epoch 1:  32%|███▏      | 2028/6434 [4:45:39<10:13:02,  8.35s/it, gpt_loss=0.195, loss_mean=0.268][A[A
+
+Train step of epoch 1:  32%|███▏      | 2028/6434 [4:45:47<10:13:02,  8.35s/it, gpt_loss=0.26, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2029/6434 [4:45:47<9:59:48,  8.17s/it, gpt_loss=0.26, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2029/6434 [4:45:55<9:59:48,  8.17s/it, gpt_loss=0.193, loss_mean=0.26][A[A
+
+Train step of epoch 1:  32%|███▏      | 2030/6434 [4:45:55<9:52:21,  8.07s/it, gpt_loss=0.193, loss_mean=0.26][A[A
+
+Train step of epoch 1:  32%|███▏      | 2030/6434 [4:46:03<9:52:21,  8.07s/it, gpt_loss=0.199, loss_mean=0.254][A[A
+
+Train step of epoch 1:  32%|███▏      | 2031/6434 [4:46:03<10:00:51,  8.19s/it, gpt_loss=0.199, loss_mean=0.254][A[A
+
+Train step of epoch 1:  32%|███▏      | 2031/6434 [4:46:12<10:00:51,  8.19s/it, gpt_loss=0.224, loss_mean=0.251][A[A
+
+Train step of epoch 1:  32%|███▏      | 2032/6434 [4:46:12<10:05:43,  8.26s/it, gpt_loss=0.224, loss_mean=0.251][A[A
+
+Train step of epoch 1:  32%|███▏      | 2032/6434 [4:46:20<10:05:43,  8.26s/it, gpt_loss=0.223, loss_mean=0.248][A[A
+
+Train step of epoch 1:  32%|███▏      | 2033/6434 [4:46:20<10:02:18,  8.21s/it, gpt_loss=0.223, loss_mean=0.248][A[A
+
+Train step of epoch 1:  32%|███▏      | 2033/6434 [4:46:26<10:02:18,  8.21s/it, gpt_loss=0.249, loss_mean=0.248][A[A
+
+Train step of epoch 1:  32%|███▏      | 2034/6434 [4:46:26<9:30:55,  7.79s/it, gpt_loss=0.249, loss_mean=0.248] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2034/6434 [4:46:35<9:30:55,  7.79s/it, gpt_loss=0.234, loss_mean=0.247][A[A
+
+Train step of epoch 1:  32%|███▏      | 2035/6434 [4:46:35<9:56:28,  8.14s/it, gpt_loss=0.234, loss_mean=0.247][A[A
+[LID Router Debug] Step: 8470
+Batch Size: 10
+Audio Batch Size: 84
+LID Assignments: [2, 5, 1, 5, 5, 1, 5, 9, 1, 9]
+Active Experts in Batch: {1, 2, 5, 9}
+
+
+Train step of epoch 1:  32%|███▏      | 2035/6434 [4:46:44<9:56:28,  8.14s/it, gpt_loss=0.277, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2036/6434 [4:46:44<10:16:42,  8.41s/it, gpt_loss=0.277, loss_mean=0.25][A[A
+
+Train step of epoch 1:  32%|███▏      | 2036/6434 [4:46:52<10:16:42,  8.41s/it, gpt_loss=0.249, loss_mean=0.25][A[A
+
+Train step of epoch 1:  32%|███▏      | 2037/6434 [4:46:52<9:48:29,  8.03s/it, gpt_loss=0.249, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2037/6434 [4:47:00<9:48:29,  8.03s/it, gpt_loss=0.316, loss_mean=0.256][A[A
+
+Train step of epoch 1:  32%|███▏      | 2038/6434 [4:47:00<9:47:50,  8.02s/it, gpt_loss=0.316, loss_mean=0.256][A[A
+
+Train step of epoch 1:  32%|███▏      | 2038/6434 [4:47:07<9:47:50,  8.02s/it, gpt_loss=0.354, loss_mean=0.266][A[A
+
+Train step of epoch 1:  32%|███▏      | 2039/6434 [4:47:07<9:43:34,  7.97s/it, gpt_loss=0.354, loss_mean=0.266][A[A
+
+Train step of epoch 1:  32%|███▏      | 2039/6434 [4:47:16<9:43:34,  7.97s/it, gpt_loss=0.25, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2040/6434 [4:47:16<9:55:25,  8.13s/it, gpt_loss=0.25, loss_mean=0.264][A[A
+
+Train step of epoch 1:  32%|███▏      | 2040/6434 [4:47:25<9:55:25,  8.13s/it, gpt_loss=0.228, loss_mean=0.261][A[A
+
+Train step of epoch 1:  32%|███▏      | 2041/6434 [4:47:25<10:06:35,  8.28s/it, gpt_loss=0.228, loss_mean=0.261][A[A
+
+Train step of epoch 1:  32%|███▏      | 2041/6434 [4:47:33<10:06:35,  8.28s/it, gpt_loss=0.24, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2042/6434 [4:47:33<9:59:49,  8.19s/it, gpt_loss=0.24, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2042/6434 [4:47:42<9:59:49,  8.19s/it, gpt_loss=0.266, loss_mean=0.259][A[A
+
+Train step of epoch 1:  32%|███▏      | 2043/6434 [4:47:42<10:23:00,  8.51s/it, gpt_loss=0.266, loss_mean=0.259][A[A
+
+Train step of epoch 1:  32%|███▏      | 2043/6434 [4:47:50<10:23:00,  8.51s/it, gpt_loss=0.235, loss_mean=0.257][A[A
+
+Train step of epoch 1:  32%|███▏      | 2044/6434 [4:47:50<10:24:15,  8.53s/it, gpt_loss=0.235, loss_mean=0.257][A[A
+
+Train step of epoch 1:  32%|███▏      | 2044/6434 [4:47:59<10:24:15,  8.53s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+
+Train step of epoch 1:  32%|███▏      | 2045/6434 [4:47:59<10:26:39,  8.57s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+[LID Router Debug] Step: 8480
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [5, 9, 3, 6, 5, 4, 3, 0, 3, 9]
+Active Experts in Batch: {0, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  32%|███▏      | 2045/6434 [4:48:07<10:26:39,  8.57s/it, gpt_loss=0.278, loss_mean=0.261][A[A
+
+Train step of epoch 1:  32%|███▏      | 2046/6434 [4:48:07<10:21:32,  8.50s/it, gpt_loss=0.278, loss_mean=0.261][A[A
+
+Train step of epoch 1:  32%|███▏      | 2046/6434 [4:48:15<10:21:32,  8.50s/it, gpt_loss=0.292, loss_mean=0.264][A[A
+
+Train step of epoch 1:  32%|███▏      | 2047/6434 [4:48:15<10:02:33,  8.24s/it, gpt_loss=0.292, loss_mean=0.264][A[A
+
+Train step of epoch 1:  32%|███▏      | 2047/6434 [4:48:24<10:02:33,  8.24s/it, gpt_loss=0.227, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2048/6434 [4:48:24<10:20:54,  8.49s/it, gpt_loss=0.227, loss_mean=0.26][A[A
+
+Train step of epoch 1:  32%|███▏      | 2048/6434 [4:48:33<10:20:54,  8.49s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  32%|███▏      | 2049/6434 [4:48:33<10:34:46,  8.69s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  32%|███▏      | 2049/6434 [4:48:42<10:34:46,  8.69s/it, gpt_loss=0.259, loss_mean=0.263][A[A
+
+Train step of epoch 1:  32%|███▏      | 2050/6434 [4:48:42<10:28:07,  8.60s/it, gpt_loss=0.259, loss_mean=0.263][A[A
+
+Train step of epoch 1:  32%|███▏      | 2050/6434 [4:48:51<10:28:07,  8.60s/it, gpt_loss=0.258, loss_mean=0.263][A[A
+
+Train step of epoch 1:  32%|███▏      | 2051/6434 [4:48:51<10:34:57,  8.69s/it, gpt_loss=0.258, loss_mean=0.263][A[A
+
+Train step of epoch 1:  32%|███▏      | 2051/6434 [4:49:00<10:34:57,  8.69s/it, gpt_loss=0.329, loss_mean=0.269][A[A
+
+Train step of epoch 1:  32%|███▏      | 2052/6434 [4:49:00<10:46:37,  8.85s/it, gpt_loss=0.329, loss_mean=0.269][A[A
+
+Train step of epoch 1:  32%|███▏      | 2052/6434 [4:49:08<10:46:37,  8.85s/it, gpt_loss=0.193, loss_mean=0.262][A[A
+
+Train step of epoch 1:  32%|███▏      | 2053/6434 [4:49:08<10:35:37,  8.71s/it, gpt_loss=0.193, loss_mean=0.262][A[A
+
+Train step of epoch 1:  32%|███▏      | 2053/6434 [4:49:17<10:35:37,  8.71s/it, gpt_loss=0.317, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2054/6434 [4:49:17<10:47:45,  8.87s/it, gpt_loss=0.317, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2054/6434 [4:49:26<10:47:45,  8.87s/it, gpt_loss=0.225, loss_mean=0.263][A[A
+
+Train step of epoch 1:  32%|███▏      | 2055/6434 [4:49:26<10:48:16,  8.88s/it, gpt_loss=0.225, loss_mean=0.263][A[A
+[LID Router Debug] Step: 8490
+Batch Size: 10
+Audio Batch Size: 102
+LID Assignments: [9, 11, 5, 2, 5, 4, 1, 0, 0, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9, 11}
+
+
+Train step of epoch 1:  32%|███▏      | 2055/6434 [4:49:34<10:48:16,  8.88s/it, gpt_loss=0.337, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2056/6434 [4:49:34<10:29:12,  8.62s/it, gpt_loss=0.337, loss_mean=0.27][A[A
+
+Train step of epoch 1:  32%|███▏      | 2056/6434 [4:49:44<10:29:12,  8.62s/it, gpt_loss=0.347, loss_mean=0.278][A[A
+
+Train step of epoch 1:  32%|███▏      | 2057/6434 [4:49:44<10:55:25,  8.98s/it, gpt_loss=0.347, loss_mean=0.278][A[A
+
+Train step of epoch 1:  32%|███▏      | 2057/6434 [4:49:54<10:55:25,  8.98s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:  32%|███▏      | 2058/6434 [4:49:54<11:14:04,  9.24s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:  32%|███▏      | 2058/6434 [4:50:02<11:14:04,  9.24s/it, gpt_loss=0.346, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2059/6434 [4:50:02<10:51:48,  8.94s/it, gpt_loss=0.346, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2059/6434 [4:50:12<10:51:48,  8.94s/it, gpt_loss=0.286, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2060/6434 [4:50:12<11:13:53,  9.24s/it, gpt_loss=0.286, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2060/6434 [4:50:21<11:13:53,  9.24s/it, gpt_loss=0.419, loss_mean=0.299][A[A
+
+Train step of epoch 1:  32%|███▏      | 2061/6434 [4:50:21<11:09:32,  9.19s/it, gpt_loss=0.419, loss_mean=0.299][A[A
+
+Train step of epoch 1:  32%|███▏      | 2061/6434 [4:50:31<11:09:32,  9.19s/it, gpt_loss=0.304, loss_mean=0.299][A[A
+
+Train step of epoch 1:  32%|███▏      | 2062/6434 [4:50:31<11:14:09,  9.25s/it, gpt_loss=0.304, loss_mean=0.299][A[A
+
+Train step of epoch 1:  32%|███▏      | 2062/6434 [4:50:40<11:14:09,  9.25s/it, gpt_loss=0.216, loss_mean=0.291][A[A
+
+Train step of epoch 1:  32%|███▏      | 2063/6434 [4:50:40<11:10:49,  9.21s/it, gpt_loss=0.216, loss_mean=0.291][A[A
+
+Train step of epoch 1:  32%|███▏      | 2063/6434 [4:50:48<11:10:49,  9.21s/it, gpt_loss=0.297, loss_mean=0.292][A[A
+
+Train step of epoch 1:  32%|███▏      | 2064/6434 [4:50:48<10:49:12,  8.91s/it, gpt_loss=0.297, loss_mean=0.292][A[A
+
+Train step of epoch 1:  32%|███▏      | 2064/6434 [4:50:56<10:49:12,  8.91s/it, gpt_loss=0.279, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2065/6434 [4:50:56<10:32:42,  8.69s/it, gpt_loss=0.279, loss_mean=0.29][A[A
+[LID Router Debug] Step: 8500
+Batch Size: 10
+Audio Batch Size: 108
+LID Assignments: [5, 9, 2, 9, 5, 5, 3, 1, 2, 2]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  32%|███▏      | 2065/6434 [4:51:04<10:32:42,  8.69s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2066/6434 [4:51:04<10:14:30,  8.44s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  32%|███▏      | 2066/6434 [4:51:12<10:14:30,  8.44s/it, gpt_loss=0.26, loss_mean=0.282] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2067/6434 [4:51:12<10:08:43,  8.36s/it, gpt_loss=0.26, loss_mean=0.282][A[A
+
+Train step of epoch 1:  32%|███▏      | 2067/6434 [4:51:20<10:08:43,  8.36s/it, gpt_loss=0.333, loss_mean=0.287][A[A
+
+Train step of epoch 1:  32%|███▏      | 2068/6434 [4:51:20<9:53:07,  8.15s/it, gpt_loss=0.333, loss_mean=0.287] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2068/6434 [4:51:27<9:53:07,  8.15s/it, gpt_loss=0.291, loss_mean=0.288][A[A
+
+Train step of epoch 1:  32%|███▏      | 2069/6434 [4:51:27<9:42:30,  8.01s/it, gpt_loss=0.291, loss_mean=0.288][A[A
+
+Train step of epoch 1:  32%|███▏      | 2069/6434 [4:51:35<9:42:30,  8.01s/it, gpt_loss=0.242, loss_mean=0.283][A[A
+
+Train step of epoch 1:  32%|███▏      | 2070/6434 [4:51:35<9:42:24,  8.01s/it, gpt_loss=0.242, loss_mean=0.283][A[A
+
+Train step of epoch 1:  32%|███▏      | 2070/6434 [4:51:44<9:42:24,  8.01s/it, gpt_loss=0.276, loss_mean=0.283][A[A
+
+Train step of epoch 1:  32%|███▏      | 2071/6434 [4:51:44<9:46:47,  8.07s/it, gpt_loss=0.276, loss_mean=0.283][A[A
+
+Train step of epoch 1:  32%|███▏      | 2071/6434 [4:51:53<9:46:47,  8.07s/it, gpt_loss=0.195, loss_mean=0.274][A[A
+
+Train step of epoch 1:  32%|███▏      | 2072/6434 [4:51:53<10:03:48,  8.31s/it, gpt_loss=0.195, loss_mean=0.274][A[A
+
+Train step of epoch 1:  32%|███▏      | 2072/6434 [4:52:01<10:03:48,  8.31s/it, gpt_loss=0.215, loss_mean=0.268][A[A
+
+Train step of epoch 1:  32%|███▏      | 2073/6434 [4:52:01<10:11:32,  8.41s/it, gpt_loss=0.215, loss_mean=0.268][A[A
+
+Train step of epoch 1:  32%|███▏      | 2073/6434 [4:52:09<10:11:32,  8.41s/it, gpt_loss=0.27, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2074/6434 [4:52:09<10:03:55,  8.31s/it, gpt_loss=0.27, loss_mean=0.268][A[A
+
+Train step of epoch 1:  32%|███▏      | 2074/6434 [4:52:18<10:03:55,  8.31s/it, gpt_loss=0.241, loss_mean=0.265][A[A
+
+Train step of epoch 1:  32%|███▏      | 2075/6434 [4:52:18<10:11:15,  8.41s/it, gpt_loss=0.241, loss_mean=0.265][A[A
+[LID Router Debug] Step: 8510
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [3, 1, 9, 3, 5, 2, 2, 5, 3, 5]
+Active Experts in Batch: {1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  32%|███▏      | 2075/6434 [4:52:27<10:11:15,  8.41s/it, gpt_loss=0.312, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2076/6434 [4:52:27<10:15:03,  8.47s/it, gpt_loss=0.312, loss_mean=0.27][A[A
+
+Train step of epoch 1:  32%|███▏      | 2076/6434 [4:52:35<10:15:03,  8.47s/it, gpt_loss=0.232, loss_mean=0.266][A[A
+
+Train step of epoch 1:  32%|███▏      | 2077/6434 [4:52:35<10:20:11,  8.54s/it, gpt_loss=0.232, loss_mean=0.266][A[A
+
+Train step of epoch 1:  32%|███▏      | 2077/6434 [4:52:44<10:20:11,  8.54s/it, gpt_loss=0.288, loss_mean=0.269][A[A
+
+Train step of epoch 1:  32%|███▏      | 2078/6434 [4:52:44<10:20:18,  8.54s/it, gpt_loss=0.288, loss_mean=0.269][A[A
+
+Train step of epoch 1:  32%|███▏      | 2078/6434 [4:52:52<10:20:18,  8.54s/it, gpt_loss=0.254, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2079/6434 [4:52:52<10:13:31,  8.45s/it, gpt_loss=0.254, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2079/6434 [4:53:01<10:13:31,  8.45s/it, gpt_loss=0.296, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2080/6434 [4:53:01<10:22:04,  8.57s/it, gpt_loss=0.296, loss_mean=0.27][A[A
+
+Train step of epoch 1:  32%|███▏      | 2080/6434 [4:53:11<10:22:04,  8.57s/it, gpt_loss=0.241, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2081/6434 [4:53:11<10:51:03,  8.97s/it, gpt_loss=0.241, loss_mean=0.267][A[A
+
+Train step of epoch 1:  32%|███▏      | 2081/6434 [4:53:20<10:51:03,  8.97s/it, gpt_loss=0.215, loss_mean=0.262][A[A
+
+Train step of epoch 1:  32%|███▏      | 2082/6434 [4:53:20<10:56:13,  9.05s/it, gpt_loss=0.215, loss_mean=0.262][A[A
+
+Train step of epoch 1:  32%|███▏      | 2082/6434 [4:53:28<10:56:13,  9.05s/it, gpt_loss=0.289, loss_mean=0.265][A[A
+
+Train step of epoch 1:  32%|███▏      | 2083/6434 [4:53:28<10:40:03,  8.83s/it, gpt_loss=0.289, loss_mean=0.265][A[A
+
+Train step of epoch 1:  32%|███▏      | 2083/6434 [4:53:37<10:40:03,  8.83s/it, gpt_loss=0.342, loss_mean=0.272][A[A
+
+Train step of epoch 1:  32%|███▏      | 2084/6434 [4:53:37<10:37:10,  8.79s/it, gpt_loss=0.342, loss_mean=0.272][A[A
+
+Train step of epoch 1:  32%|███▏      | 2084/6434 [4:53:45<10:37:10,  8.79s/it, gpt_loss=0.24, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2085/6434 [4:53:45<10:23:34,  8.60s/it, gpt_loss=0.24, loss_mean=0.269][A[A
+[LID Router Debug] Step: 8520
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [2, 10, 4, 0, 5, 5, 3, 2, 4, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 10}
+
+
+Train step of epoch 1:  32%|███▏      | 2085/6434 [4:53:53<10:23:34,  8.60s/it, gpt_loss=0.311, loss_mean=0.273][A[A
+
+Train step of epoch 1:  32%|███▏      | 2086/6434 [4:53:53<10:04:07,  8.34s/it, gpt_loss=0.311, loss_mean=0.273][A[A
+
+Train step of epoch 1:  32%|███▏      | 2086/6434 [4:54:01<10:04:07,  8.34s/it, gpt_loss=0.246, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2087/6434 [4:54:01<10:05:48,  8.36s/it, gpt_loss=0.246, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2087/6434 [4:54:11<10:05:48,  8.36s/it, gpt_loss=0.314, loss_mean=0.275][A[A
+
+Train step of epoch 1:  32%|███▏      | 2088/6434 [4:54:11<10:26:49,  8.65s/it, gpt_loss=0.314, loss_mean=0.275][A[A
+
+Train step of epoch 1:  32%|███▏      | 2088/6434 [4:54:18<10:26:49,  8.65s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2089/6434 [4:54:18<10:02:03,  8.31s/it, gpt_loss=0.238, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2089/6434 [4:54:26<10:02:03,  8.31s/it, gpt_loss=0.286, loss_mean=0.273][A[A
+
+Train step of epoch 1:  32%|███▏      | 2090/6434 [4:54:26<9:43:25,  8.06s/it, gpt_loss=0.286, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  32%|███▏      | 2090/6434 [4:54:34<9:43:25,  8.06s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2091/6434 [4:54:34<9:52:21,  8.18s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  32%|███▏      | 2091/6434 [4:54:43<9:52:21,  8.18s/it, gpt_loss=0.243, loss_mean=0.268][A[A
+
+Train step of epoch 1:  33%|███▎      | 2092/6434 [4:54:43<9:56:54,  8.25s/it, gpt_loss=0.243, loss_mean=0.268][A[A
+
+Train step of epoch 1:  33%|███▎      | 2092/6434 [4:54:51<9:56:54,  8.25s/it, gpt_loss=0.301, loss_mean=0.272][A[A
+
+Train step of epoch 1:  33%|███▎      | 2093/6434 [4:54:51<9:52:11,  8.18s/it, gpt_loss=0.301, loss_mean=0.272][A[A
+
+Train step of epoch 1:  33%|███▎      | 2093/6434 [4:54:58<9:52:11,  8.18s/it, gpt_loss=0.244, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2094/6434 [4:54:58<9:32:36,  7.92s/it, gpt_loss=0.244, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2094/6434 [4:55:06<9:32:36,  7.92s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+
+Train step of epoch 1:  33%|███▎      | 2095/6434 [4:55:06<9:29:27,  7.87s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+[LID Router Debug] Step: 8530
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [0, 3, 2, 9, 5, 2, 4, 0, 2, 1]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2095/6434 [4:55:14<9:29:27,  7.87s/it, gpt_loss=0.312, loss_mean=0.273][A[A
+
+Train step of epoch 1:  33%|███▎      | 2096/6434 [4:55:14<9:39:28,  8.01s/it, gpt_loss=0.312, loss_mean=0.273][A[A
+
+Train step of epoch 1:  33%|███▎      | 2096/6434 [4:55:23<9:39:28,  8.01s/it, gpt_loss=0.255, loss_mean=0.271][A[A
+
+Train step of epoch 1:  33%|███▎      | 2097/6434 [4:55:23<10:00:40,  8.31s/it, gpt_loss=0.255, loss_mean=0.271][A[A
+
+Train step of epoch 1:  33%|███▎      | 2097/6434 [4:55:33<10:00:40,  8.31s/it, gpt_loss=0.268, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2098/6434 [4:55:33<10:34:47,  8.78s/it, gpt_loss=0.268, loss_mean=0.27][A[A
+
+Train step of epoch 1:  33%|███▎      | 2098/6434 [4:55:41<10:34:47,  8.78s/it, gpt_loss=0.24, loss_mean=0.267][A[A
+
+Train step of epoch 1:  33%|███▎      | 2099/6434 [4:55:41<10:13:03,  8.49s/it, gpt_loss=0.24, loss_mean=0.267][A[A
+
+Train step of epoch 1:  33%|███▎      | 2099/6434 [4:55:49<10:13:03,  8.49s/it, gpt_loss=0.325, loss_mean=0.273][A[A
+
+Train step of epoch 1:  33%|███▎      | 2100/6434 [4:55:49<10:18:03,  8.56s/it, gpt_loss=0.325, loss_mean=0.273][A[A
+
+Train step of epoch 1:  33%|███▎      | 2100/6434 [4:55:56<10:18:03,  8.56s/it, gpt_loss=0.177, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2101/6434 [4:55:56<9:43:13,  8.08s/it, gpt_loss=0.177, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2101/6434 [4:56:04<9:43:13,  8.08s/it, gpt_loss=0.272, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2102/6434 [4:56:04<9:43:51,  8.09s/it, gpt_loss=0.272, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2102/6434 [4:56:14<9:43:51,  8.09s/it, gpt_loss=0.227, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2103/6434 [4:56:14<10:23:20,  8.64s/it, gpt_loss=0.227, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2103/6434 [4:56:23<10:23:20,  8.64s/it, gpt_loss=0.254, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2104/6434 [4:56:23<10:16:59,  8.55s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  33%|███▎      | 2104/6434 [4:56:31<10:16:59,  8.55s/it, gpt_loss=0.276, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2105/6434 [4:56:31<10:13:41,  8.51s/it, gpt_loss=0.276, loss_mean=0.262][A[A
+[LID Router Debug] Step: 8540
+Batch Size: 10
+Audio Batch Size: 126
+LID Assignments: [9, 1, 9, 3, 3, 0, 3, 5, 9, 2]
+Active Experts in Batch: {0, 1, 2, 3, 5, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2105/6434 [4:56:40<10:13:41,  8.51s/it, gpt_loss=0.344, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2106/6434 [4:56:40<10:14:10,  8.51s/it, gpt_loss=0.344, loss_mean=0.27][A[A
+
+Train step of epoch 1:  33%|███▎      | 2106/6434 [4:56:49<10:14:10,  8.51s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2107/6434 [4:56:49<10:37:21,  8.84s/it, gpt_loss=0.259, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2107/6434 [4:56:58<10:37:21,  8.84s/it, gpt_loss=0.225, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2108/6434 [4:56:58<10:41:15,  8.89s/it, gpt_loss=0.225, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2108/6434 [4:57:08<10:41:15,  8.89s/it, gpt_loss=0.344, loss_mean=0.272][A[A
+
+Train step of epoch 1:  33%|███▎      | 2109/6434 [4:57:08<10:54:37,  9.08s/it, gpt_loss=0.344, loss_mean=0.272][A[A
+
+Train step of epoch 1:  33%|███▎      | 2109/6434 [4:57:15<10:54:37,  9.08s/it, gpt_loss=0.236, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2110/6434 [4:57:15<10:22:42,  8.64s/it, gpt_loss=0.236, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2110/6434 [4:57:24<10:22:42,  8.64s/it, gpt_loss=0.38, loss_mean=0.28]  [A[A
+
+Train step of epoch 1:  33%|███▎      | 2111/6434 [4:57:24<10:20:36,  8.61s/it, gpt_loss=0.38, loss_mean=0.28][A[A
+
+Train step of epoch 1:  33%|███▎      | 2111/6434 [4:57:32<10:20:36,  8.61s/it, gpt_loss=0.169, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2112/6434 [4:57:32<10:09:14,  8.46s/it, gpt_loss=0.169, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2112/6434 [4:57:39<10:09:14,  8.46s/it, gpt_loss=0.245, loss_mean=0.266][A[A
+
+Train step of epoch 1:  33%|███▎      | 2113/6434 [4:57:39<9:42:20,  8.09s/it, gpt_loss=0.245, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2113/6434 [4:57:47<9:42:20,  8.09s/it, gpt_loss=0.234, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2114/6434 [4:57:47<9:44:34,  8.12s/it, gpt_loss=0.234, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2114/6434 [4:57:56<9:44:34,  8.12s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+
+Train step of epoch 1:  33%|███▎      | 2115/6434 [4:57:56<9:48:18,  8.17s/it, gpt_loss=0.322, loss_mean=0.269][A[A
+[LID Router Debug] Step: 8550
+Batch Size: 10
+Audio Batch Size: 88
+LID Assignments: [0, 0, 6, 4, 2, 4, 4, 9, 0, 6]
+Active Experts in Batch: {0, 2, 4, 6, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2115/6434 [4:58:04<9:48:18,  8.17s/it, gpt_loss=0.206, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2116/6434 [4:58:04<9:42:00,  8.09s/it, gpt_loss=0.206, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2116/6434 [4:58:13<9:42:00,  8.09s/it, gpt_loss=0.187, loss_mean=0.255][A[A
+
+Train step of epoch 1:  33%|███▎      | 2117/6434 [4:58:13<10:01:00,  8.35s/it, gpt_loss=0.187, loss_mean=0.255][A[A
+
+Train step of epoch 1:  33%|███▎      | 2117/6434 [4:58:22<10:01:00,  8.35s/it, gpt_loss=0.228, loss_mean=0.252][A[A
+
+Train step of epoch 1:  33%|███▎      | 2118/6434 [4:58:22<10:17:41,  8.59s/it, gpt_loss=0.228, loss_mean=0.252][A[A
+
+Train step of epoch 1:  33%|███▎      | 2118/6434 [4:58:29<10:17:41,  8.59s/it, gpt_loss=0.28, loss_mean=0.255] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2119/6434 [4:58:29<9:51:04,  8.22s/it, gpt_loss=0.28, loss_mean=0.255] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2119/6434 [4:58:38<9:51:04,  8.22s/it, gpt_loss=0.263, loss_mean=0.256][A[A
+
+Train step of epoch 1:  33%|███▎      | 2120/6434 [4:58:38<10:06:00,  8.43s/it, gpt_loss=0.263, loss_mean=0.256][A[A
+
+Train step of epoch 1:  33%|███▎      | 2120/6434 [4:58:47<10:06:00,  8.43s/it, gpt_loss=0.305, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2121/6434 [4:58:47<10:08:50,  8.47s/it, gpt_loss=0.305, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2121/6434 [4:58:55<10:08:50,  8.47s/it, gpt_loss=0.354, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2122/6434 [4:58:55<9:58:37,  8.33s/it, gpt_loss=0.354, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2122/6434 [4:59:03<9:58:37,  8.33s/it, gpt_loss=0.272, loss_mean=0.27][A[A
+
+Train step of epoch 1:  33%|███▎      | 2123/6434 [4:59:03<10:01:45,  8.38s/it, gpt_loss=0.272, loss_mean=0.27][A[A
+
+Train step of epoch 1:  33%|███▎      | 2123/6434 [4:59:12<10:01:45,  8.38s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2124/6434 [4:59:12<10:02:50,  8.39s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2124/6434 [4:59:20<10:02:50,  8.39s/it, gpt_loss=0.201, loss_mean=0.259][A[A
+
+Train step of epoch 1:  33%|███▎      | 2125/6434 [4:59:20<10:03:54,  8.41s/it, gpt_loss=0.201, loss_mean=0.259][A[A
+[LID Router Debug] Step: 8560
+Batch Size: 10
+Audio Batch Size: 87
+LID Assignments: [4, 9, 0, 0, 4, 5, 2, 1, 3, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2125/6434 [4:59:28<10:03:54,  8.41s/it, gpt_loss=0.265, loss_mean=0.259][A[A
+
+Train step of epoch 1:  33%|███▎      | 2126/6434 [4:59:28<9:49:35,  8.21s/it, gpt_loss=0.265, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2126/6434 [4:59:38<9:49:35,  8.21s/it, gpt_loss=0.308, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2127/6434 [4:59:38<10:27:25,  8.74s/it, gpt_loss=0.308, loss_mean=0.264][A[A
+
+Train step of epoch 1:  33%|███▎      | 2127/6434 [4:59:46<10:27:25,  8.74s/it, gpt_loss=0.249, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2128/6434 [4:59:46<10:15:00,  8.57s/it, gpt_loss=0.249, loss_mean=0.263][A[A
+
+Train step of epoch 1:  33%|███▎      | 2128/6434 [4:59:55<10:15:00,  8.57s/it, gpt_loss=0.26, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2129/6434 [4:59:55<10:21:51,  8.67s/it, gpt_loss=0.26, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2129/6434 [5:00:03<10:21:51,  8.67s/it, gpt_loss=0.21, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2130/6434 [5:00:03<10:15:30,  8.58s/it, gpt_loss=0.21, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2130/6434 [5:00:11<10:15:30,  8.58s/it, gpt_loss=0.204, loss_mean=0.252][A[A
+
+Train step of epoch 1:  33%|███▎      | 2131/6434 [5:00:11<9:52:52,  8.27s/it, gpt_loss=0.204, loss_mean=0.252] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2131/6434 [5:00:19<9:52:52,  8.27s/it, gpt_loss=0.313, loss_mean=0.258][A[A
+
+Train step of epoch 1:  33%|███▎      | 2132/6434 [5:00:19<9:58:57,  8.35s/it, gpt_loss=0.313, loss_mean=0.258][A[A
+
+Train step of epoch 1:  33%|███▎      | 2132/6434 [5:00:27<9:58:57,  8.35s/it, gpt_loss=0.28, loss_mean=0.26]  [A[A
+
+Train step of epoch 1:  33%|███▎      | 2133/6434 [5:00:27<9:52:13,  8.26s/it, gpt_loss=0.28, loss_mean=0.26][A[A
+
+Train step of epoch 1:  33%|███▎      | 2133/6434 [5:00:36<9:52:13,  8.26s/it, gpt_loss=0.229, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2134/6434 [5:00:36<9:54:35,  8.30s/it, gpt_loss=0.229, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2134/6434 [5:00:44<9:54:35,  8.30s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+
+Train step of epoch 1:  33%|███▎      | 2135/6434 [5:00:44<9:58:41,  8.36s/it, gpt_loss=0.275, loss_mean=0.259][A[A
+[LID Router Debug] Step: 8570
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [1, 1, 0, 1, 9, 0, 3, 3, 1, 5]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2135/6434 [5:00:53<9:58:41,  8.36s/it, gpt_loss=0.23, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2136/6434 [5:00:53<10:04:17,  8.44s/it, gpt_loss=0.23, loss_mean=0.256][A[A
+
+Train step of epoch 1:  33%|███▎      | 2136/6434 [5:01:01<10:04:17,  8.44s/it, gpt_loss=0.222, loss_mean=0.253][A[A
+
+Train step of epoch 1:  33%|███▎      | 2137/6434 [5:01:01<10:09:12,  8.51s/it, gpt_loss=0.222, loss_mean=0.253][A[A
+
+Train step of epoch 1:  33%|███▎      | 2137/6434 [5:01:10<10:09:12,  8.51s/it, gpt_loss=0.338, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2138/6434 [5:01:10<10:16:17,  8.61s/it, gpt_loss=0.338, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2138/6434 [5:01:18<10:16:17,  8.61s/it, gpt_loss=0.249, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2139/6434 [5:01:18<9:57:51,  8.35s/it, gpt_loss=0.249, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2139/6434 [5:01:26<9:57:51,  8.35s/it, gpt_loss=0.321, loss_mean=0.266][A[A
+
+Train step of epoch 1:  33%|███▎      | 2140/6434 [5:01:26<9:57:22,  8.35s/it, gpt_loss=0.321, loss_mean=0.266][A[A
+
+Train step of epoch 1:  33%|███▎      | 2140/6434 [5:01:34<9:57:22,  8.35s/it, gpt_loss=0.225, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2141/6434 [5:01:34<9:32:51,  8.01s/it, gpt_loss=0.225, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2141/6434 [5:01:41<9:32:51,  8.01s/it, gpt_loss=0.235, loss_mean=0.259][A[A
+
+Train step of epoch 1:  33%|███▎      | 2142/6434 [5:01:41<9:13:56,  7.74s/it, gpt_loss=0.235, loss_mean=0.259][A[A
+
+Train step of epoch 1:  33%|███▎      | 2142/6434 [5:01:51<9:13:56,  7.74s/it, gpt_loss=0.216, loss_mean=0.255][A[A
+
+Train step of epoch 1:  33%|███▎      | 2143/6434 [5:01:51<9:58:03,  8.36s/it, gpt_loss=0.216, loss_mean=0.255][A[A
+
+Train step of epoch 1:  33%|███▎      | 2143/6434 [5:01:59<9:58:03,  8.36s/it, gpt_loss=0.356, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2144/6434 [5:01:59<9:54:27,  8.31s/it, gpt_loss=0.356, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2144/6434 [5:02:06<9:54:27,  8.31s/it, gpt_loss=0.23, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2145/6434 [5:02:06<9:32:54,  8.01s/it, gpt_loss=0.23, loss_mean=0.261][A[A
+[LID Router Debug] Step: 8580
+Batch Size: 10
+Audio Batch Size: 85
+LID Assignments: [1, 5, 0, 4, 2, 1, 9, 4, 9, 2]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2145/6434 [5:02:14<9:32:54,  8.01s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2146/6434 [5:02:14<9:25:45,  7.92s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  33%|███▎      | 2146/6434 [5:02:22<9:25:45,  7.92s/it, gpt_loss=0.213, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2147/6434 [5:02:22<9:33:42,  8.03s/it, gpt_loss=0.213, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2147/6434 [5:02:30<9:33:42,  8.03s/it, gpt_loss=0.22, loss_mean=0.254] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2148/6434 [5:02:30<9:31:18,  8.00s/it, gpt_loss=0.22, loss_mean=0.254][A[A
+
+Train step of epoch 1:  33%|███▎      | 2148/6434 [5:02:37<9:31:18,  8.00s/it, gpt_loss=0.289, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2149/6434 [5:02:37<9:21:23,  7.86s/it, gpt_loss=0.289, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2149/6434 [5:02:45<9:21:23,  7.86s/it, gpt_loss=0.26, loss_mean=0.257] [A[A
+
+Train step of epoch 1:  33%|███▎      | 2150/6434 [5:02:45<9:15:24,  7.78s/it, gpt_loss=0.26, loss_mean=0.257][A[A
+
+Train step of epoch 1:  33%|███▎      | 2150/6434 [5:02:53<9:15:24,  7.78s/it, gpt_loss=0.293, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2151/6434 [5:02:53<9:26:45,  7.94s/it, gpt_loss=0.293, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2151/6434 [5:03:02<9:26:45,  7.94s/it, gpt_loss=0.296, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2152/6434 [5:03:02<9:38:25,  8.10s/it, gpt_loss=0.296, loss_mean=0.265][A[A
+
+Train step of epoch 1:  33%|███▎      | 2152/6434 [5:03:11<9:38:25,  8.10s/it, gpt_loss=0.293, loss_mean=0.267][A[A
+
+Train step of epoch 1:  33%|███▎      | 2153/6434 [5:03:11<9:56:08,  8.36s/it, gpt_loss=0.293, loss_mean=0.267][A[A
+
+Train step of epoch 1:  33%|███▎      | 2153/6434 [5:03:19<9:56:08,  8.36s/it, gpt_loss=0.208, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2154/6434 [5:03:19<9:54:50,  8.34s/it, gpt_loss=0.208, loss_mean=0.261][A[A
+
+Train step of epoch 1:  33%|███▎      | 2154/6434 [5:03:29<9:54:50,  8.34s/it, gpt_loss=0.323, loss_mean=0.268][A[A
+
+Train step of epoch 1:  33%|███▎      | 2155/6434 [5:03:29<10:21:53,  8.72s/it, gpt_loss=0.323, loss_mean=0.268][A[A
+[LID Router Debug] Step: 8590
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [2, 1, 1, 9, 2, 0, 0, 5, 5, 9]
+Active Experts in Batch: {0, 1, 2, 5, 9}
+
+
+Train step of epoch 1:  33%|███▎      | 2155/6434 [5:03:37<10:21:53,  8.72s/it, gpt_loss=0.263, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▎      | 2156/6434 [5:03:37<10:08:30,  8.53s/it, gpt_loss=0.263, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▎      | 2156/6434 [5:03:44<10:08:30,  8.53s/it, gpt_loss=0.283, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▎      | 2157/6434 [5:03:44<9:45:52,  8.22s/it, gpt_loss=0.283, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  34%|███▎      | 2157/6434 [5:03:53<9:45:52,  8.22s/it, gpt_loss=0.263, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2158/6434 [5:03:53<9:56:15,  8.37s/it, gpt_loss=0.263, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2158/6434 [5:04:00<9:56:15,  8.37s/it, gpt_loss=0.243, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▎      | 2159/6434 [5:04:00<9:32:59,  8.04s/it, gpt_loss=0.243, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▎      | 2159/6434 [5:04:09<9:32:59,  8.04s/it, gpt_loss=0.318, loss_mean=0.271][A[A
+
+Train step of epoch 1:  34%|███▎      | 2160/6434 [5:04:09<9:43:59,  8.20s/it, gpt_loss=0.318, loss_mean=0.271][A[A
+
+Train step of epoch 1:  34%|███▎      | 2160/6434 [5:04:17<9:43:59,  8.20s/it, gpt_loss=0.241, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2161/6434 [5:04:17<9:43:40,  8.20s/it, gpt_loss=0.241, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2161/6434 [5:04:24<9:43:40,  8.20s/it, gpt_loss=0.278, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▎      | 2162/6434 [5:04:24<9:23:59,  7.92s/it, gpt_loss=0.278, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▎      | 2162/6434 [5:04:32<9:23:59,  7.92s/it, gpt_loss=0.209, loss_mean=0.263][A[A
+
+Train step of epoch 1:  34%|███▎      | 2163/6434 [5:04:32<9:11:03,  7.74s/it, gpt_loss=0.209, loss_mean=0.263][A[A
+
+Train step of epoch 1:  34%|███▎      | 2163/6434 [5:04:40<9:11:03,  7.74s/it, gpt_loss=0.268, loss_mean=0.264][A[A
+
+Train step of epoch 1:  34%|███▎      | 2164/6434 [5:04:40<9:23:55,  7.92s/it, gpt_loss=0.268, loss_mean=0.264][A[A
+
+Train step of epoch 1:  34%|███▎      | 2164/6434 [5:04:48<9:23:55,  7.92s/it, gpt_loss=0.283, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▎      | 2165/6434 [5:04:48<9:33:21,  8.06s/it, gpt_loss=0.283, loss_mean=0.266][A[A
+[LID Router Debug] Step: 8600
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [1, 9, 9, 5, 9, 4, 1, 3, 2, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+[2026-02-07 12:06:43,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=0, lr=[1.1876267890827186e-05, 1.1876267890827186e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 12:06:43,872] [INFO] [timer.py:260:stop] epoch=0/micro_step=8600/global_step=4300, RunningAvgSamplesPerSec=4.746396671828146, CurrSamplesPerSec=4.775270038228993, MemAllocated=12.74GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  34%|███▎      | 2165/6434 [5:04:57<9:33:21,  8.06s/it, gpt_loss=0.343, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▎      | 2166/6434 [5:04:57<9:40:45,  8.16s/it, gpt_loss=0.343, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▎      | 2166/6434 [5:05:06<9:40:45,  8.16s/it, gpt_loss=0.216, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2167/6434 [5:05:06<10:00:48,  8.45s/it, gpt_loss=0.216, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▎      | 2167/6434 [5:05:15<10:00:48,  8.45s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  34%|███▎      | 2168/6434 [5:05:15<10:14:15,  8.64s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  34%|███▎      | 2168/6434 [5:05:22<10:14:15,  8.64s/it, gpt_loss=0.21, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  34%|███▎      | 2169/6434 [5:05:22<9:39:13,  8.15s/it, gpt_loss=0.21, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  34%|███▎      | 2169/6434 [5:05:30<9:39:13,  8.15s/it, gpt_loss=0.244, loss_mean=0.258][A[A
+
+Train step of epoch 1:  34%|███▎      | 2170/6434 [5:05:30<9:40:22,  8.17s/it, gpt_loss=0.244, loss_mean=0.258][A[A
+
+Train step of epoch 1:  34%|███▎      | 2170/6434 [5:05:39<9:40:22,  8.17s/it, gpt_loss=0.26, loss_mean=0.258] [A[A
+
+Train step of epoch 1:  34%|███▎      | 2171/6434 [5:05:39<9:47:50,  8.27s/it, gpt_loss=0.26, loss_mean=0.258][A[A
+
+Train step of epoch 1:  34%|███▎      | 2171/6434 [5:05:47<9:47:50,  8.27s/it, gpt_loss=0.291, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2172/6434 [5:05:47<9:47:06,  8.27s/it, gpt_loss=0.291, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2172/6434 [5:05:56<9:47:06,  8.27s/it, gpt_loss=0.259, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2173/6434 [5:05:56<10:04:26,  8.51s/it, gpt_loss=0.259, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2173/6434 [5:06:04<10:04:26,  8.51s/it, gpt_loss=0.27, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2174/6434 [5:06:04<9:59:59,  8.45s/it, gpt_loss=0.27, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2174/6434 [5:06:13<9:59:59,  8.45s/it, gpt_loss=0.214, loss_mean=0.257][A[A
+
+Train step of epoch 1:  34%|███▍      | 2175/6434 [5:06:13<9:57:17,  8.41s/it, gpt_loss=0.214, loss_mean=0.257][A[A
+[LID Router Debug] Step: 8610
+Batch Size: 10
+Audio Batch Size: 149
+LID Assignments: [5, 0, 4, 3, 9, 2, 6, 9, 0, 3]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  34%|███▍      | 2175/6434 [5:06:22<9:57:17,  8.41s/it, gpt_loss=0.234, loss_mean=0.255][A[A
+
+Train step of epoch 1:  34%|███▍      | 2176/6434 [5:06:22<10:09:25,  8.59s/it, gpt_loss=0.234, loss_mean=0.255][A[A
+
+Train step of epoch 1:  34%|███▍      | 2176/6434 [5:06:29<10:09:25,  8.59s/it, gpt_loss=0.248, loss_mean=0.254][A[A
+
+Train step of epoch 1:  34%|███▍      | 2177/6434 [5:06:29<9:46:13,  8.26s/it, gpt_loss=0.248, loss_mean=0.254] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2177/6434 [5:06:37<9:46:13,  8.26s/it, gpt_loss=0.318, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2178/6434 [5:06:37<9:42:12,  8.21s/it, gpt_loss=0.318, loss_mean=0.26][A[A
+
+Train step of epoch 1:  34%|███▍      | 2178/6434 [5:06:45<9:42:12,  8.21s/it, gpt_loss=0.253, loss_mean=0.26][A[A
+
+Train step of epoch 1:  34%|███▍      | 2179/6434 [5:06:45<9:35:45,  8.12s/it, gpt_loss=0.253, loss_mean=0.26][A[A
+
+Train step of epoch 1:  34%|███▍      | 2179/6434 [5:06:54<9:35:45,  8.12s/it, gpt_loss=0.35, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▍      | 2180/6434 [5:06:54<9:47:34,  8.29s/it, gpt_loss=0.35, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▍      | 2180/6434 [5:07:02<9:47:34,  8.29s/it, gpt_loss=0.239, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2181/6434 [5:07:02<9:48:51,  8.31s/it, gpt_loss=0.239, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2181/6434 [5:07:11<9:48:51,  8.31s/it, gpt_loss=0.311, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2182/6434 [5:07:11<9:53:58,  8.38s/it, gpt_loss=0.311, loss_mean=0.27][A[A
+
+Train step of epoch 1:  34%|███▍      | 2182/6434 [5:07:20<9:53:58,  8.38s/it, gpt_loss=0.294, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▍      | 2183/6434 [5:07:20<10:11:41,  8.63s/it, gpt_loss=0.294, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▍      | 2183/6434 [5:07:29<10:11:41,  8.63s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  34%|███▍      | 2184/6434 [5:07:29<10:13:58,  8.67s/it, gpt_loss=0.254, loss_mean=0.271][A[A
+
+Train step of epoch 1:  34%|███▍      | 2184/6434 [5:07:37<10:13:58,  8.67s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▍      | 2185/6434 [5:07:37<10:07:24,  8.58s/it, gpt_loss=0.244, loss_mean=0.268][A[A
+[LID Router Debug] Step: 8620
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [0, 1, 3, 4, 4, 2, 3, 3, 9, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  34%|███▍      | 2185/6434 [5:07:47<10:07:24,  8.58s/it, gpt_loss=0.249, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2186/6434 [5:07:47<10:42:43,  9.08s/it, gpt_loss=0.249, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2186/6434 [5:07:57<10:42:43,  9.08s/it, gpt_loss=0.286, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▍      | 2187/6434 [5:07:57<10:48:03,  9.16s/it, gpt_loss=0.286, loss_mean=0.268][A[A
+
+Train step of epoch 1:  34%|███▍      | 2187/6434 [5:08:05<10:48:03,  9.16s/it, gpt_loss=0.256, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▍      | 2188/6434 [5:08:05<10:35:22,  8.98s/it, gpt_loss=0.256, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▍      | 2188/6434 [5:08:15<10:35:22,  8.98s/it, gpt_loss=0.241, loss_mean=0.264][A[A
+
+Train step of epoch 1:  34%|███▍      | 2189/6434 [5:08:15<10:49:18,  9.18s/it, gpt_loss=0.241, loss_mean=0.264][A[A
+
+Train step of epoch 1:  34%|███▍      | 2189/6434 [5:08:23<10:49:18,  9.18s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  34%|███▍      | 2190/6434 [5:08:23<10:19:18,  8.76s/it, gpt_loss=0.256, loss_mean=0.263][A[A
+
+Train step of epoch 1:  34%|███▍      | 2190/6434 [5:08:32<10:19:18,  8.76s/it, gpt_loss=0.248, loss_mean=0.262][A[A
+
+Train step of epoch 1:  34%|███▍      | 2191/6434 [5:08:32<10:29:06,  8.90s/it, gpt_loss=0.248, loss_mean=0.262][A[A
+
+Train step of epoch 1:  34%|███▍      | 2191/6434 [5:08:39<10:29:06,  8.90s/it, gpt_loss=0.235, loss_mean=0.259][A[A
+
+Train step of epoch 1:  34%|███▍      | 2192/6434 [5:08:39<9:57:05,  8.45s/it, gpt_loss=0.235, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2192/6434 [5:08:47<9:57:05,  8.45s/it, gpt_loss=0.325, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2193/6434 [5:08:47<9:43:02,  8.25s/it, gpt_loss=0.325, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2193/6434 [5:08:55<9:43:02,  8.25s/it, gpt_loss=0.267, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2194/6434 [5:08:55<9:45:38,  8.29s/it, gpt_loss=0.267, loss_mean=0.266][A[A
+
+Train step of epoch 1:  34%|███▍      | 2194/6434 [5:09:05<9:45:38,  8.29s/it, gpt_loss=0.234, loss_mean=0.263][A[A
+
+Train step of epoch 1:  34%|███▍      | 2195/6434 [5:09:05<10:12:42,  8.67s/it, gpt_loss=0.234, loss_mean=0.263][A[A
+[LID Router Debug] Step: 8630
+Batch Size: 10
+Audio Batch Size: 81
+LID Assignments: [1, 1, 2, 4, 4, 4, 2, 5, 2, 6]
+Active Experts in Batch: {1, 2, 4, 5, 6}
+
+
+Train step of epoch 1:  34%|███▍      | 2195/6434 [5:09:15<10:12:42,  8.67s/it, gpt_loss=0.33, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2196/6434 [5:09:15<10:35:35,  9.00s/it, gpt_loss=0.33, loss_mean=0.269][A[A
+
+Train step of epoch 1:  34%|███▍      | 2196/6434 [5:09:23<10:35:35,  9.00s/it, gpt_loss=0.353, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2197/6434 [5:09:23<10:15:12,  8.71s/it, gpt_loss=0.353, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2197/6434 [5:09:32<10:15:12,  8.71s/it, gpt_loss=0.307, loss_mean=0.281][A[A
+
+Train step of epoch 1:  34%|███▍      | 2198/6434 [5:09:32<10:23:25,  8.83s/it, gpt_loss=0.307, loss_mean=0.281][A[A
+
+Train step of epoch 1:  34%|███▍      | 2198/6434 [5:09:42<10:23:25,  8.83s/it, gpt_loss=0.252, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2199/6434 [5:09:42<10:49:51,  9.21s/it, gpt_loss=0.252, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2199/6434 [5:09:50<10:49:51,  9.21s/it, gpt_loss=0.277, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2200/6434 [5:09:50<10:33:52,  8.98s/it, gpt_loss=0.277, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2200/6434 [5:09:59<10:33:52,  8.98s/it, gpt_loss=0.272, loss_mean=0.277][A[A
+
+Train step of epoch 1:  34%|███▍      | 2201/6434 [5:09:59<10:13:54,  8.70s/it, gpt_loss=0.272, loss_mean=0.277][A[A
+
+Train step of epoch 1:  34%|███▍      | 2201/6434 [5:10:08<10:13:54,  8.70s/it, gpt_loss=0.332, loss_mean=0.283][A[A
+
+Train step of epoch 1:  34%|███▍      | 2202/6434 [5:10:08<10:23:24,  8.84s/it, gpt_loss=0.332, loss_mean=0.283][A[A
+
+Train step of epoch 1:  34%|███▍      | 2202/6434 [5:10:16<10:23:24,  8.84s/it, gpt_loss=0.298, loss_mean=0.284][A[A
+
+Train step of epoch 1:  34%|███▍      | 2203/6434 [5:10:16<10:05:00,  8.58s/it, gpt_loss=0.298, loss_mean=0.284][A[A
+
+Train step of epoch 1:  34%|███▍      | 2203/6434 [5:10:24<10:05:00,  8.58s/it, gpt_loss=0.354, loss_mean=0.291][A[A
+
+Train step of epoch 1:  34%|███▍      | 2204/6434 [5:10:24<9:53:37,  8.42s/it, gpt_loss=0.354, loss_mean=0.291] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2204/6434 [5:10:32<9:53:37,  8.42s/it, gpt_loss=0.335, loss_mean=0.296][A[A
+
+Train step of epoch 1:  34%|███▍      | 2205/6434 [5:10:32<9:58:33,  8.49s/it, gpt_loss=0.335, loss_mean=0.296][A[A
+[LID Router Debug] Step: 8640
+Batch Size: 10
+Audio Batch Size: 117
+LID Assignments: [4, 8, 0, 0, 9, 3, 6, 9, 2, 2]
+Active Experts in Batch: {0, 2, 3, 4, 6, 8, 9}
+
+
+Train step of epoch 1:  34%|███▍      | 2205/6434 [5:10:41<9:58:33,  8.49s/it, gpt_loss=0.227, loss_mean=0.289][A[A
+
+Train step of epoch 1:  34%|███▍      | 2206/6434 [5:10:41<10:02:05,  8.54s/it, gpt_loss=0.227, loss_mean=0.289][A[A
+
+Train step of epoch 1:  34%|███▍      | 2206/6434 [5:10:50<10:02:05,  8.54s/it, gpt_loss=0.225, loss_mean=0.282][A[A
+
+Train step of epoch 1:  34%|███▍      | 2207/6434 [5:10:50<10:12:10,  8.69s/it, gpt_loss=0.225, loss_mean=0.282][A[A
+
+Train step of epoch 1:  34%|███▍      | 2207/6434 [5:10:59<10:12:10,  8.69s/it, gpt_loss=0.296, loss_mean=0.284][A[A
+
+Train step of epoch 1:  34%|███▍      | 2208/6434 [5:10:59<10:10:51,  8.67s/it, gpt_loss=0.296, loss_mean=0.284][A[A
+
+Train step of epoch 1:  34%|███▍      | 2208/6434 [5:11:07<10:10:51,  8.67s/it, gpt_loss=0.278, loss_mean=0.283][A[A
+
+Train step of epoch 1:  34%|███▍      | 2209/6434 [5:11:07<10:10:51,  8.67s/it, gpt_loss=0.278, loss_mean=0.283][A[A
+
+Train step of epoch 1:  34%|███▍      | 2209/6434 [5:11:16<10:10:51,  8.67s/it, gpt_loss=0.254, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2210/6434 [5:11:16<10:07:31,  8.63s/it, gpt_loss=0.254, loss_mean=0.28][A[A
+
+Train step of epoch 1:  34%|███▍      | 2210/6434 [5:11:25<10:07:31,  8.63s/it, gpt_loss=0.326, loss_mean=0.285][A[A
+
+Train step of epoch 1:  34%|███▍      | 2211/6434 [5:11:25<10:20:06,  8.81s/it, gpt_loss=0.326, loss_mean=0.285][A[A
+
+Train step of epoch 1:  34%|███▍      | 2211/6434 [5:11:34<10:20:06,  8.81s/it, gpt_loss=0.217, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2212/6434 [5:11:34<10:19:46,  8.81s/it, gpt_loss=0.217, loss_mean=0.278][A[A
+
+Train step of epoch 1:  34%|███▍      | 2212/6434 [5:11:43<10:19:46,  8.81s/it, gpt_loss=0.225, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▍      | 2213/6434 [5:11:43<10:31:59,  8.98s/it, gpt_loss=0.225, loss_mean=0.273][A[A
+
+Train step of epoch 1:  34%|███▍      | 2213/6434 [5:11:52<10:31:59,  8.98s/it, gpt_loss=0.319, loss_mean=0.277][A[A
+
+Train step of epoch 1:  34%|███▍      | 2214/6434 [5:11:52<10:15:25,  8.75s/it, gpt_loss=0.319, loss_mean=0.277][A[A
+
+Train step of epoch 1:  34%|███▍      | 2214/6434 [5:12:00<10:15:25,  8.75s/it, gpt_loss=0.26, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2215/6434 [5:12:00<10:02:36,  8.57s/it, gpt_loss=0.26, loss_mean=0.276][A[A
+[LID Router Debug] Step: 8650
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [2, 9, 1, 9, 6, 9, 1, 2, 5, 4]
+Active Experts in Batch: {1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  34%|███▍      | 2215/6434 [5:12:07<10:02:36,  8.57s/it, gpt_loss=0.259, loss_mean=0.274][A[A
+
+Train step of epoch 1:  34%|███▍      | 2216/6434 [5:12:07<9:42:51,  8.29s/it, gpt_loss=0.259, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2216/6434 [5:12:16<9:42:51,  8.29s/it, gpt_loss=0.201, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▍      | 2217/6434 [5:12:16<9:43:22,  8.30s/it, gpt_loss=0.201, loss_mean=0.267][A[A
+
+Train step of epoch 1:  34%|███▍      | 2217/6434 [5:12:24<9:43:22,  8.30s/it, gpt_loss=0.202, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  34%|███▍      | 2218/6434 [5:12:24<9:54:40,  8.46s/it, gpt_loss=0.202, loss_mean=0.26][A[A
+
+Train step of epoch 1:  34%|███▍      | 2218/6434 [5:12:33<9:54:40,  8.46s/it, gpt_loss=0.268, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2219/6434 [5:12:33<10:02:13,  8.57s/it, gpt_loss=0.268, loss_mean=0.261][A[A
+
+Train step of epoch 1:  34%|███▍      | 2219/6434 [5:12:43<10:02:13,  8.57s/it, gpt_loss=0.257, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▍      | 2220/6434 [5:12:43<10:19:55,  8.83s/it, gpt_loss=0.257, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▍      | 2220/6434 [5:12:51<10:19:55,  8.83s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  35%|███▍      | 2221/6434 [5:12:51<10:16:12,  8.78s/it, gpt_loss=0.295, loss_mean=0.264][A[A
+
+Train step of epoch 1:  35%|███▍      | 2221/6434 [5:13:01<10:16:12,  8.78s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  35%|███▍      | 2222/6434 [5:13:01<10:24:52,  8.90s/it, gpt_loss=0.276, loss_mean=0.265][A[A
+
+Train step of epoch 1:  35%|███▍      | 2222/6434 [5:13:09<10:24:52,  8.90s/it, gpt_loss=0.278, loss_mean=0.266][A[A
+
+Train step of epoch 1:  35%|███▍      | 2223/6434 [5:13:09<10:13:50,  8.75s/it, gpt_loss=0.278, loss_mean=0.266][A[A
+
+Train step of epoch 1:  35%|███▍      | 2223/6434 [5:13:17<10:13:50,  8.75s/it, gpt_loss=0.26, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2224/6434 [5:13:17<10:00:32,  8.56s/it, gpt_loss=0.26, loss_mean=0.266][A[A
+
+Train step of epoch 1:  35%|███▍      | 2224/6434 [5:13:25<10:00:32,  8.56s/it, gpt_loss=0.269, loss_mean=0.266][A[A
+
+Train step of epoch 1:  35%|███▍      | 2225/6434 [5:13:25<9:46:53,  8.37s/it, gpt_loss=0.269, loss_mean=0.266] [A[A
+[LID Router Debug] Step: 8660
+Batch Size: 10
+Audio Batch Size: 112
+LID Assignments: [1, 9, 4, 4, 3, 1, 0, 9, 0, 6]
+Active Experts in Batch: {0, 1, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  35%|███▍      | 2225/6434 [5:13:34<9:46:53,  8.37s/it, gpt_loss=0.315, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▍      | 2226/6434 [5:13:34<10:10:14,  8.70s/it, gpt_loss=0.315, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▍      | 2226/6434 [5:13:43<10:10:14,  8.70s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  35%|███▍      | 2227/6434 [5:13:43<10:06:50,  8.65s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  35%|███▍      | 2227/6434 [5:13:51<10:06:50,  8.65s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:  35%|███▍      | 2228/6434 [5:13:51<9:50:53,  8.43s/it, gpt_loss=0.248, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2228/6434 [5:13:59<9:50:53,  8.43s/it, gpt_loss=0.254, loss_mean=0.263][A[A
+
+Train step of epoch 1:  35%|███▍      | 2229/6434 [5:13:59<9:32:29,  8.17s/it, gpt_loss=0.254, loss_mean=0.263][A[A
+
+Train step of epoch 1:  35%|███▍      | 2229/6434 [5:14:07<9:32:29,  8.17s/it, gpt_loss=0.428, loss_mean=0.279][A[A
+
+Train step of epoch 1:  35%|███▍      | 2230/6434 [5:14:07<9:44:48,  8.35s/it, gpt_loss=0.428, loss_mean=0.279][A[A
+
+Train step of epoch 1:  35%|███▍      | 2230/6434 [5:14:16<9:44:48,  8.35s/it, gpt_loss=0.289, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2231/6434 [5:14:16<9:48:09,  8.40s/it, gpt_loss=0.289, loss_mean=0.28][A[A
+
+Train step of epoch 1:  35%|███▍      | 2231/6434 [5:14:25<9:48:09,  8.40s/it, gpt_loss=0.254, loss_mean=0.278][A[A
+
+Train step of epoch 1:  35%|███▍      | 2232/6434 [5:14:25<10:10:38,  8.72s/it, gpt_loss=0.254, loss_mean=0.278][A[A
+
+Train step of epoch 1:  35%|███▍      | 2232/6434 [5:14:33<10:10:38,  8.72s/it, gpt_loss=0.332, loss_mean=0.283][A[A
+
+Train step of epoch 1:  35%|███▍      | 2233/6434 [5:14:33<9:50:05,  8.43s/it, gpt_loss=0.332, loss_mean=0.283] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2233/6434 [5:14:41<9:50:05,  8.43s/it, gpt_loss=0.313, loss_mean=0.286][A[A
+
+Train step of epoch 1:  35%|███▍      | 2234/6434 [5:14:41<9:48:08,  8.40s/it, gpt_loss=0.313, loss_mean=0.286][A[A
+
+Train step of epoch 1:  35%|███▍      | 2234/6434 [5:14:49<9:48:08,  8.40s/it, gpt_loss=0.366, loss_mean=0.294][A[A
+
+Train step of epoch 1:  35%|███▍      | 2235/6434 [5:14:49<9:31:53,  8.17s/it, gpt_loss=0.366, loss_mean=0.294][A[A
+[LID Router Debug] Step: 8670
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [1, 5, 9, 2, 2, 3, 1, 5, 6, 3]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  35%|███▍      | 2235/6434 [5:14:58<9:31:53,  8.17s/it, gpt_loss=0.304, loss_mean=0.295][A[A
+
+Train step of epoch 1:  35%|███▍      | 2236/6434 [5:14:58<9:43:36,  8.34s/it, gpt_loss=0.304, loss_mean=0.295][A[A
+
+Train step of epoch 1:  35%|███▍      | 2236/6434 [5:15:07<9:43:36,  8.34s/it, gpt_loss=0.265, loss_mean=0.292][A[A
+
+Train step of epoch 1:  35%|███▍      | 2237/6434 [5:15:07<10:01:37,  8.60s/it, gpt_loss=0.265, loss_mean=0.292][A[A
+
+Train step of epoch 1:  35%|███▍      | 2237/6434 [5:15:15<10:01:37,  8.60s/it, gpt_loss=0.246, loss_mean=0.287][A[A
+
+Train step of epoch 1:  35%|███▍      | 2238/6434 [5:15:15<9:50:50,  8.45s/it, gpt_loss=0.246, loss_mean=0.287] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2238/6434 [5:15:23<9:50:50,  8.45s/it, gpt_loss=0.28, loss_mean=0.287] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2239/6434 [5:15:23<9:33:01,  8.20s/it, gpt_loss=0.28, loss_mean=0.287][A[A
+
+Train step of epoch 1:  35%|███▍      | 2239/6434 [5:15:30<9:33:01,  8.20s/it, gpt_loss=0.249, loss_mean=0.283][A[A
+
+Train step of epoch 1:  35%|███▍      | 2240/6434 [5:15:30<9:21:35,  8.03s/it, gpt_loss=0.249, loss_mean=0.283][A[A
+
+Train step of epoch 1:  35%|███▍      | 2240/6434 [5:15:39<9:21:35,  8.03s/it, gpt_loss=0.261, loss_mean=0.281][A[A
+
+Train step of epoch 1:  35%|███▍      | 2241/6434 [5:15:39<9:26:01,  8.10s/it, gpt_loss=0.261, loss_mean=0.281][A[A
+
+Train step of epoch 1:  35%|███▍      | 2241/6434 [5:15:48<9:26:01,  8.10s/it, gpt_loss=0.26, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2242/6434 [5:15:48<9:53:20,  8.49s/it, gpt_loss=0.26, loss_mean=0.279][A[A
+
+Train step of epoch 1:  35%|███▍      | 2242/6434 [5:15:57<9:53:20,  8.49s/it, gpt_loss=0.294, loss_mean=0.28][A[A
+
+Train step of epoch 1:  35%|███▍      | 2243/6434 [5:15:57<10:02:27,  8.63s/it, gpt_loss=0.294, loss_mean=0.28][A[A
+
+Train step of epoch 1:  35%|███▍      | 2243/6434 [5:16:05<10:02:27,  8.63s/it, gpt_loss=0.264, loss_mean=0.279][A[A
+
+Train step of epoch 1:  35%|███▍      | 2244/6434 [5:16:05<9:50:52,  8.46s/it, gpt_loss=0.264, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2244/6434 [5:16:15<9:50:52,  8.46s/it, gpt_loss=0.279, loss_mean=0.279][A[A
+
+Train step of epoch 1:  35%|███▍      | 2245/6434 [5:16:15<10:14:11,  8.80s/it, gpt_loss=0.279, loss_mean=0.279][A[A
+[LID Router Debug] Step: 8680
+Batch Size: 10
+Audio Batch Size: 85
+LID Assignments: [1, 5, 1, 2, 4, 1, 1, 10, 4, 1]
+Active Experts in Batch: {1, 2, 4, 5, 10}
+
+
+Train step of epoch 1:  35%|███▍      | 2245/6434 [5:16:23<10:14:11,  8.80s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  35%|███▍      | 2246/6434 [5:16:23<10:05:12,  8.67s/it, gpt_loss=0.256, loss_mean=0.276][A[A
+
+Train step of epoch 1:  35%|███▍      | 2246/6434 [5:16:31<10:05:12,  8.67s/it, gpt_loss=0.271, loss_mean=0.276][A[A
+
+Train step of epoch 1:  35%|███▍      | 2247/6434 [5:16:31<9:50:02,  8.46s/it, gpt_loss=0.271, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2247/6434 [5:16:39<9:50:02,  8.46s/it, gpt_loss=0.224, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▍      | 2248/6434 [5:16:39<9:50:32,  8.46s/it, gpt_loss=0.224, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▍      | 2248/6434 [5:16:48<9:50:32,  8.46s/it, gpt_loss=0.205, loss_mean=0.264][A[A
+
+Train step of epoch 1:  35%|███▍      | 2249/6434 [5:16:48<9:48:05,  8.43s/it, gpt_loss=0.205, loss_mean=0.264][A[A
+
+Train step of epoch 1:  35%|███▍      | 2249/6434 [5:16:56<9:48:05,  8.43s/it, gpt_loss=0.23, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  35%|███▍      | 2250/6434 [5:16:56<9:48:40,  8.44s/it, gpt_loss=0.23, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▍      | 2250/6434 [5:17:03<9:48:40,  8.44s/it, gpt_loss=0.245, loss_mean=0.259][A[A
+
+Train step of epoch 1:  35%|███▍      | 2251/6434 [5:17:03<9:20:58,  8.05s/it, gpt_loss=0.245, loss_mean=0.259][A[A
+
+Train step of epoch 1:  35%|███▍      | 2251/6434 [5:17:13<9:20:58,  8.05s/it, gpt_loss=0.253, loss_mean=0.258][A[A
+
+Train step of epoch 1:  35%|███▌      | 2252/6434 [5:17:13<9:47:43,  8.43s/it, gpt_loss=0.253, loss_mean=0.258][A[A
+
+Train step of epoch 1:  35%|███▌      | 2252/6434 [5:17:21<9:47:43,  8.43s/it, gpt_loss=0.264, loss_mean=0.259][A[A
+
+Train step of epoch 1:  35%|███▌      | 2253/6434 [5:17:21<9:45:47,  8.41s/it, gpt_loss=0.264, loss_mean=0.259][A[A
+
+Train step of epoch 1:  35%|███▌      | 2253/6434 [5:17:30<9:45:47,  8.41s/it, gpt_loss=0.211, loss_mean=0.254][A[A
+
+Train step of epoch 1:  35%|███▌      | 2254/6434 [5:17:30<9:56:47,  8.57s/it, gpt_loss=0.211, loss_mean=0.254][A[A
+
+Train step of epoch 1:  35%|███▌      | 2254/6434 [5:17:37<9:56:47,  8.57s/it, gpt_loss=0.267, loss_mean=0.255][A[A
+
+Train step of epoch 1:  35%|███▌      | 2255/6434 [5:17:37<9:31:06,  8.20s/it, gpt_loss=0.267, loss_mean=0.255][A[A
+[LID Router Debug] Step: 8690
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [9, 3, 5, 4, 2, 4, 9, 0, 1, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  35%|███▌      | 2255/6434 [5:17:45<9:31:06,  8.20s/it, gpt_loss=0.269, loss_mean=0.257][A[A
+
+Train step of epoch 1:  35%|███▌      | 2256/6434 [5:17:45<9:22:43,  8.08s/it, gpt_loss=0.269, loss_mean=0.257][A[A
+
+Train step of epoch 1:  35%|███▌      | 2256/6434 [5:17:53<9:22:43,  8.08s/it, gpt_loss=0.227, loss_mean=0.254][A[A
+
+Train step of epoch 1:  35%|███▌      | 2257/6434 [5:17:53<9:22:30,  8.08s/it, gpt_loss=0.227, loss_mean=0.254][A[A
+
+Train step of epoch 1:  35%|███▌      | 2257/6434 [5:18:01<9:22:30,  8.08s/it, gpt_loss=0.291, loss_mean=0.258][A[A
+
+Train step of epoch 1:  35%|███▌      | 2258/6434 [5:18:01<9:18:52,  8.03s/it, gpt_loss=0.291, loss_mean=0.258][A[A
+
+Train step of epoch 1:  35%|███▌      | 2258/6434 [5:18:10<9:18:52,  8.03s/it, gpt_loss=0.296, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▌      | 2259/6434 [5:18:10<9:33:54,  8.25s/it, gpt_loss=0.296, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▌      | 2259/6434 [5:18:19<9:33:54,  8.25s/it, gpt_loss=0.203, loss_mean=0.256][A[A
+
+Train step of epoch 1:  35%|███▌      | 2260/6434 [5:18:19<9:59:53,  8.62s/it, gpt_loss=0.203, loss_mean=0.256][A[A
+
+Train step of epoch 1:  35%|███▌      | 2260/6434 [5:18:28<9:59:53,  8.62s/it, gpt_loss=0.213, loss_mean=0.251][A[A
+
+Train step of epoch 1:  35%|███▌      | 2261/6434 [5:18:28<10:04:15,  8.69s/it, gpt_loss=0.213, loss_mean=0.251][A[A
+
+Train step of epoch 1:  35%|███▌      | 2261/6434 [5:18:36<10:04:15,  8.69s/it, gpt_loss=0.18, loss_mean=0.244] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2262/6434 [5:18:36<9:52:34,  8.52s/it, gpt_loss=0.18, loss_mean=0.244] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2262/6434 [5:18:44<9:52:34,  8.52s/it, gpt_loss=0.3, loss_mean=0.25]  [A[A
+
+Train step of epoch 1:  35%|███▌      | 2263/6434 [5:18:44<9:37:38,  8.31s/it, gpt_loss=0.3, loss_mean=0.25][A[A
+
+Train step of epoch 1:  35%|███▌      | 2263/6434 [5:18:52<9:37:38,  8.31s/it, gpt_loss=0.202, loss_mean=0.245][A[A
+
+Train step of epoch 1:  35%|███▌      | 2264/6434 [5:18:52<9:33:27,  8.25s/it, gpt_loss=0.202, loss_mean=0.245][A[A
+
+Train step of epoch 1:  35%|███▌      | 2264/6434 [5:19:01<9:33:27,  8.25s/it, gpt_loss=0.28, loss_mean=0.248] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2265/6434 [5:19:01<9:37:22,  8.31s/it, gpt_loss=0.28, loss_mean=0.248][A[A
+[LID Router Debug] Step: 8700
+Batch Size: 10
+Audio Batch Size: 129
+LID Assignments: [3, 3, 4, 9, 4, 9, 0, 5, 3, 9]
+Active Experts in Batch: {0, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  35%|███▌      | 2265/6434 [5:19:10<9:37:22,  8.31s/it, gpt_loss=0.239, loss_mean=0.248][A[A
+
+Train step of epoch 1:  35%|███▌      | 2266/6434 [5:19:10<9:49:22,  8.48s/it, gpt_loss=0.239, loss_mean=0.248][A[A
+
+Train step of epoch 1:  35%|███▌      | 2266/6434 [5:19:19<9:49:22,  8.48s/it, gpt_loss=0.298, loss_mean=0.253][A[A
+
+Train step of epoch 1:  35%|███▌      | 2267/6434 [5:19:19<10:06:15,  8.73s/it, gpt_loss=0.298, loss_mean=0.253][A[A
+
+Train step of epoch 1:  35%|███▌      | 2267/6434 [5:19:27<10:06:15,  8.73s/it, gpt_loss=0.248, loss_mean=0.252][A[A
+
+Train step of epoch 1:  35%|███▌      | 2268/6434 [5:19:27<9:56:57,  8.60s/it, gpt_loss=0.248, loss_mean=0.252] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2268/6434 [5:19:36<9:56:57,  8.60s/it, gpt_loss=0.227, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2269/6434 [5:19:36<9:54:40,  8.57s/it, gpt_loss=0.227, loss_mean=0.25][A[A
+
+Train step of epoch 1:  35%|███▌      | 2269/6434 [5:19:44<9:54:40,  8.57s/it, gpt_loss=0.327, loss_mean=0.257][A[A
+
+Train step of epoch 1:  35%|███▌      | 2270/6434 [5:19:44<9:55:05,  8.57s/it, gpt_loss=0.327, loss_mean=0.257][A[A
+
+Train step of epoch 1:  35%|███▌      | 2270/6434 [5:19:52<9:55:05,  8.57s/it, gpt_loss=0.299, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▌      | 2271/6434 [5:19:52<9:34:44,  8.28s/it, gpt_loss=0.299, loss_mean=0.261][A[A
+
+Train step of epoch 1:  35%|███▌      | 2271/6434 [5:20:01<9:34:44,  8.28s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  35%|███▌      | 2272/6434 [5:20:01<9:53:20,  8.55s/it, gpt_loss=0.268, loss_mean=0.262][A[A
+
+Train step of epoch 1:  35%|███▌      | 2272/6434 [5:20:09<9:53:20,  8.55s/it, gpt_loss=0.308, loss_mean=0.267][A[A
+
+Train step of epoch 1:  35%|███▌      | 2273/6434 [5:20:09<9:31:10,  8.24s/it, gpt_loss=0.308, loss_mean=0.267][A[A
+
+Train step of epoch 1:  35%|███▌      | 2273/6434 [5:20:17<9:31:10,  8.24s/it, gpt_loss=0.307, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▌      | 2274/6434 [5:20:17<9:40:23,  8.37s/it, gpt_loss=0.307, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▌      | 2274/6434 [5:20:28<9:40:23,  8.37s/it, gpt_loss=0.295, loss_mean=0.273][A[A
+
+Train step of epoch 1:  35%|███▌      | 2275/6434 [5:20:28<10:21:46,  8.97s/it, gpt_loss=0.295, loss_mean=0.273][A[A
+[LID Router Debug] Step: 8710
+Batch Size: 10
+Audio Batch Size: 89
+LID Assignments: [4, 2, 6, 3, 1, 5, 5, 9, 5, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  35%|███▌      | 2275/6434 [5:20:35<10:21:46,  8.97s/it, gpt_loss=0.282, loss_mean=0.274][A[A
+
+Train step of epoch 1:  35%|███▌      | 2276/6434 [5:20:35<9:48:50,  8.50s/it, gpt_loss=0.282, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2276/6434 [5:20:43<9:48:50,  8.50s/it, gpt_loss=0.233, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2277/6434 [5:20:43<9:44:15,  8.43s/it, gpt_loss=0.233, loss_mean=0.27][A[A
+
+Train step of epoch 1:  35%|███▌      | 2277/6434 [5:20:53<9:44:15,  8.43s/it, gpt_loss=0.38, loss_mean=0.281][A[A
+
+Train step of epoch 1:  35%|███▌      | 2278/6434 [5:20:53<10:02:57,  8.70s/it, gpt_loss=0.38, loss_mean=0.281][A[A
+
+Train step of epoch 1:  35%|███▌      | 2278/6434 [5:21:01<10:02:57,  8.70s/it, gpt_loss=0.247, loss_mean=0.278][A[A
+
+Train step of epoch 1:  35%|███▌      | 2279/6434 [5:21:01<9:46:42,  8.47s/it, gpt_loss=0.247, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2279/6434 [5:21:09<9:46:42,  8.47s/it, gpt_loss=0.194, loss_mean=0.269][A[A
+
+Train step of epoch 1:  35%|███▌      | 2280/6434 [5:21:09<9:47:51,  8.49s/it, gpt_loss=0.194, loss_mean=0.269][A[A
+
+Train step of epoch 1:  35%|███▌      | 2280/6434 [5:21:18<9:47:51,  8.49s/it, gpt_loss=0.211, loss_mean=0.263][A[A
+
+Train step of epoch 1:  35%|███▌      | 2281/6434 [5:21:18<10:00:51,  8.68s/it, gpt_loss=0.211, loss_mean=0.263][A[A
+
+Train step of epoch 1:  35%|███▌      | 2281/6434 [5:21:26<10:00:51,  8.68s/it, gpt_loss=0.335, loss_mean=0.271][A[A
+
+Train step of epoch 1:  35%|███▌      | 2282/6434 [5:21:26<9:42:16,  8.41s/it, gpt_loss=0.335, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  35%|███▌      | 2282/6434 [5:21:35<9:42:16,  8.41s/it, gpt_loss=0.281, loss_mean=0.272][A[A
+
+Train step of epoch 1:  35%|███▌      | 2283/6434 [5:21:35<9:49:29,  8.52s/it, gpt_loss=0.281, loss_mean=0.272][A[A
+
+Train step of epoch 1:  35%|███▌      | 2283/6434 [5:21:43<9:49:29,  8.52s/it, gpt_loss=0.392, loss_mean=0.284][A[A
+
+Train step of epoch 1:  35%|███▌      | 2284/6434 [5:21:43<9:51:25,  8.55s/it, gpt_loss=0.392, loss_mean=0.284][A[A
+
+Train step of epoch 1:  35%|███▌      | 2284/6434 [5:21:52<9:51:25,  8.55s/it, gpt_loss=0.192, loss_mean=0.274][A[A
+
+Train step of epoch 1:  36%|███▌      | 2285/6434 [5:21:52<9:44:22,  8.45s/it, gpt_loss=0.192, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8720
+Batch Size: 10
+Audio Batch Size: 128
+LID Assignments: [3, 3, 9, 4, 0, 1, 0, 5, 3, 2]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  36%|███▌      | 2285/6434 [5:22:00<9:44:22,  8.45s/it, gpt_loss=0.205, loss_mean=0.268][A[A
+
+Train step of epoch 1:  36%|███▌      | 2286/6434 [5:22:00<9:44:07,  8.45s/it, gpt_loss=0.205, loss_mean=0.268][A[A
+
+Train step of epoch 1:  36%|███▌      | 2286/6434 [5:22:09<9:44:07,  8.45s/it, gpt_loss=0.23, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2287/6434 [5:22:09<9:56:33,  8.63s/it, gpt_loss=0.23, loss_mean=0.264][A[A
+
+Train step of epoch 1:  36%|███▌      | 2287/6434 [5:22:16<9:56:33,  8.63s/it, gpt_loss=0.241, loss_mean=0.261][A[A
+
+Train step of epoch 1:  36%|███▌      | 2288/6434 [5:22:16<9:25:03,  8.18s/it, gpt_loss=0.241, loss_mean=0.261][A[A
+
+Train step of epoch 1:  36%|███▌      | 2288/6434 [5:22:24<9:25:03,  8.18s/it, gpt_loss=0.34, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2289/6434 [5:22:24<9:19:23,  8.10s/it, gpt_loss=0.34, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2289/6434 [5:22:33<9:19:23,  8.10s/it, gpt_loss=0.277, loss_mean=0.27][A[A
+
+Train step of epoch 1:  36%|███▌      | 2290/6434 [5:22:33<9:35:19,  8.33s/it, gpt_loss=0.277, loss_mean=0.27][A[A
+
+Train step of epoch 1:  36%|███▌      | 2290/6434 [5:22:41<9:35:19,  8.33s/it, gpt_loss=0.243, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2291/6434 [5:22:41<9:37:51,  8.37s/it, gpt_loss=0.243, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2291/6434 [5:22:50<9:37:51,  8.37s/it, gpt_loss=0.285, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2292/6434 [5:22:50<9:45:01,  8.47s/it, gpt_loss=0.285, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2292/6434 [5:23:00<9:45:01,  8.47s/it, gpt_loss=0.304, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2293/6434 [5:23:00<10:07:55,  8.81s/it, gpt_loss=0.304, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2293/6434 [5:23:08<10:07:55,  8.81s/it, gpt_loss=0.313, loss_mean=0.277][A[A
+
+Train step of epoch 1:  36%|███▌      | 2294/6434 [5:23:08<10:03:58,  8.75s/it, gpt_loss=0.313, loss_mean=0.277][A[A
+
+Train step of epoch 1:  36%|███▌      | 2294/6434 [5:23:17<10:03:58,  8.75s/it, gpt_loss=0.241, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2295/6434 [5:23:17<10:01:15,  8.72s/it, gpt_loss=0.241, loss_mean=0.273][A[A
+[LID Router Debug] Step: 8730
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [2, 4, 9, 0, 9, 0, 2, 3, 4, 5]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  36%|███▌      | 2295/6434 [5:23:25<10:01:15,  8.72s/it, gpt_loss=0.276, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2296/6434 [5:23:25<9:51:42,  8.58s/it, gpt_loss=0.276, loss_mean=0.273] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2296/6434 [5:23:33<9:51:42,  8.58s/it, gpt_loss=0.249, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▌      | 2297/6434 [5:23:33<9:40:56,  8.43s/it, gpt_loss=0.249, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▌      | 2297/6434 [5:23:42<9:40:56,  8.43s/it, gpt_loss=0.225, loss_mean=0.266][A[A
+
+Train step of epoch 1:  36%|███▌      | 2298/6434 [5:23:42<9:45:45,  8.50s/it, gpt_loss=0.225, loss_mean=0.266][A[A
+
+Train step of epoch 1:  36%|███▌      | 2298/6434 [5:23:50<9:45:45,  8.50s/it, gpt_loss=0.232, loss_mean=0.263][A[A
+
+Train step of epoch 1:  36%|███▌      | 2299/6434 [5:23:50<9:44:29,  8.48s/it, gpt_loss=0.232, loss_mean=0.263][A[A
+
+Train step of epoch 1:  36%|███▌      | 2299/6434 [5:23:59<9:44:29,  8.48s/it, gpt_loss=0.263, loss_mean=0.263][A[A
+
+Train step of epoch 1:  36%|███▌      | 2300/6434 [5:23:59<9:36:36,  8.37s/it, gpt_loss=0.263, loss_mean=0.263][A[A
+
+Train step of epoch 1:  36%|███▌      | 2300/6434 [5:24:07<9:36:36,  8.37s/it, gpt_loss=0.324, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2301/6434 [5:24:07<9:41:26,  8.44s/it, gpt_loss=0.324, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2301/6434 [5:24:16<9:41:26,  8.44s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▌      | 2302/6434 [5:24:16<9:52:43,  8.61s/it, gpt_loss=0.286, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▌      | 2302/6434 [5:24:24<9:52:43,  8.61s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2303/6434 [5:24:24<9:38:25,  8.40s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2303/6434 [5:24:33<9:38:25,  8.40s/it, gpt_loss=0.266, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2304/6434 [5:24:33<9:42:59,  8.47s/it, gpt_loss=0.266, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2304/6434 [5:24:40<9:42:59,  8.47s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+
+Train step of epoch 1:  36%|███▌      | 2305/6434 [5:24:40<9:16:48,  8.09s/it, gpt_loss=0.268, loss_mean=0.272][A[A
+[LID Router Debug] Step: 8740
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [2, 3, 2, 1, 9, 9, 9, 2, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 9}
+
+
+Train step of epoch 1:  36%|███▌      | 2305/6434 [5:24:48<9:16:48,  8.09s/it, gpt_loss=0.225, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2306/6434 [5:24:48<9:23:54,  8.20s/it, gpt_loss=0.225, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2306/6434 [5:24:56<9:23:54,  8.20s/it, gpt_loss=0.183, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2307/6434 [5:24:56<9:22:43,  8.18s/it, gpt_loss=0.183, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2307/6434 [5:25:05<9:22:43,  8.18s/it, gpt_loss=0.256, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2308/6434 [5:25:05<9:26:57,  8.24s/it, gpt_loss=0.256, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2308/6434 [5:25:13<9:26:57,  8.24s/it, gpt_loss=0.231, loss_mean=0.256][A[A
+
+Train step of epoch 1:  36%|███▌      | 2309/6434 [5:25:13<9:19:07,  8.13s/it, gpt_loss=0.231, loss_mean=0.256][A[A
+
+Train step of epoch 1:  36%|███▌      | 2309/6434 [5:25:21<9:19:07,  8.13s/it, gpt_loss=0.203, loss_mean=0.251][A[A
+
+Train step of epoch 1:  36%|███▌      | 2310/6434 [5:25:21<9:28:30,  8.27s/it, gpt_loss=0.203, loss_mean=0.251][A[A
+
+Train step of epoch 1:  36%|███▌      | 2310/6434 [5:25:29<9:28:30,  8.27s/it, gpt_loss=0.215, loss_mean=0.247][A[A
+
+Train step of epoch 1:  36%|███▌      | 2311/6434 [5:25:29<9:22:12,  8.18s/it, gpt_loss=0.215, loss_mean=0.247][A[A
+
+Train step of epoch 1:  36%|███▌      | 2311/6434 [5:25:37<9:22:12,  8.18s/it, gpt_loss=0.249, loss_mean=0.247][A[A
+
+Train step of epoch 1:  36%|███▌      | 2312/6434 [5:25:37<9:15:32,  8.09s/it, gpt_loss=0.249, loss_mean=0.247][A[A
+
+Train step of epoch 1:  36%|███▌      | 2312/6434 [5:25:46<9:15:32,  8.09s/it, gpt_loss=0.26, loss_mean=0.248] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2313/6434 [5:25:46<9:22:36,  8.19s/it, gpt_loss=0.26, loss_mean=0.248][A[A
+
+Train step of epoch 1:  36%|███▌      | 2313/6434 [5:25:54<9:22:36,  8.19s/it, gpt_loss=0.254, loss_mean=0.249][A[A
+
+Train step of epoch 1:  36%|███▌      | 2314/6434 [5:25:54<9:20:07,  8.16s/it, gpt_loss=0.254, loss_mean=0.249][A[A
+
+Train step of epoch 1:  36%|███▌      | 2314/6434 [5:26:02<9:20:07,  8.16s/it, gpt_loss=0.261, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2315/6434 [5:26:02<9:26:27,  8.25s/it, gpt_loss=0.261, loss_mean=0.25][A[A
+[LID Router Debug] Step: 8750
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [0, 1, 1, 3, 9, 9, 3, 4, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  36%|███▌      | 2315/6434 [5:26:11<9:26:27,  8.25s/it, gpt_loss=0.369, loss_mean=0.262][A[A
+
+Train step of epoch 1:  36%|███▌      | 2316/6434 [5:26:11<9:28:30,  8.28s/it, gpt_loss=0.369, loss_mean=0.262][A[A
+
+Train step of epoch 1:  36%|███▌      | 2316/6434 [5:26:19<9:28:30,  8.28s/it, gpt_loss=0.231, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2317/6434 [5:26:19<9:32:49,  8.35s/it, gpt_loss=0.231, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2317/6434 [5:26:26<9:32:49,  8.35s/it, gpt_loss=0.266, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2318/6434 [5:26:26<9:12:13,  8.05s/it, gpt_loss=0.266, loss_mean=0.26][A[A
+
+Train step of epoch 1:  36%|███▌      | 2318/6434 [5:26:34<9:12:13,  8.05s/it, gpt_loss=0.252, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2319/6434 [5:26:34<9:04:00,  7.93s/it, gpt_loss=0.252, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2319/6434 [5:26:43<9:04:00,  7.93s/it, gpt_loss=0.262, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2320/6434 [5:26:43<9:26:37,  8.26s/it, gpt_loss=0.262, loss_mean=0.259][A[A
+
+Train step of epoch 1:  36%|███▌      | 2320/6434 [5:26:52<9:26:37,  8.26s/it, gpt_loss=0.306, loss_mean=0.264][A[A
+
+Train step of epoch 1:  36%|███▌      | 2321/6434 [5:26:52<9:49:00,  8.59s/it, gpt_loss=0.306, loss_mean=0.264][A[A
+
+Train step of epoch 1:  36%|███▌      | 2321/6434 [5:27:01<9:49:00,  8.59s/it, gpt_loss=0.25, loss_mean=0.262] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2322/6434 [5:27:01<9:56:42,  8.71s/it, gpt_loss=0.25, loss_mean=0.262][A[A
+
+Train step of epoch 1:  36%|███▌      | 2322/6434 [5:27:09<9:56:42,  8.71s/it, gpt_loss=0.331, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2323/6434 [5:27:09<9:41:27,  8.49s/it, gpt_loss=0.331, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2323/6434 [5:27:17<9:41:27,  8.49s/it, gpt_loss=0.302, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2324/6434 [5:27:17<9:26:24,  8.27s/it, gpt_loss=0.302, loss_mean=0.273][A[A
+
+Train step of epoch 1:  36%|███▌      | 2324/6434 [5:27:26<9:26:24,  8.27s/it, gpt_loss=0.236, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2325/6434 [5:27:26<9:38:47,  8.45s/it, gpt_loss=0.236, loss_mean=0.269][A[A
+[LID Router Debug] Step: 8760
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [1, 1, 3, 3, 0, 2, 9, 5, 1, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  36%|███▌      | 2325/6434 [5:27:34<9:38:47,  8.45s/it, gpt_loss=0.277, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2326/6434 [5:27:34<9:28:37,  8.31s/it, gpt_loss=0.277, loss_mean=0.27][A[A
+
+Train step of epoch 1:  36%|███▌      | 2326/6434 [5:27:43<9:28:37,  8.31s/it, gpt_loss=0.319, loss_mean=0.275][A[A
+
+Train step of epoch 1:  36%|███▌      | 2327/6434 [5:27:43<9:45:01,  8.55s/it, gpt_loss=0.319, loss_mean=0.275][A[A
+
+Train step of epoch 1:  36%|███▌      | 2327/6434 [5:27:51<9:45:01,  8.55s/it, gpt_loss=0.346, loss_mean=0.282][A[A
+
+Train step of epoch 1:  36%|███▌      | 2328/6434 [5:27:51<9:30:41,  8.34s/it, gpt_loss=0.346, loss_mean=0.282][A[A
+
+Train step of epoch 1:  36%|███▌      | 2328/6434 [5:27:59<9:30:41,  8.34s/it, gpt_loss=0.21, loss_mean=0.275] [A[A
+
+Train step of epoch 1:  36%|███▌      | 2329/6434 [5:27:59<9:33:44,  8.39s/it, gpt_loss=0.21, loss_mean=0.275][A[A
+
+Train step of epoch 1:  36%|███▌      | 2329/6434 [5:28:07<9:33:44,  8.39s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+
+Train step of epoch 1:  36%|███▌      | 2330/6434 [5:28:07<9:14:07,  8.10s/it, gpt_loss=0.276, loss_mean=0.275][A[A
+
+Train step of epoch 1:  36%|███▌      | 2330/6434 [5:28:16<9:14:07,  8.10s/it, gpt_loss=0.221, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2331/6434 [5:28:16<9:34:57,  8.41s/it, gpt_loss=0.221, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▌      | 2331/6434 [5:28:24<9:34:57,  8.41s/it, gpt_loss=0.241, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2332/6434 [5:28:24<9:34:50,  8.41s/it, gpt_loss=0.241, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▌      | 2332/6434 [5:28:32<9:34:50,  8.41s/it, gpt_loss=0.284, loss_mean=0.268][A[A
+
+Train step of epoch 1:  36%|███▋      | 2333/6434 [5:28:32<9:10:00,  8.05s/it, gpt_loss=0.284, loss_mean=0.268][A[A
+
+Train step of epoch 1:  36%|███▋      | 2333/6434 [5:28:41<9:10:00,  8.05s/it, gpt_loss=0.259, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▋      | 2334/6434 [5:28:41<9:41:07,  8.50s/it, gpt_loss=0.259, loss_mean=0.267][A[A
+
+Train step of epoch 1:  36%|███▋      | 2334/6434 [5:28:50<9:41:07,  8.50s/it, gpt_loss=0.338, loss_mean=0.274][A[A
+
+Train step of epoch 1:  36%|███▋      | 2335/6434 [5:28:50<9:51:45,  8.66s/it, gpt_loss=0.338, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8770
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [4, 3, 5, 2, 4, 0, 1, 2, 3, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  36%|███▋      | 2335/6434 [5:28:59<9:51:45,  8.66s/it, gpt_loss=0.253, loss_mean=0.272][A[A
+
+Train step of epoch 1:  36%|███▋      | 2336/6434 [5:28:59<9:45:22,  8.57s/it, gpt_loss=0.253, loss_mean=0.272][A[A
+
+Train step of epoch 1:  36%|███▋      | 2336/6434 [5:29:08<9:45:22,  8.57s/it, gpt_loss=0.258, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▋      | 2337/6434 [5:29:08<10:05:09,  8.86s/it, gpt_loss=0.258, loss_mean=0.271][A[A
+
+Train step of epoch 1:  36%|███▋      | 2337/6434 [5:29:15<10:05:09,  8.86s/it, gpt_loss=0.228, loss_mean=0.266][A[A
+
+Train step of epoch 1:  36%|███▋      | 2338/6434 [5:29:15<9:31:55,  8.38s/it, gpt_loss=0.228, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  36%|███▋      | 2338/6434 [5:29:24<9:31:55,  8.38s/it, gpt_loss=0.299, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  36%|███▋      | 2339/6434 [5:29:24<9:42:33,  8.54s/it, gpt_loss=0.299, loss_mean=0.27][A[A
+
+Train step of epoch 1:  36%|███▋      | 2339/6434 [5:29:32<9:42:33,  8.54s/it, gpt_loss=0.265, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▋      | 2340/6434 [5:29:32<9:17:23,  8.17s/it, gpt_loss=0.265, loss_mean=0.269][A[A
+
+Train step of epoch 1:  36%|███▋      | 2340/6434 [5:29:40<9:17:23,  8.17s/it, gpt_loss=0.46, loss_mean=0.288] [A[A
+
+Train step of epoch 1:  36%|███▋      | 2341/6434 [5:29:40<9:14:20,  8.13s/it, gpt_loss=0.46, loss_mean=0.288][A[A
+
+Train step of epoch 1:  36%|███▋      | 2341/6434 [5:29:49<9:14:20,  8.13s/it, gpt_loss=0.303, loss_mean=0.29][A[A
+
+Train step of epoch 1:  36%|███▋      | 2342/6434 [5:29:49<9:39:08,  8.49s/it, gpt_loss=0.303, loss_mean=0.29][A[A
+
+Train step of epoch 1:  36%|███▋      | 2342/6434 [5:29:57<9:39:08,  8.49s/it, gpt_loss=0.192, loss_mean=0.28][A[A
+
+Train step of epoch 1:  36%|███▋      | 2343/6434 [5:29:57<9:40:02,  8.51s/it, gpt_loss=0.192, loss_mean=0.28][A[A
+
+Train step of epoch 1:  36%|███▋      | 2343/6434 [5:30:06<9:40:02,  8.51s/it, gpt_loss=0.243, loss_mean=0.276][A[A
+
+Train step of epoch 1:  36%|███▋      | 2344/6434 [5:30:06<9:37:33,  8.47s/it, gpt_loss=0.243, loss_mean=0.276][A[A
+
+Train step of epoch 1:  36%|███▋      | 2344/6434 [5:30:13<9:37:33,  8.47s/it, gpt_loss=0.333, loss_mean=0.282][A[A
+
+Train step of epoch 1:  36%|███▋      | 2345/6434 [5:30:13<9:13:35,  8.12s/it, gpt_loss=0.333, loss_mean=0.282][A[A
+[LID Router Debug] Step: 8780
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [0, 9, 5, 3, 5, 1, 0, 3, 9, 3]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+
+Train step of epoch 1:  36%|███▋      | 2345/6434 [5:30:21<9:13:35,  8.12s/it, gpt_loss=0.257, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  36%|███▋      | 2346/6434 [5:30:21<9:12:47,  8.11s/it, gpt_loss=0.257, loss_mean=0.28][A[A
+
+Train step of epoch 1:  36%|███▋      | 2346/6434 [5:30:30<9:12:47,  8.11s/it, gpt_loss=0.355, loss_mean=0.287][A[A
+
+Train step of epoch 1:  36%|███▋      | 2347/6434 [5:30:30<9:19:35,  8.22s/it, gpt_loss=0.355, loss_mean=0.287][A[A
+
+Train step of epoch 1:  36%|███▋      | 2347/6434 [5:30:37<9:19:35,  8.22s/it, gpt_loss=0.307, loss_mean=0.289][A[A
+
+Train step of epoch 1:  36%|███▋      | 2348/6434 [5:30:37<9:09:45,  8.07s/it, gpt_loss=0.307, loss_mean=0.289][A[A
+
+Train step of epoch 1:  36%|███▋      | 2348/6434 [5:30:45<9:09:45,  8.07s/it, gpt_loss=0.265, loss_mean=0.287][A[A
+
+Train step of epoch 1:  37%|███▋      | 2349/6434 [5:30:45<9:08:19,  8.05s/it, gpt_loss=0.265, loss_mean=0.287][A[A
+
+Train step of epoch 1:  37%|███▋      | 2349/6434 [5:30:54<9:08:19,  8.05s/it, gpt_loss=0.26, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2350/6434 [5:30:54<9:15:11,  8.16s/it, gpt_loss=0.26, loss_mean=0.284][A[A
+
+Train step of epoch 1:  37%|███▋      | 2350/6434 [5:31:02<9:15:11,  8.16s/it, gpt_loss=0.283, loss_mean=0.284][A[A
+
+Train step of epoch 1:  37%|███▋      | 2351/6434 [5:31:02<9:06:40,  8.03s/it, gpt_loss=0.283, loss_mean=0.284][A[A
+
+Train step of epoch 1:  37%|███▋      | 2351/6434 [5:31:10<9:06:40,  8.03s/it, gpt_loss=0.299, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2352/6434 [5:31:10<9:04:45,  8.01s/it, gpt_loss=0.299, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2352/6434 [5:31:17<9:04:45,  8.01s/it, gpt_loss=0.258, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2353/6434 [5:31:17<9:00:55,  7.95s/it, gpt_loss=0.258, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2353/6434 [5:31:25<9:00:55,  7.95s/it, gpt_loss=0.491, loss_mean=0.303][A[A
+
+Train step of epoch 1:  37%|███▋      | 2354/6434 [5:31:25<9:03:21,  7.99s/it, gpt_loss=0.491, loss_mean=0.303][A[A
+
+Train step of epoch 1:  37%|███▋      | 2354/6434 [5:31:33<9:03:21,  7.99s/it, gpt_loss=0.248, loss_mean=0.298][A[A
+
+Train step of epoch 1:  37%|███▋      | 2355/6434 [5:31:33<8:58:44,  7.92s/it, gpt_loss=0.248, loss_mean=0.298][A[A
+[LID Router Debug] Step: 8790
+Batch Size: 10
+Audio Batch Size: 101
+LID Assignments: [2, 3, 0, 5, 4, 1, 6, 6, 6, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  37%|███▋      | 2355/6434 [5:31:42<8:58:44,  7.92s/it, gpt_loss=0.203, loss_mean=0.288][A[A
+
+Train step of epoch 1:  37%|███▋      | 2356/6434 [5:31:42<9:24:38,  8.31s/it, gpt_loss=0.203, loss_mean=0.288][A[A
+
+Train step of epoch 1:  37%|███▋      | 2356/6434 [5:31:50<9:24:38,  8.31s/it, gpt_loss=0.275, loss_mean=0.287][A[A
+
+Train step of epoch 1:  37%|███▋      | 2357/6434 [5:31:50<9:09:55,  8.09s/it, gpt_loss=0.275, loss_mean=0.287][A[A
+
+Train step of epoch 1:  37%|███▋      | 2357/6434 [5:31:59<9:09:55,  8.09s/it, gpt_loss=0.262, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2358/6434 [5:31:59<9:27:11,  8.35s/it, gpt_loss=0.262, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2358/6434 [5:32:07<9:27:11,  8.35s/it, gpt_loss=0.272, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2359/6434 [5:32:07<9:25:45,  8.33s/it, gpt_loss=0.272, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2359/6434 [5:32:16<9:25:45,  8.33s/it, gpt_loss=0.24, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2360/6434 [5:32:16<9:29:25,  8.39s/it, gpt_loss=0.24, loss_mean=0.279][A[A
+
+Train step of epoch 1:  37%|███▋      | 2360/6434 [5:32:24<9:29:25,  8.39s/it, gpt_loss=0.236, loss_mean=0.275][A[A
+
+Train step of epoch 1:  37%|███▋      | 2361/6434 [5:32:24<9:35:05,  8.47s/it, gpt_loss=0.236, loss_mean=0.275][A[A
+
+Train step of epoch 1:  37%|███▋      | 2361/6434 [5:32:33<9:35:05,  8.47s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2362/6434 [5:32:33<9:27:41,  8.36s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2362/6434 [5:32:42<9:27:41,  8.36s/it, gpt_loss=0.278, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2363/6434 [5:32:42<9:42:23,  8.58s/it, gpt_loss=0.278, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2363/6434 [5:32:50<9:42:23,  8.58s/it, gpt_loss=0.27, loss_mean=0.272] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2364/6434 [5:32:50<9:39:03,  8.54s/it, gpt_loss=0.27, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2364/6434 [5:32:58<9:39:03,  8.54s/it, gpt_loss=0.185, loss_mean=0.263][A[A
+
+Train step of epoch 1:  37%|███▋      | 2365/6434 [5:32:58<9:33:11,  8.45s/it, gpt_loss=0.185, loss_mean=0.263][A[A
+[LID Router Debug] Step: 8800
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [9, 2, 9, 3, 0, 6, 5, 1, 2, 9]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+[2026-02-07 12:34:53,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=0, lr=[1.1552326766016015e-05, 1.1552326766016015e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 12:34:53,947] [INFO] [timer.py:260:stop] epoch=0/micro_step=8800/global_step=4400, RunningAvgSamplesPerSec=4.746315124440067, CurrSamplesPerSec=4.778045118353633, MemAllocated=12.6GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  37%|███▋      | 2365/6434 [5:33:07<9:33:11,  8.45s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  37%|███▋      | 2366/6434 [5:33:07<9:34:25,  8.47s/it, gpt_loss=0.266, loss_mean=0.263][A[A
+
+Train step of epoch 1:  37%|███▋      | 2366/6434 [5:33:16<9:34:25,  8.47s/it, gpt_loss=0.234, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2367/6434 [5:33:16<9:43:27,  8.61s/it, gpt_loss=0.234, loss_mean=0.26][A[A
+
+Train step of epoch 1:  37%|███▋      | 2367/6434 [5:33:24<9:43:27,  8.61s/it, gpt_loss=0.329, loss_mean=0.267][A[A
+
+Train step of epoch 1:  37%|███▋      | 2368/6434 [5:33:24<9:28:21,  8.39s/it, gpt_loss=0.329, loss_mean=0.267][A[A
+
+Train step of epoch 1:  37%|███▋      | 2368/6434 [5:33:32<9:28:21,  8.39s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2369/6434 [5:33:32<9:31:44,  8.44s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2369/6434 [5:33:40<9:31:44,  8.44s/it, gpt_loss=0.25, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2370/6434 [5:33:40<9:16:14,  8.21s/it, gpt_loss=0.25, loss_mean=0.264][A[A
+
+Train step of epoch 1:  37%|███▋      | 2370/6434 [5:33:48<9:16:14,  8.21s/it, gpt_loss=0.34, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2371/6434 [5:33:48<9:14:24,  8.19s/it, gpt_loss=0.34, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2371/6434 [5:33:57<9:14:24,  8.19s/it, gpt_loss=0.289, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2372/6434 [5:33:57<9:23:16,  8.32s/it, gpt_loss=0.289, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2372/6434 [5:34:05<9:23:16,  8.32s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2373/6434 [5:34:05<9:15:30,  8.21s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2373/6434 [5:34:12<9:15:30,  8.21s/it, gpt_loss=0.287, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2374/6434 [5:34:12<9:04:52,  8.05s/it, gpt_loss=0.287, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2374/6434 [5:34:21<9:04:52,  8.05s/it, gpt_loss=0.281, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2375/6434 [5:34:21<9:11:24,  8.15s/it, gpt_loss=0.281, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8810
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [1, 0, 9, 5, 5, 9, 1, 3, 6, 5]
+Active Experts in Batch: {0, 1, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  37%|███▋      | 2375/6434 [5:34:28<9:11:24,  8.15s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2376/6434 [5:34:28<8:53:50,  7.89s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2376/6434 [5:34:36<8:53:50,  7.89s/it, gpt_loss=0.289, loss_mean=0.276][A[A
+
+Train step of epoch 1:  37%|███▋      | 2377/6434 [5:34:36<8:59:12,  7.97s/it, gpt_loss=0.289, loss_mean=0.276][A[A
+
+Train step of epoch 1:  37%|███▋      | 2377/6434 [5:34:45<8:59:12,  7.97s/it, gpt_loss=0.227, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2378/6434 [5:34:45<9:25:33,  8.37s/it, gpt_loss=0.227, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2378/6434 [5:34:53<9:25:33,  8.37s/it, gpt_loss=0.284, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2379/6434 [5:34:53<9:14:20,  8.20s/it, gpt_loss=0.284, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2379/6434 [5:35:02<9:14:20,  8.20s/it, gpt_loss=0.241, loss_mean=0.269][A[A
+
+Train step of epoch 1:  37%|███▋      | 2380/6434 [5:35:02<9:23:33,  8.34s/it, gpt_loss=0.241, loss_mean=0.269][A[A
+
+Train step of epoch 1:  37%|███▋      | 2380/6434 [5:35:11<9:23:33,  8.34s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2381/6434 [5:35:11<9:32:19,  8.47s/it, gpt_loss=0.262, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2381/6434 [5:35:19<9:32:19,  8.47s/it, gpt_loss=0.313, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2382/6434 [5:35:19<9:35:41,  8.52s/it, gpt_loss=0.313, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2382/6434 [5:35:28<9:35:41,  8.52s/it, gpt_loss=0.271, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2383/6434 [5:35:28<9:37:38,  8.56s/it, gpt_loss=0.271, loss_mean=0.272][A[A
+
+Train step of epoch 1:  37%|███▋      | 2383/6434 [5:35:38<9:37:38,  8.56s/it, gpt_loss=0.29, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2384/6434 [5:35:38<10:00:22,  8.89s/it, gpt_loss=0.29, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2384/6434 [5:35:47<10:00:22,  8.89s/it, gpt_loss=0.223, loss_mean=0.269][A[A
+
+Train step of epoch 1:  37%|███▋      | 2385/6434 [5:35:47<10:00:21,  8.90s/it, gpt_loss=0.223, loss_mean=0.269][A[A
+[LID Router Debug] Step: 8820
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [1, 1, 1, 5, 2, 3, 3, 6, 0, 9]
+Active Experts in Batch: {0, 1, 2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  37%|███▋      | 2385/6434 [5:35:55<10:00:21,  8.90s/it, gpt_loss=0.276, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2386/6434 [5:35:55<9:48:38,  8.72s/it, gpt_loss=0.276, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2386/6434 [5:36:03<9:48:38,  8.72s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2387/6434 [5:36:03<9:30:49,  8.46s/it, gpt_loss=0.252, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2387/6434 [5:36:12<9:30:49,  8.46s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  37%|███▋      | 2388/6434 [5:36:12<9:42:08,  8.63s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  37%|███▋      | 2388/6434 [5:36:20<9:42:08,  8.63s/it, gpt_loss=0.213, loss_mean=0.259][A[A
+
+Train step of epoch 1:  37%|███▋      | 2389/6434 [5:36:20<9:34:25,  8.52s/it, gpt_loss=0.213, loss_mean=0.259][A[A
+
+Train step of epoch 1:  37%|███▋      | 2389/6434 [5:36:28<9:34:25,  8.52s/it, gpt_loss=0.244, loss_mean=0.257][A[A
+
+Train step of epoch 1:  37%|███▋      | 2390/6434 [5:36:28<9:27:38,  8.42s/it, gpt_loss=0.244, loss_mean=0.257][A[A
+
+Train step of epoch 1:  37%|███▋      | 2390/6434 [5:36:36<9:27:38,  8.42s/it, gpt_loss=0.333, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2391/6434 [5:36:36<9:10:38,  8.17s/it, gpt_loss=0.333, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2391/6434 [5:36:44<9:10:38,  8.17s/it, gpt_loss=0.293, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2392/6434 [5:36:44<9:18:30,  8.29s/it, gpt_loss=0.293, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2392/6434 [5:36:52<9:18:30,  8.29s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2393/6434 [5:36:52<9:14:22,  8.23s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  37%|███▋      | 2393/6434 [5:37:01<9:14:22,  8.23s/it, gpt_loss=0.255, loss_mean=0.264][A[A
+
+Train step of epoch 1:  37%|███▋      | 2394/6434 [5:37:01<9:21:47,  8.34s/it, gpt_loss=0.255, loss_mean=0.264][A[A
+
+Train step of epoch 1:  37%|███▋      | 2394/6434 [5:37:09<9:21:47,  8.34s/it, gpt_loss=0.37, loss_mean=0.275] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2395/6434 [5:37:09<9:03:34,  8.07s/it, gpt_loss=0.37, loss_mean=0.275][A[A
+[LID Router Debug] Step: 8830
+Batch Size: 10
+Audio Batch Size: 82
+LID Assignments: [0, 4, 5, 1, 9, 1, 1, 0, 6, 1]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  37%|███▋      | 2395/6434 [5:37:16<9:03:34,  8.07s/it, gpt_loss=0.265, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2396/6434 [5:37:16<8:44:03,  7.79s/it, gpt_loss=0.265, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2396/6434 [5:37:24<8:44:03,  7.79s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2397/6434 [5:37:24<8:56:20,  7.97s/it, gpt_loss=0.265, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2397/6434 [5:37:32<8:56:20,  7.97s/it, gpt_loss=0.371, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2398/6434 [5:37:32<8:48:35,  7.86s/it, gpt_loss=0.371, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2398/6434 [5:37:40<8:48:35,  7.86s/it, gpt_loss=0.207, loss_mean=0.275][A[A
+
+Train step of epoch 1:  37%|███▋      | 2399/6434 [5:37:40<9:06:31,  8.13s/it, gpt_loss=0.207, loss_mean=0.275][A[A
+
+Train step of epoch 1:  37%|███▋      | 2399/6434 [5:37:49<9:06:31,  8.13s/it, gpt_loss=0.293, loss_mean=0.277][A[A
+
+Train step of epoch 1:  37%|███▋      | 2400/6434 [5:37:49<9:08:45,  8.16s/it, gpt_loss=0.293, loss_mean=0.277][A[A
+
+Train step of epoch 1:  37%|███▋      | 2400/6434 [5:37:56<9:08:45,  8.16s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  37%|███▋      | 2401/6434 [5:37:56<9:02:52,  8.08s/it, gpt_loss=0.288, loss_mean=0.278][A[A
+
+Train step of epoch 1:  37%|███▋      | 2401/6434 [5:38:06<9:02:52,  8.08s/it, gpt_loss=0.243, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2402/6434 [5:38:06<9:27:34,  8.45s/it, gpt_loss=0.243, loss_mean=0.274][A[A
+
+Train step of epoch 1:  37%|███▋      | 2402/6434 [5:38:14<9:27:34,  8.45s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  37%|███▋      | 2403/6434 [5:38:14<9:21:52,  8.36s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  37%|███▋      | 2403/6434 [5:38:23<9:21:52,  8.36s/it, gpt_loss=0.237, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2404/6434 [5:38:23<9:31:45,  8.51s/it, gpt_loss=0.237, loss_mean=0.273][A[A
+
+Train step of epoch 1:  37%|███▋      | 2404/6434 [5:38:31<9:31:45,  8.51s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+
+Train step of epoch 1:  37%|███▋      | 2405/6434 [5:38:31<9:27:44,  8.45s/it, gpt_loss=0.223, loss_mean=0.268][A[A
+[LID Router Debug] Step: 8840
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [2, 4, 4, 6, 6, 1, 0, 9, 2, 5]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  37%|███▋      | 2405/6434 [5:38:40<9:27:44,  8.45s/it, gpt_loss=0.293, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2406/6434 [5:38:40<9:37:14,  8.60s/it, gpt_loss=0.293, loss_mean=0.271][A[A
+
+Train step of epoch 1:  37%|███▋      | 2406/6434 [5:38:48<9:37:14,  8.60s/it, gpt_loss=0.349, loss_mean=0.279][A[A
+
+Train step of epoch 1:  37%|███▋      | 2407/6434 [5:38:48<9:32:30,  8.53s/it, gpt_loss=0.349, loss_mean=0.279][A[A
+
+Train step of epoch 1:  37%|███▋      | 2407/6434 [5:38:56<9:32:30,  8.53s/it, gpt_loss=0.299, loss_mean=0.281][A[A
+
+Train step of epoch 1:  37%|███▋      | 2408/6434 [5:38:56<9:21:32,  8.37s/it, gpt_loss=0.299, loss_mean=0.281][A[A
+
+Train step of epoch 1:  37%|███▋      | 2408/6434 [5:39:06<9:21:32,  8.37s/it, gpt_loss=0.281, loss_mean=0.281][A[A
+
+Train step of epoch 1:  37%|███▋      | 2409/6434 [5:39:06<9:41:48,  8.67s/it, gpt_loss=0.281, loss_mean=0.281][A[A
+
+Train step of epoch 1:  37%|███▋      | 2409/6434 [5:39:14<9:41:48,  8.67s/it, gpt_loss=0.25, loss_mean=0.278] [A[A
+
+Train step of epoch 1:  37%|███▋      | 2410/6434 [5:39:14<9:36:51,  8.60s/it, gpt_loss=0.25, loss_mean=0.278][A[A
+
+Train step of epoch 1:  37%|███▋      | 2410/6434 [5:39:23<9:36:51,  8.60s/it, gpt_loss=0.356, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2411/6434 [5:39:23<9:35:22,  8.58s/it, gpt_loss=0.356, loss_mean=0.285][A[A
+
+Train step of epoch 1:  37%|███▋      | 2411/6434 [5:39:32<9:35:22,  8.58s/it, gpt_loss=0.266, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2412/6434 [5:39:32<9:57:47,  8.92s/it, gpt_loss=0.266, loss_mean=0.283][A[A
+
+Train step of epoch 1:  37%|███▋      | 2412/6434 [5:39:40<9:57:47,  8.92s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2413/6434 [5:39:40<9:33:46,  8.56s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2413/6434 [5:39:48<9:33:46,  8.56s/it, gpt_loss=0.282, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2414/6434 [5:39:48<9:26:46,  8.46s/it, gpt_loss=0.282, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2414/6434 [5:39:57<9:26:46,  8.46s/it, gpt_loss=0.235, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2415/6434 [5:39:57<9:34:06,  8.57s/it, gpt_loss=0.235, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8850
+Batch Size: 10
+Audio Batch Size: 124
+LID Assignments: [3, 1, 4, 4, 2, 6, 3, 6, 1, 3]
+Active Experts in Batch: {1, 2, 3, 4, 6}
+
+
+Train step of epoch 1:  38%|███▊      | 2415/6434 [5:40:06<9:34:06,  8.57s/it, gpt_loss=0.203, loss_mean=0.267][A[A
+
+Train step of epoch 1:  38%|███▊      | 2416/6434 [5:40:06<9:41:29,  8.68s/it, gpt_loss=0.203, loss_mean=0.267][A[A
+
+Train step of epoch 1:  38%|███▊      | 2416/6434 [5:40:15<9:41:29,  8.68s/it, gpt_loss=0.3, loss_mean=0.27]   [A[A
+
+Train step of epoch 1:  38%|███▊      | 2417/6434 [5:40:15<9:52:39,  8.85s/it, gpt_loss=0.3, loss_mean=0.27][A[A
+
+Train step of epoch 1:  38%|███▊      | 2417/6434 [5:40:24<9:52:39,  8.85s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:  38%|███▊      | 2418/6434 [5:40:24<9:42:57,  8.71s/it, gpt_loss=0.26, loss_mean=0.269][A[A
+
+Train step of epoch 1:  38%|███▊      | 2418/6434 [5:40:32<9:42:57,  8.71s/it, gpt_loss=0.264, loss_mean=0.269][A[A
+
+Train step of epoch 1:  38%|███▊      | 2419/6434 [5:40:32<9:35:00,  8.59s/it, gpt_loss=0.264, loss_mean=0.269][A[A
+
+Train step of epoch 1:  38%|███▊      | 2419/6434 [5:40:40<9:35:00,  8.59s/it, gpt_loss=0.229, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2420/6434 [5:40:40<9:26:39,  8.47s/it, gpt_loss=0.229, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2420/6434 [5:40:48<9:26:39,  8.47s/it, gpt_loss=0.248, loss_mean=0.263][A[A
+
+Train step of epoch 1:  38%|███▊      | 2421/6434 [5:40:48<9:12:22,  8.26s/it, gpt_loss=0.248, loss_mean=0.263][A[A
+
+Train step of epoch 1:  38%|███▊      | 2421/6434 [5:40:57<9:12:22,  8.26s/it, gpt_loss=0.267, loss_mean=0.264][A[A
+
+Train step of epoch 1:  38%|███▊      | 2422/6434 [5:40:57<9:27:02,  8.48s/it, gpt_loss=0.267, loss_mean=0.264][A[A
+
+Train step of epoch 1:  38%|███▊      | 2422/6434 [5:41:06<9:27:02,  8.48s/it, gpt_loss=0.218, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2423/6434 [5:41:06<9:35:34,  8.61s/it, gpt_loss=0.218, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2423/6434 [5:41:16<9:35:34,  8.61s/it, gpt_loss=0.257, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2424/6434 [5:41:16<10:00:31,  8.99s/it, gpt_loss=0.257, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2424/6434 [5:41:25<10:00:31,  8.99s/it, gpt_loss=0.223, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2425/6434 [5:41:25<9:55:51,  8.92s/it, gpt_loss=0.223, loss_mean=0.255] [A[A
+[LID Router Debug] Step: 8860
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [3, 0, 6, 4, 2, 5, 6, 8, 5, 6]
+Active Experts in Batch: {0, 2, 3, 4, 5, 6, 8}
+
+
+Train step of epoch 1:  38%|███▊      | 2425/6434 [5:41:33<9:55:51,  8.92s/it, gpt_loss=0.25, loss_mean=0.255] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2426/6434 [5:41:33<9:51:57,  8.86s/it, gpt_loss=0.25, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2426/6434 [5:41:41<9:51:57,  8.86s/it, gpt_loss=0.318, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2427/6434 [5:41:41<9:30:23,  8.54s/it, gpt_loss=0.318, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2427/6434 [5:41:48<9:30:23,  8.54s/it, gpt_loss=0.282, loss_mean=0.263][A[A
+
+Train step of epoch 1:  38%|███▊      | 2428/6434 [5:41:48<9:05:48,  8.17s/it, gpt_loss=0.282, loss_mean=0.263][A[A
+
+Train step of epoch 1:  38%|███▊      | 2428/6434 [5:41:57<9:05:48,  8.17s/it, gpt_loss=0.284, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2429/6434 [5:41:57<9:20:30,  8.40s/it, gpt_loss=0.284, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2429/6434 [5:42:05<9:20:30,  8.40s/it, gpt_loss=0.29, loss_mean=0.268] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2430/6434 [5:42:05<9:11:05,  8.26s/it, gpt_loss=0.29, loss_mean=0.268][A[A
+
+Train step of epoch 1:  38%|███▊      | 2430/6434 [5:42:13<9:11:05,  8.26s/it, gpt_loss=0.293, loss_mean=0.27][A[A
+
+Train step of epoch 1:  38%|███▊      | 2431/6434 [5:42:13<8:57:02,  8.05s/it, gpt_loss=0.293, loss_mean=0.27][A[A
+
+Train step of epoch 1:  38%|███▊      | 2431/6434 [5:42:22<8:57:02,  8.05s/it, gpt_loss=0.343, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2432/6434 [5:42:22<9:12:37,  8.29s/it, gpt_loss=0.343, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2432/6434 [5:42:31<9:12:37,  8.29s/it, gpt_loss=0.272, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2433/6434 [5:42:31<9:40:53,  8.71s/it, gpt_loss=0.272, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2433/6434 [5:42:39<9:40:53,  8.71s/it, gpt_loss=0.309, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2434/6434 [5:42:39<9:27:06,  8.51s/it, gpt_loss=0.309, loss_mean=0.28][A[A
+
+Train step of epoch 1:  38%|███▊      | 2434/6434 [5:42:47<9:27:06,  8.51s/it, gpt_loss=0.36, loss_mean=0.288][A[A
+
+Train step of epoch 1:  38%|███▊      | 2435/6434 [5:42:47<9:02:30,  8.14s/it, gpt_loss=0.36, loss_mean=0.288][A[A
+[LID Router Debug] Step: 8870
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [0, 3, 3, 4, 0, 2, 9, 3, 9, 7]
+Active Experts in Batch: {0, 2, 3, 4, 7, 9}
+
+
+Train step of epoch 1:  38%|███▊      | 2435/6434 [5:42:56<9:02:30,  8.14s/it, gpt_loss=0.239, loss_mean=0.283][A[A
+
+Train step of epoch 1:  38%|███▊      | 2436/6434 [5:42:56<9:17:18,  8.36s/it, gpt_loss=0.239, loss_mean=0.283][A[A
+
+Train step of epoch 1:  38%|███▊      | 2436/6434 [5:43:04<9:17:18,  8.36s/it, gpt_loss=0.297, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2437/6434 [5:43:04<9:08:27,  8.23s/it, gpt_loss=0.297, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2437/6434 [5:43:12<9:08:27,  8.23s/it, gpt_loss=0.291, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2438/6434 [5:43:12<9:17:30,  8.37s/it, gpt_loss=0.291, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2438/6434 [5:43:21<9:17:30,  8.37s/it, gpt_loss=0.331, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2439/6434 [5:43:21<9:15:49,  8.35s/it, gpt_loss=0.331, loss_mean=0.29][A[A
+
+Train step of epoch 1:  38%|███▊      | 2439/6434 [5:43:29<9:15:49,  8.35s/it, gpt_loss=0.294, loss_mean=0.29][A[A
+
+Train step of epoch 1:  38%|███▊      | 2440/6434 [5:43:29<9:16:22,  8.36s/it, gpt_loss=0.294, loss_mean=0.29][A[A
+
+Train step of epoch 1:  38%|███▊      | 2440/6434 [5:43:37<9:16:22,  8.36s/it, gpt_loss=0.267, loss_mean=0.288][A[A
+
+Train step of epoch 1:  38%|███▊      | 2441/6434 [5:43:37<9:09:35,  8.26s/it, gpt_loss=0.267, loss_mean=0.288][A[A
+
+Train step of epoch 1:  38%|███▊      | 2441/6434 [5:43:46<9:09:35,  8.26s/it, gpt_loss=0.249, loss_mean=0.284][A[A
+
+Train step of epoch 1:  38%|███▊      | 2442/6434 [5:43:46<9:21:04,  8.43s/it, gpt_loss=0.249, loss_mean=0.284][A[A
+
+Train step of epoch 1:  38%|███▊      | 2442/6434 [5:43:54<9:21:04,  8.43s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2443/6434 [5:43:54<9:18:02,  8.39s/it, gpt_loss=0.229, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2443/6434 [5:44:03<9:18:02,  8.39s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2444/6434 [5:44:03<9:27:53,  8.54s/it, gpt_loss=0.273, loss_mean=0.278][A[A
+
+Train step of epoch 1:  38%|███▊      | 2444/6434 [5:44:12<9:27:53,  8.54s/it, gpt_loss=0.243, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2445/6434 [5:44:12<9:43:52,  8.78s/it, gpt_loss=0.243, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8880
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [3, 1, 5, 2, 2, 0, 2, 5, 9, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  38%|███▊      | 2445/6434 [5:44:21<9:43:52,  8.78s/it, gpt_loss=0.301, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2446/6434 [5:44:21<9:40:05,  8.73s/it, gpt_loss=0.301, loss_mean=0.277][A[A
+
+Train step of epoch 1:  38%|███▊      | 2446/6434 [5:44:31<9:40:05,  8.73s/it, gpt_loss=0.355, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2447/6434 [5:44:31<9:58:56,  9.01s/it, gpt_loss=0.355, loss_mean=0.285][A[A
+
+Train step of epoch 1:  38%|███▊      | 2447/6434 [5:44:39<9:58:56,  9.01s/it, gpt_loss=0.179, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2448/6434 [5:44:39<9:48:51,  8.86s/it, gpt_loss=0.179, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2448/6434 [5:44:47<9:48:51,  8.86s/it, gpt_loss=0.268, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2449/6434 [5:44:47<9:21:56,  8.46s/it, gpt_loss=0.268, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2449/6434 [5:44:56<9:21:56,  8.46s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+
+Train step of epoch 1:  38%|███▊      | 2450/6434 [5:44:56<9:29:50,  8.58s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+
+Train step of epoch 1:  38%|███▊      | 2450/6434 [5:45:03<9:29:50,  8.58s/it, gpt_loss=0.188, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2451/6434 [5:45:03<9:13:47,  8.34s/it, gpt_loss=0.188, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2451/6434 [5:45:11<9:13:47,  8.34s/it, gpt_loss=0.256, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2452/6434 [5:45:11<9:07:45,  8.25s/it, gpt_loss=0.256, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2452/6434 [5:45:20<9:07:45,  8.25s/it, gpt_loss=0.236, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2453/6434 [5:45:20<9:08:15,  8.26s/it, gpt_loss=0.236, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2453/6434 [5:45:28<9:08:15,  8.26s/it, gpt_loss=0.219, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2454/6434 [5:45:28<9:11:52,  8.32s/it, gpt_loss=0.219, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2454/6434 [5:45:39<9:11:52,  8.32s/it, gpt_loss=0.323, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2455/6434 [5:45:39<10:08:11,  9.17s/it, gpt_loss=0.323, loss_mean=0.262][A[A
+[LID Router Debug] Step: 8890
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [9, 4, 2, 2, 4, 5, 3, 5, 3, 5]
+Active Experts in Batch: {2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  38%|███▊      | 2455/6434 [5:45:48<10:08:11,  9.17s/it, gpt_loss=0.256, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2456/6434 [5:45:48<9:53:33,  8.95s/it, gpt_loss=0.256, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2456/6434 [5:45:56<9:53:33,  8.95s/it, gpt_loss=0.332, loss_mean=0.268][A[A
+
+Train step of epoch 1:  38%|███▊      | 2457/6434 [5:45:56<9:36:10,  8.69s/it, gpt_loss=0.332, loss_mean=0.268][A[A
+
+Train step of epoch 1:  38%|███▊      | 2457/6434 [5:46:05<9:36:10,  8.69s/it, gpt_loss=0.307, loss_mean=0.272][A[A
+
+Train step of epoch 1:  38%|███▊      | 2458/6434 [5:46:05<9:45:08,  8.83s/it, gpt_loss=0.307, loss_mean=0.272][A[A
+
+Train step of epoch 1:  38%|███▊      | 2458/6434 [5:46:13<9:45:08,  8.83s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  38%|███▊      | 2459/6434 [5:46:13<9:39:11,  8.74s/it, gpt_loss=0.257, loss_mean=0.271][A[A
+
+Train step of epoch 1:  38%|███▊      | 2459/6434 [5:46:22<9:39:11,  8.74s/it, gpt_loss=0.228, loss_mean=0.266][A[A
+
+Train step of epoch 1:  38%|███▊      | 2460/6434 [5:46:22<9:42:10,  8.79s/it, gpt_loss=0.228, loss_mean=0.266][A[A
+
+Train step of epoch 1:  38%|███▊      | 2460/6434 [5:46:32<9:42:10,  8.79s/it, gpt_loss=0.286, loss_mean=0.268][A[A
+
+Train step of epoch 1:  38%|███▊      | 2461/6434 [5:46:32<9:50:52,  8.92s/it, gpt_loss=0.286, loss_mean=0.268][A[A
+
+Train step of epoch 1:  38%|███▊      | 2461/6434 [5:46:39<9:50:52,  8.92s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2462/6434 [5:46:39<9:25:10,  8.54s/it, gpt_loss=0.239, loss_mean=0.265][A[A
+
+Train step of epoch 1:  38%|███▊      | 2462/6434 [5:46:48<9:25:10,  8.54s/it, gpt_loss=0.208, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2463/6434 [5:46:48<9:28:31,  8.59s/it, gpt_loss=0.208, loss_mean=0.26][A[A
+
+Train step of epoch 1:  38%|███▊      | 2463/6434 [5:46:55<9:28:31,  8.59s/it, gpt_loss=0.259, loss_mean=0.26][A[A
+
+Train step of epoch 1:  38%|███▊      | 2464/6434 [5:46:55<9:04:01,  8.22s/it, gpt_loss=0.259, loss_mean=0.26][A[A
+
+Train step of epoch 1:  38%|███▊      | 2464/6434 [5:47:03<9:04:01,  8.22s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2465/6434 [5:47:03<8:59:45,  8.16s/it, gpt_loss=0.271, loss_mean=0.261][A[A
+[LID Router Debug] Step: 8900
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [3, 9, 6, 4, 1, 5, 2, 1, 6, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  38%|███▊      | 2465/6434 [5:47:11<8:59:45,  8.16s/it, gpt_loss=0.233, loss_mean=0.258][A[A
+
+Train step of epoch 1:  38%|███▊      | 2466/6434 [5:47:11<8:59:46,  8.16s/it, gpt_loss=0.233, loss_mean=0.258][A[A
+
+Train step of epoch 1:  38%|███▊      | 2466/6434 [5:47:19<8:59:46,  8.16s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  38%|███▊      | 2467/6434 [5:47:19<8:44:30,  7.93s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  38%|███▊      | 2467/6434 [5:47:28<8:44:30,  7.93s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2468/6434 [5:47:28<8:59:52,  8.17s/it, gpt_loss=0.288, loss_mean=0.259][A[A
+
+Train step of epoch 1:  38%|███▊      | 2468/6434 [5:47:36<8:59:52,  8.17s/it, gpt_loss=0.219, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2469/6434 [5:47:36<9:12:57,  8.37s/it, gpt_loss=0.219, loss_mean=0.255][A[A
+
+Train step of epoch 1:  38%|███▊      | 2469/6434 [5:47:46<9:12:57,  8.37s/it, gpt_loss=0.351, loss_mean=0.264][A[A
+
+Train step of epoch 1:  38%|███▊      | 2470/6434 [5:47:46<9:28:46,  8.61s/it, gpt_loss=0.351, loss_mean=0.264][A[A
+
+Train step of epoch 1:  38%|███▊      | 2470/6434 [5:47:55<9:28:46,  8.61s/it, gpt_loss=0.232, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2471/6434 [5:47:55<9:34:19,  8.70s/it, gpt_loss=0.232, loss_mean=0.261][A[A
+
+Train step of epoch 1:  38%|███▊      | 2471/6434 [5:48:03<9:34:19,  8.70s/it, gpt_loss=0.269, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2472/6434 [5:48:03<9:30:24,  8.64s/it, gpt_loss=0.269, loss_mean=0.262][A[A
+
+Train step of epoch 1:  38%|███▊      | 2472/6434 [5:48:11<9:30:24,  8.64s/it, gpt_loss=0.361, loss_mean=0.272][A[A
+
+Train step of epoch 1:  38%|███▊      | 2473/6434 [5:48:11<9:18:13,  8.46s/it, gpt_loss=0.361, loss_mean=0.272][A[A
+
+Train step of epoch 1:  38%|███▊      | 2473/6434 [5:48:20<9:18:13,  8.46s/it, gpt_loss=0.342, loss_mean=0.279][A[A
+
+Train step of epoch 1:  38%|███▊      | 2474/6434 [5:48:20<9:24:09,  8.55s/it, gpt_loss=0.342, loss_mean=0.279][A[A
+
+Train step of epoch 1:  38%|███▊      | 2474/6434 [5:48:30<9:24:09,  8.55s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+
+Train step of epoch 1:  38%|███▊      | 2475/6434 [5:48:30<9:57:17,  9.05s/it, gpt_loss=0.285, loss_mean=0.279][A[A
+[LID Router Debug] Step: 8910
+Batch Size: 10
+Audio Batch Size: 107
+LID Assignments: [2, 4, 1, 0, 9, 9, 3, 2, 2, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  38%|███▊      | 2475/6434 [5:48:39<9:57:17,  9.05s/it, gpt_loss=0.284, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  38%|███▊      | 2476/6434 [5:48:39<9:45:54,  8.88s/it, gpt_loss=0.284, loss_mean=0.28][A[A
+
+Train step of epoch 1:  38%|███▊      | 2476/6434 [5:48:47<9:45:54,  8.88s/it, gpt_loss=0.218, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2477/6434 [5:48:47<9:47:10,  8.90s/it, gpt_loss=0.218, loss_mean=0.274][A[A
+
+Train step of epoch 1:  38%|███▊      | 2477/6434 [5:48:56<9:47:10,  8.90s/it, gpt_loss=0.226, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2478/6434 [5:48:56<9:35:39,  8.73s/it, gpt_loss=0.226, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2478/6434 [5:49:04<9:35:39,  8.73s/it, gpt_loss=0.275, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  39%|███▊      | 2479/6434 [5:49:04<9:22:32,  8.53s/it, gpt_loss=0.275, loss_mean=0.27][A[A
+
+Train step of epoch 1:  39%|███▊      | 2479/6434 [5:49:12<9:22:32,  8.53s/it, gpt_loss=0.262, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2480/6434 [5:49:12<9:12:20,  8.38s/it, gpt_loss=0.262, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2480/6434 [5:49:20<9:12:20,  8.38s/it, gpt_loss=0.271, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2481/6434 [5:49:20<9:02:42,  8.24s/it, gpt_loss=0.271, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▊      | 2481/6434 [5:49:28<9:02:42,  8.24s/it, gpt_loss=0.346, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▊      | 2482/6434 [5:49:28<9:06:13,  8.29s/it, gpt_loss=0.346, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▊      | 2482/6434 [5:49:36<9:06:13,  8.29s/it, gpt_loss=0.238, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▊      | 2483/6434 [5:49:36<8:59:09,  8.19s/it, gpt_loss=0.238, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▊      | 2483/6434 [5:49:45<8:59:09,  8.19s/it, gpt_loss=0.178, loss_mean=0.263][A[A
+
+Train step of epoch 1:  39%|███▊      | 2484/6434 [5:49:45<9:14:35,  8.42s/it, gpt_loss=0.178, loss_mean=0.263][A[A
+
+Train step of epoch 1:  39%|███▊      | 2484/6434 [5:49:54<9:14:35,  8.42s/it, gpt_loss=0.247, loss_mean=0.262][A[A
+
+Train step of epoch 1:  39%|███▊      | 2485/6434 [5:49:54<9:22:12,  8.54s/it, gpt_loss=0.247, loss_mean=0.262][A[A
+[LID Router Debug] Step: 8920
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [9, 9, 7, 2, 3, 2, 6, 5, 9, 9]
+Active Experts in Batch: {2, 3, 5, 6, 7, 9}
+
+
+Train step of epoch 1:  39%|███▊      | 2485/6434 [5:50:02<9:22:12,  8.54s/it, gpt_loss=0.319, loss_mean=0.267][A[A
+
+Train step of epoch 1:  39%|███▊      | 2486/6434 [5:50:02<9:17:40,  8.48s/it, gpt_loss=0.319, loss_mean=0.267][A[A
+
+Train step of epoch 1:  39%|███▊      | 2486/6434 [5:50:12<9:17:40,  8.48s/it, gpt_loss=0.275, loss_mean=0.268][A[A
+
+Train step of epoch 1:  39%|███▊      | 2487/6434 [5:50:12<9:42:33,  8.86s/it, gpt_loss=0.275, loss_mean=0.268][A[A
+
+Train step of epoch 1:  39%|███▊      | 2487/6434 [5:50:21<9:42:33,  8.86s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2488/6434 [5:50:21<9:36:26,  8.76s/it, gpt_loss=0.304, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2488/6434 [5:50:29<9:36:26,  8.76s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  39%|███▊      | 2489/6434 [5:50:29<9:31:59,  8.70s/it, gpt_loss=0.236, loss_mean=0.268][A[A
+
+Train step of epoch 1:  39%|███▊      | 2489/6434 [5:50:36<9:31:59,  8.70s/it, gpt_loss=0.323, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▊      | 2490/6434 [5:50:36<8:55:05,  8.14s/it, gpt_loss=0.323, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▊      | 2490/6434 [5:50:44<8:55:05,  8.14s/it, gpt_loss=0.301, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▊      | 2491/6434 [5:50:44<8:45:41,  8.00s/it, gpt_loss=0.301, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▊      | 2491/6434 [5:50:53<8:45:41,  8.00s/it, gpt_loss=0.236, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2492/6434 [5:50:53<9:06:47,  8.32s/it, gpt_loss=0.236, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2492/6434 [5:51:02<9:06:47,  8.32s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2493/6434 [5:51:02<9:21:26,  8.55s/it, gpt_loss=0.272, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▊      | 2493/6434 [5:51:10<9:21:26,  8.55s/it, gpt_loss=0.31, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2494/6434 [5:51:10<9:17:32,  8.49s/it, gpt_loss=0.31, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2494/6434 [5:51:18<9:17:32,  8.49s/it, gpt_loss=0.256, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2495/6434 [5:51:18<9:00:57,  8.24s/it, gpt_loss=0.256, loss_mean=0.274][A[A
+[LID Router Debug] Step: 8930
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [0, 5, 1, 9, 4, 2, 0, 5, 0, 6]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  39%|███▉      | 2495/6434 [5:51:26<9:00:57,  8.24s/it, gpt_loss=0.238, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2496/6434 [5:51:26<9:05:06,  8.31s/it, gpt_loss=0.238, loss_mean=0.27][A[A
+
+Train step of epoch 1:  39%|███▉      | 2496/6434 [5:51:35<9:05:06,  8.31s/it, gpt_loss=0.321, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2497/6434 [5:51:35<9:13:04,  8.43s/it, gpt_loss=0.321, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2497/6434 [5:51:43<9:13:04,  8.43s/it, gpt_loss=0.264, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2498/6434 [5:51:43<8:58:11,  8.20s/it, gpt_loss=0.264, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2498/6434 [5:51:51<8:58:11,  8.20s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2499/6434 [5:51:51<9:10:09,  8.39s/it, gpt_loss=0.294, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2499/6434 [5:52:00<9:10:09,  8.39s/it, gpt_loss=0.262, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2500/6434 [5:52:00<9:10:31,  8.40s/it, gpt_loss=0.262, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2500/6434 [5:52:07<9:10:31,  8.40s/it, gpt_loss=0.272, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2501/6434 [5:52:07<8:49:42,  8.08s/it, gpt_loss=0.272, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2501/6434 [5:52:17<8:49:42,  8.08s/it, gpt_loss=0.269, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2502/6434 [5:52:17<9:19:02,  8.53s/it, gpt_loss=0.269, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2502/6434 [5:52:25<9:19:02,  8.53s/it, gpt_loss=0.237, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2503/6434 [5:52:25<9:15:12,  8.47s/it, gpt_loss=0.237, loss_mean=0.27][A[A
+
+Train step of epoch 1:  39%|███▉      | 2503/6434 [5:52:34<9:15:12,  8.47s/it, gpt_loss=0.301, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2504/6434 [5:52:34<9:19:37,  8.54s/it, gpt_loss=0.301, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2504/6434 [5:52:41<9:19:37,  8.54s/it, gpt_loss=0.304, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2505/6434 [5:52:41<8:57:38,  8.21s/it, gpt_loss=0.304, loss_mean=0.276][A[A
+[LID Router Debug] Step: 8940
+Batch Size: 10
+Audio Batch Size: 114
+LID Assignments: [4, 6, 9, 2, 6, 3, 3, 2, 5, 3]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  39%|███▉      | 2505/6434 [5:52:49<8:57:38,  8.21s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2506/6434 [5:52:49<8:44:12,  8.01s/it, gpt_loss=0.304, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2506/6434 [5:52:57<8:44:12,  8.01s/it, gpt_loss=0.286, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2507/6434 [5:52:57<8:56:15,  8.19s/it, gpt_loss=0.286, loss_mean=0.28][A[A
+
+Train step of epoch 1:  39%|███▉      | 2507/6434 [5:53:06<8:56:15,  8.19s/it, gpt_loss=0.245, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2508/6434 [5:53:06<9:02:46,  8.29s/it, gpt_loss=0.245, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2508/6434 [5:53:15<9:02:46,  8.29s/it, gpt_loss=0.297, loss_mean=0.278][A[A
+
+Train step of epoch 1:  39%|███▉      | 2509/6434 [5:53:15<9:11:34,  8.43s/it, gpt_loss=0.297, loss_mean=0.278][A[A
+
+Train step of epoch 1:  39%|███▉      | 2509/6434 [5:53:24<9:11:34,  8.43s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:  39%|███▉      | 2510/6434 [5:53:24<9:21:24,  8.58s/it, gpt_loss=0.272, loss_mean=0.278][A[A
+
+Train step of epoch 1:  39%|███▉      | 2510/6434 [5:53:32<9:21:24,  8.58s/it, gpt_loss=0.263, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2511/6434 [5:53:32<9:13:50,  8.47s/it, gpt_loss=0.263, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2511/6434 [5:53:41<9:13:50,  8.47s/it, gpt_loss=0.199, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▉      | 2512/6434 [5:53:41<9:23:45,  8.62s/it, gpt_loss=0.199, loss_mean=0.269][A[A
+
+Train step of epoch 1:  39%|███▉      | 2512/6434 [5:53:49<9:23:45,  8.62s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  39%|███▉      | 2513/6434 [5:53:49<9:11:28,  8.44s/it, gpt_loss=0.227, loss_mean=0.264][A[A
+
+Train step of epoch 1:  39%|███▉      | 2513/6434 [5:53:59<9:11:28,  8.44s/it, gpt_loss=0.22, loss_mean=0.26]  [A[A
+
+Train step of epoch 1:  39%|███▉      | 2514/6434 [5:53:59<9:46:18,  8.97s/it, gpt_loss=0.22, loss_mean=0.26][A[A
+
+Train step of epoch 1:  39%|███▉      | 2514/6434 [5:54:09<9:46:18,  8.97s/it, gpt_loss=0.209, loss_mean=0.255][A[A
+
+Train step of epoch 1:  39%|███▉      | 2515/6434 [5:54:09<9:59:02,  9.17s/it, gpt_loss=0.209, loss_mean=0.255][A[A
+[LID Router Debug] Step: 8950
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [2, 2, 5, 0, 1, 9, 3, 9, 7, 1]
+Active Experts in Batch: {0, 1, 2, 3, 5, 7, 9}
+
+
+Train step of epoch 1:  39%|███▉      | 2515/6434 [5:54:17<9:59:02,  9.17s/it, gpt_loss=0.346, loss_mean=0.264][A[A
+
+Train step of epoch 1:  39%|███▉      | 2516/6434 [5:54:17<9:42:21,  8.92s/it, gpt_loss=0.346, loss_mean=0.264][A[A
+
+Train step of epoch 1:  39%|███▉      | 2516/6434 [5:54:25<9:42:21,  8.92s/it, gpt_loss=0.326, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2517/6434 [5:54:25<9:21:26,  8.60s/it, gpt_loss=0.326, loss_mean=0.27][A[A
+
+Train step of epoch 1:  39%|███▉      | 2517/6434 [5:54:35<9:21:26,  8.60s/it, gpt_loss=0.308, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2518/6434 [5:54:35<9:49:55,  9.04s/it, gpt_loss=0.308, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2518/6434 [5:54:44<9:49:55,  9.04s/it, gpt_loss=0.305, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▉      | 2519/6434 [5:54:44<9:41:40,  8.91s/it, gpt_loss=0.305, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▉      | 2519/6434 [5:54:51<9:41:40,  8.91s/it, gpt_loss=0.27, loss_mean=0.276] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2520/6434 [5:54:51<9:13:39,  8.49s/it, gpt_loss=0.27, loss_mean=0.276][A[A
+
+Train step of epoch 1:  39%|███▉      | 2520/6434 [5:55:00<9:13:39,  8.49s/it, gpt_loss=0.228, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▉      | 2521/6434 [5:55:00<9:17:18,  8.55s/it, gpt_loss=0.228, loss_mean=0.272][A[A
+
+Train step of epoch 1:  39%|███▉      | 2521/6434 [5:55:08<9:17:18,  8.55s/it, gpt_loss=0.293, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2522/6434 [5:55:08<9:02:24,  8.32s/it, gpt_loss=0.293, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2522/6434 [5:55:15<9:02:24,  8.32s/it, gpt_loss=0.25, loss_mean=0.271] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2523/6434 [5:55:15<8:49:29,  8.12s/it, gpt_loss=0.25, loss_mean=0.271][A[A
+
+Train step of epoch 1:  39%|███▉      | 2523/6434 [5:55:24<8:49:29,  8.12s/it, gpt_loss=0.299, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2524/6434 [5:55:24<8:54:29,  8.20s/it, gpt_loss=0.299, loss_mean=0.274][A[A
+
+Train step of epoch 1:  39%|███▉      | 2524/6434 [5:55:31<8:54:29,  8.20s/it, gpt_loss=0.373, loss_mean=0.284][A[A
+
+Train step of epoch 1:  39%|███▉      | 2525/6434 [5:55:31<8:40:54,  8.00s/it, gpt_loss=0.373, loss_mean=0.284][A[A
+[LID Router Debug] Step: 8960
+Batch Size: 10
+Audio Batch Size: 93
+LID Assignments: [2, 9, 2, 2, 1, 2, 0, 6, 4, 2]
+Active Experts in Batch: {0, 1, 2, 4, 6, 9}
+
+
+Train step of epoch 1:  39%|███▉      | 2525/6434 [5:55:39<8:40:54,  8.00s/it, gpt_loss=0.345, loss_mean=0.29] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2526/6434 [5:55:39<8:31:51,  7.86s/it, gpt_loss=0.345, loss_mean=0.29][A[A
+
+Train step of epoch 1:  39%|███▉      | 2526/6434 [5:55:47<8:31:51,  7.86s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  39%|███▉      | 2527/6434 [5:55:47<8:39:43,  7.98s/it, gpt_loss=0.236, loss_mean=0.285][A[A
+
+Train step of epoch 1:  39%|███▉      | 2527/6434 [5:55:55<8:39:43,  7.98s/it, gpt_loss=0.247, loss_mean=0.281][A[A
+
+Train step of epoch 1:  39%|███▉      | 2528/6434 [5:55:55<8:48:34,  8.12s/it, gpt_loss=0.247, loss_mean=0.281][A[A
+
+Train step of epoch 1:  39%|███▉      | 2528/6434 [5:56:04<8:48:34,  8.12s/it, gpt_loss=0.285, loss_mean=0.281][A[A
+
+Train step of epoch 1:  39%|███▉      | 2529/6434 [5:56:04<8:52:13,  8.18s/it, gpt_loss=0.285, loss_mean=0.281][A[A
+
+Train step of epoch 1:  39%|███▉      | 2529/6434 [5:56:13<8:52:13,  8.18s/it, gpt_loss=0.288, loss_mean=0.282][A[A
+
+Train step of epoch 1:  39%|███▉      | 2530/6434 [5:56:13<9:15:45,  8.54s/it, gpt_loss=0.288, loss_mean=0.282][A[A
+
+Train step of epoch 1:  39%|███▉      | 2530/6434 [5:56:21<9:15:45,  8.54s/it, gpt_loss=0.197, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2531/6434 [5:56:21<9:01:35,  8.33s/it, gpt_loss=0.197, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2531/6434 [5:56:30<9:01:35,  8.33s/it, gpt_loss=0.238, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2532/6434 [5:56:30<9:08:50,  8.44s/it, gpt_loss=0.238, loss_mean=0.27][A[A
+
+Train step of epoch 1:  39%|███▉      | 2532/6434 [5:56:38<9:08:50,  8.44s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2533/6434 [5:56:38<9:14:29,  8.53s/it, gpt_loss=0.299, loss_mean=0.273][A[A
+
+Train step of epoch 1:  39%|███▉      | 2533/6434 [5:56:47<9:14:29,  8.53s/it, gpt_loss=0.371, loss_mean=0.283][A[A
+
+Train step of epoch 1:  39%|███▉      | 2534/6434 [5:56:47<9:14:12,  8.53s/it, gpt_loss=0.371, loss_mean=0.283][A[A
+
+Train step of epoch 1:  39%|███▉      | 2534/6434 [5:56:55<9:14:12,  8.53s/it, gpt_loss=0.235, loss_mean=0.278][A[A
+
+Train step of epoch 1:  39%|███▉      | 2535/6434 [5:56:55<9:01:40,  8.34s/it, gpt_loss=0.235, loss_mean=0.278][A[A
+[LID Router Debug] Step: 8970
+Batch Size: 10
+Audio Batch Size: 121
+LID Assignments: [1, 1, 4, 3, 4, 0, 2, 3, 9, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  39%|███▉      | 2535/6434 [5:57:03<9:01:40,  8.34s/it, gpt_loss=0.292, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2536/6434 [5:57:03<8:56:41,  8.26s/it, gpt_loss=0.292, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2536/6434 [5:57:12<8:56:41,  8.26s/it, gpt_loss=0.291, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  39%|███▉      | 2537/6434 [5:57:12<9:16:57,  8.58s/it, gpt_loss=0.291, loss_mean=0.28][A[A
+
+Train step of epoch 1:  39%|███▉      | 2537/6434 [5:57:19<9:16:57,  8.58s/it, gpt_loss=0.227, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2538/6434 [5:57:19<8:48:27,  8.14s/it, gpt_loss=0.227, loss_mean=0.275][A[A
+
+Train step of epoch 1:  39%|███▉      | 2538/6434 [5:57:27<8:48:27,  8.14s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2539/6434 [5:57:27<8:48:38,  8.14s/it, gpt_loss=0.315, loss_mean=0.279][A[A
+
+Train step of epoch 1:  39%|███▉      | 2539/6434 [5:57:36<8:48:38,  8.14s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▉      | 2540/6434 [5:57:36<8:59:22,  8.31s/it, gpt_loss=0.254, loss_mean=0.277][A[A
+
+Train step of epoch 1:  39%|███▉      | 2540/6434 [5:57:44<8:59:22,  8.31s/it, gpt_loss=0.162, loss_mean=0.265][A[A
+
+Train step of epoch 1:  39%|███▉      | 2541/6434 [5:57:44<8:47:02,  8.12s/it, gpt_loss=0.162, loss_mean=0.265][A[A
+
+Train step of epoch 1:  39%|███▉      | 2541/6434 [5:57:53<8:47:02,  8.12s/it, gpt_loss=0.281, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|███▉      | 2542/6434 [5:57:53<9:07:02,  8.43s/it, gpt_loss=0.281, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|███▉      | 2542/6434 [5:58:02<9:07:02,  8.43s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  40%|███▉      | 2543/6434 [5:58:02<9:22:59,  8.68s/it, gpt_loss=0.247, loss_mean=0.265][A[A
+
+Train step of epoch 1:  40%|███▉      | 2543/6434 [5:58:12<9:22:59,  8.68s/it, gpt_loss=0.306, loss_mean=0.269][A[A
+
+Train step of epoch 1:  40%|███▉      | 2544/6434 [5:58:12<9:37:44,  8.91s/it, gpt_loss=0.306, loss_mean=0.269][A[A
+
+Train step of epoch 1:  40%|███▉      | 2544/6434 [5:58:20<9:37:44,  8.91s/it, gpt_loss=0.238, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2545/6434 [5:58:20<9:20:52,  8.65s/it, gpt_loss=0.238, loss_mean=0.266][A[A
+[LID Router Debug] Step: 8980
+Batch Size: 10
+Audio Batch Size: 83
+LID Assignments: [1, 1, 0, 1, 6, 2, 10, 0, 5, 1]
+Active Experts in Batch: {0, 1, 2, 5, 6, 10}
+
+
+Train step of epoch 1:  40%|███▉      | 2545/6434 [5:58:29<9:20:52,  8.65s/it, gpt_loss=0.303, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  40%|███▉      | 2546/6434 [5:58:29<9:36:48,  8.90s/it, gpt_loss=0.303, loss_mean=0.27][A[A
+
+Train step of epoch 1:  40%|███▉      | 2546/6434 [5:58:38<9:36:48,  8.90s/it, gpt_loss=0.217, loss_mean=0.264][A[A
+
+Train step of epoch 1:  40%|███▉      | 2547/6434 [5:58:38<9:27:24,  8.76s/it, gpt_loss=0.217, loss_mean=0.264][A[A
+
+Train step of epoch 1:  40%|███▉      | 2547/6434 [5:58:46<9:27:24,  8.76s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  40%|███▉      | 2548/6434 [5:58:46<9:29:16,  8.79s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  40%|███▉      | 2548/6434 [5:58:55<9:29:16,  8.79s/it, gpt_loss=0.337, loss_mean=0.272][A[A
+
+Train step of epoch 1:  40%|███▉      | 2549/6434 [5:58:55<9:22:00,  8.68s/it, gpt_loss=0.337, loss_mean=0.272][A[A
+
+Train step of epoch 1:  40%|███▉      | 2549/6434 [5:59:03<9:22:00,  8.68s/it, gpt_loss=0.261, loss_mean=0.271][A[A
+
+Train step of epoch 1:  40%|███▉      | 2550/6434 [5:59:03<9:14:24,  8.56s/it, gpt_loss=0.261, loss_mean=0.271][A[A
+
+Train step of epoch 1:  40%|███▉      | 2550/6434 [5:59:12<9:14:24,  8.56s/it, gpt_loss=0.26, loss_mean=0.27]  [A[A
+
+Train step of epoch 1:  40%|███▉      | 2551/6434 [5:59:12<9:18:32,  8.63s/it, gpt_loss=0.26, loss_mean=0.27][A[A
+
+Train step of epoch 1:  40%|███▉      | 2551/6434 [5:59:19<9:18:32,  8.63s/it, gpt_loss=0.234, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2552/6434 [5:59:19<8:54:02,  8.25s/it, gpt_loss=0.234, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2552/6434 [5:59:29<8:54:02,  8.25s/it, gpt_loss=0.214, loss_mean=0.261][A[A
+
+Train step of epoch 1:  40%|███▉      | 2553/6434 [5:59:29<9:16:47,  8.61s/it, gpt_loss=0.214, loss_mean=0.261][A[A
+
+Train step of epoch 1:  40%|███▉      | 2553/6434 [5:59:37<9:16:47,  8.61s/it, gpt_loss=0.244, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|███▉      | 2554/6434 [5:59:37<9:18:04,  8.63s/it, gpt_loss=0.244, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|███▉      | 2554/6434 [5:59:46<9:18:04,  8.63s/it, gpt_loss=0.223, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|███▉      | 2555/6434 [5:59:46<9:18:35,  8.64s/it, gpt_loss=0.223, loss_mean=0.256][A[A
+[LID Router Debug] Step: 8990
+Batch Size: 10
+Audio Batch Size: 120
+LID Assignments: [9, 5, 0, 4, 0, 9, 2, 3, 3, 5]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  40%|███▉      | 2555/6434 [5:59:54<9:18:35,  8.64s/it, gpt_loss=0.215, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|███▉      | 2556/6434 [5:59:54<9:03:17,  8.41s/it, gpt_loss=0.215, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|███▉      | 2556/6434 [6:00:02<9:03:17,  8.41s/it, gpt_loss=0.265, loss_mean=0.253][A[A
+
+Train step of epoch 1:  40%|███▉      | 2557/6434 [6:00:02<8:53:14,  8.25s/it, gpt_loss=0.265, loss_mean=0.253][A[A
+
+Train step of epoch 1:  40%|███▉      | 2557/6434 [6:00:11<8:53:14,  8.25s/it, gpt_loss=0.24, loss_mean=0.252] [A[A
+
+Train step of epoch 1:  40%|███▉      | 2558/6434 [6:00:11<9:10:12,  8.52s/it, gpt_loss=0.24, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|███▉      | 2558/6434 [6:00:20<9:10:12,  8.52s/it, gpt_loss=0.216, loss_mean=0.248][A[A
+
+Train step of epoch 1:  40%|███▉      | 2559/6434 [6:00:20<9:13:06,  8.56s/it, gpt_loss=0.216, loss_mean=0.248][A[A
+
+Train step of epoch 1:  40%|███▉      | 2559/6434 [6:00:29<9:13:06,  8.56s/it, gpt_loss=0.214, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|███▉      | 2560/6434 [6:00:29<9:19:18,  8.66s/it, gpt_loss=0.214, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|███▉      | 2560/6434 [6:00:37<9:19:18,  8.66s/it, gpt_loss=0.271, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|███▉      | 2561/6434 [6:00:37<9:09:51,  8.52s/it, gpt_loss=0.271, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|███▉      | 2561/6434 [6:00:45<9:09:51,  8.52s/it, gpt_loss=0.236, loss_mean=0.246][A[A
+
+Train step of epoch 1:  40%|███▉      | 2562/6434 [6:00:45<9:11:03,  8.54s/it, gpt_loss=0.236, loss_mean=0.246][A[A
+
+Train step of epoch 1:  40%|███▉      | 2562/6434 [6:00:54<9:11:03,  8.54s/it, gpt_loss=0.254, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|███▉      | 2563/6434 [6:00:54<9:06:48,  8.48s/it, gpt_loss=0.254, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|███▉      | 2563/6434 [6:01:01<9:06:48,  8.48s/it, gpt_loss=0.229, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|███▉      | 2564/6434 [6:01:01<8:53:17,  8.27s/it, gpt_loss=0.229, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|███▉      | 2564/6434 [6:01:10<8:53:17,  8.27s/it, gpt_loss=0.315, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|███▉      | 2565/6434 [6:01:10<8:55:13,  8.30s/it, gpt_loss=0.315, loss_mean=0.252][A[A
+[LID Router Debug] Step: 9000
+Batch Size: 10
+Audio Batch Size: 79
+LID Assignments: [5, 6, 0, 9, 4, 0, 0, 0, 6, 5]
+Active Experts in Batch: {0, 4, 5, 6, 9}
+[2026-02-07 13:03:06,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=0, lr=[1.122670732694342e-05, 1.122670732694342e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 13:03:06,083] [INFO] [timer.py:260:stop] epoch=0/micro_step=9000/global_step=4500, RunningAvgSamplesPerSec=4.746094836741675, CurrSamplesPerSec=4.571334965345479, MemAllocated=12.54GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  40%|███▉      | 2565/6434 [6:01:19<8:55:13,  8.30s/it, gpt_loss=0.29, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  40%|███▉      | 2566/6434 [6:01:19<9:11:37,  8.56s/it, gpt_loss=0.29, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|███▉      | 2566/6434 [6:01:27<9:11:37,  8.56s/it, gpt_loss=0.239, loss_mean=0.254][A[A
+
+Train step of epoch 1:  40%|███▉      | 2567/6434 [6:01:27<9:08:39,  8.51s/it, gpt_loss=0.239, loss_mean=0.254][A[A
+
+Train step of epoch 1:  40%|███▉      | 2567/6434 [6:01:36<9:08:39,  8.51s/it, gpt_loss=0.37, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  40%|███▉      | 2568/6434 [6:01:36<9:13:04,  8.58s/it, gpt_loss=0.37, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2568/6434 [6:01:44<9:13:04,  8.58s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2569/6434 [6:01:44<8:49:27,  8.22s/it, gpt_loss=0.265, loss_mean=0.266][A[A
+
+Train step of epoch 1:  40%|███▉      | 2569/6434 [6:01:52<8:49:27,  8.22s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:  40%|███▉      | 2570/6434 [6:01:52<8:55:23,  8.31s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:  40%|███▉      | 2570/6434 [6:02:00<8:55:23,  8.31s/it, gpt_loss=0.246, loss_mean=0.262][A[A
+
+Train step of epoch 1:  40%|███▉      | 2571/6434 [6:02:00<8:49:06,  8.22s/it, gpt_loss=0.246, loss_mean=0.262][A[A
+
+Train step of epoch 1:  40%|███▉      | 2571/6434 [6:02:10<8:49:06,  8.22s/it, gpt_loss=0.197, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|███▉      | 2572/6434 [6:02:10<9:18:14,  8.67s/it, gpt_loss=0.197, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|███▉      | 2572/6434 [6:02:18<9:18:14,  8.67s/it, gpt_loss=0.269, loss_mean=0.257][A[A
+
+Train step of epoch 1:  40%|███▉      | 2573/6434 [6:02:18<9:04:39,  8.46s/it, gpt_loss=0.269, loss_mean=0.257][A[A
+
+Train step of epoch 1:  40%|███▉      | 2573/6434 [6:02:26<9:04:39,  8.46s/it, gpt_loss=0.278, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|████      | 2574/6434 [6:02:26<8:57:48,  8.36s/it, gpt_loss=0.278, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|████      | 2574/6434 [6:02:34<8:57:48,  8.36s/it, gpt_loss=0.172, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  40%|████      | 2575/6434 [6:02:34<8:54:53,  8.32s/it, gpt_loss=0.172, loss_mean=0.25][A[A
+[LID Router Debug] Step: 9010
+Batch Size: 10
+Audio Batch Size: 125
+LID Assignments: [4, 5, 9, 0, 2, 3, 5, 2, 3, 2]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  40%|████      | 2575/6434 [6:02:42<8:54:53,  8.32s/it, gpt_loss=0.24, loss_mean=0.249][A[A
+
+Train step of epoch 1:  40%|████      | 2576/6434 [6:02:42<8:53:11,  8.29s/it, gpt_loss=0.24, loss_mean=0.249][A[A
+
+Train step of epoch 1:  40%|████      | 2576/6434 [6:02:50<8:53:11,  8.29s/it, gpt_loss=0.226, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2577/6434 [6:02:50<8:49:15,  8.23s/it, gpt_loss=0.226, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2577/6434 [6:02:59<8:49:15,  8.23s/it, gpt_loss=0.329, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2578/6434 [6:02:59<8:51:20,  8.27s/it, gpt_loss=0.329, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2578/6434 [6:03:08<8:51:20,  8.27s/it, gpt_loss=0.221, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|████      | 2579/6434 [6:03:08<9:02:05,  8.44s/it, gpt_loss=0.221, loss_mean=0.252][A[A
+
+Train step of epoch 1:  40%|████      | 2579/6434 [6:03:16<9:02:05,  8.44s/it, gpt_loss=0.246, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2580/6434 [6:03:16<9:04:24,  8.48s/it, gpt_loss=0.246, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2580/6434 [6:03:25<9:04:24,  8.48s/it, gpt_loss=0.224, loss_mean=0.249][A[A
+
+Train step of epoch 1:  40%|████      | 2581/6434 [6:03:25<9:07:57,  8.53s/it, gpt_loss=0.224, loss_mean=0.249][A[A
+
+Train step of epoch 1:  40%|████      | 2581/6434 [6:03:33<9:07:57,  8.53s/it, gpt_loss=0.213, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|████      | 2582/6434 [6:03:33<9:03:00,  8.46s/it, gpt_loss=0.213, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|████      | 2582/6434 [6:03:42<9:03:00,  8.46s/it, gpt_loss=0.268, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2583/6434 [6:03:42<9:02:27,  8.45s/it, gpt_loss=0.268, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2583/6434 [6:03:50<9:02:27,  8.45s/it, gpt_loss=0.25, loss_mean=0.248] [A[A
+
+Train step of epoch 1:  40%|████      | 2584/6434 [6:03:50<9:05:05,  8.49s/it, gpt_loss=0.25, loss_mean=0.248][A[A
+
+Train step of epoch 1:  40%|████      | 2584/6434 [6:03:59<9:05:05,  8.49s/it, gpt_loss=0.327, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|████      | 2585/6434 [6:03:59<9:09:29,  8.57s/it, gpt_loss=0.327, loss_mean=0.256][A[A
+[LID Router Debug] Step: 9020
+Batch Size: 10
+Audio Batch Size: 116
+LID Assignments: [1, 2, 3, 4, 9, 2, 5, 2, 9, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  40%|████      | 2585/6434 [6:04:07<9:09:29,  8.57s/it, gpt_loss=0.281, loss_mean=0.258][A[A
+
+Train step of epoch 1:  40%|████      | 2586/6434 [6:04:07<8:56:49,  8.37s/it, gpt_loss=0.281, loss_mean=0.258][A[A
+
+Train step of epoch 1:  40%|████      | 2586/6434 [6:04:15<8:56:49,  8.37s/it, gpt_loss=0.265, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|████      | 2587/6434 [6:04:15<8:57:39,  8.39s/it, gpt_loss=0.265, loss_mean=0.259][A[A
+
+Train step of epoch 1:  40%|████      | 2587/6434 [6:04:23<8:57:39,  8.39s/it, gpt_loss=0.238, loss_mean=0.257][A[A
+
+Train step of epoch 1:  40%|████      | 2588/6434 [6:04:23<8:42:57,  8.16s/it, gpt_loss=0.238, loss_mean=0.257][A[A
+
+Train step of epoch 1:  40%|████      | 2588/6434 [6:04:31<8:42:57,  8.16s/it, gpt_loss=0.269, loss_mean=0.258][A[A
+
+Train step of epoch 1:  40%|████      | 2589/6434 [6:04:31<8:36:34,  8.06s/it, gpt_loss=0.269, loss_mean=0.258][A[A
+
+Train step of epoch 1:  40%|████      | 2589/6434 [6:04:39<8:36:34,  8.06s/it, gpt_loss=0.231, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2590/6434 [6:04:39<8:41:32,  8.14s/it, gpt_loss=0.231, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2590/6434 [6:04:47<8:41:32,  8.14s/it, gpt_loss=0.23, loss_mean=0.253] [A[A
+
+Train step of epoch 1:  40%|████      | 2591/6434 [6:04:47<8:45:36,  8.21s/it, gpt_loss=0.23, loss_mean=0.253][A[A
+
+Train step of epoch 1:  40%|████      | 2591/6434 [6:04:56<8:45:36,  8.21s/it, gpt_loss=0.273, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2592/6434 [6:04:56<8:50:33,  8.29s/it, gpt_loss=0.273, loss_mean=0.255][A[A
+
+Train step of epoch 1:  40%|████      | 2592/6434 [6:05:05<8:50:33,  8.29s/it, gpt_loss=0.203, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  40%|████      | 2593/6434 [6:05:05<9:01:22,  8.46s/it, gpt_loss=0.203, loss_mean=0.25][A[A
+
+Train step of epoch 1:  40%|████      | 2593/6434 [6:05:13<9:01:22,  8.46s/it, gpt_loss=0.31, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|████      | 2594/6434 [6:05:13<8:54:18,  8.35s/it, gpt_loss=0.31, loss_mean=0.256][A[A
+
+Train step of epoch 1:  40%|████      | 2594/6434 [6:05:21<8:54:18,  8.35s/it, gpt_loss=0.24, loss_mean=0.254][A[A
+
+Train step of epoch 1:  40%|████      | 2595/6434 [6:05:21<8:57:36,  8.40s/it, gpt_loss=0.24, loss_mean=0.254][A[A
+[LID Router Debug] Step: 9030
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [5, 4, 2, 4, 1, 4, 3, 5, 9, 9]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  40%|████      | 2595/6434 [6:05:30<8:57:36,  8.40s/it, gpt_loss=0.227, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2596/6434 [6:05:30<8:55:56,  8.38s/it, gpt_loss=0.227, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2596/6434 [6:05:38<8:55:56,  8.38s/it, gpt_loss=0.251, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2597/6434 [6:05:38<8:51:34,  8.31s/it, gpt_loss=0.251, loss_mean=0.251][A[A
+
+Train step of epoch 1:  40%|████      | 2597/6434 [6:05:46<8:51:34,  8.31s/it, gpt_loss=0.207, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2598/6434 [6:05:46<8:50:24,  8.30s/it, gpt_loss=0.207, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2598/6434 [6:05:54<8:50:24,  8.30s/it, gpt_loss=0.243, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2599/6434 [6:05:54<8:43:47,  8.19s/it, gpt_loss=0.243, loss_mean=0.247][A[A
+
+Train step of epoch 1:  40%|████      | 2599/6434 [6:06:02<8:43:47,  8.19s/it, gpt_loss=0.231, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|████      | 2600/6434 [6:06:02<8:42:01,  8.17s/it, gpt_loss=0.231, loss_mean=0.245][A[A
+
+Train step of epoch 1:  40%|████      | 2600/6434 [6:06:10<8:42:01,  8.17s/it, gpt_loss=0.417, loss_mean=0.262][A[A
+
+Train step of epoch 1:  40%|████      | 2601/6434 [6:06:10<8:44:09,  8.20s/it, gpt_loss=0.417, loss_mean=0.262][A[A
+
+Train step of epoch 1:  40%|████      | 2601/6434 [6:06:19<8:44:09,  8.20s/it, gpt_loss=0.306, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|████      | 2602/6434 [6:06:19<8:52:44,  8.34s/it, gpt_loss=0.306, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|████      | 2602/6434 [6:06:27<8:52:44,  8.34s/it, gpt_loss=0.268, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|████      | 2603/6434 [6:06:27<8:52:52,  8.35s/it, gpt_loss=0.268, loss_mean=0.267][A[A
+
+Train step of epoch 1:  40%|████      | 2603/6434 [6:06:35<8:52:52,  8.35s/it, gpt_loss=0.231, loss_mean=0.263][A[A
+
+Train step of epoch 1:  40%|████      | 2604/6434 [6:06:35<8:43:04,  8.19s/it, gpt_loss=0.231, loss_mean=0.263][A[A
+
+Train step of epoch 1:  40%|████      | 2604/6434 [6:06:44<8:43:04,  8.19s/it, gpt_loss=0.367, loss_mean=0.274][A[A
+
+Train step of epoch 1:  40%|████      | 2605/6434 [6:06:44<8:49:04,  8.29s/it, gpt_loss=0.367, loss_mean=0.274][A[A
+[LID Router Debug] Step: 9040
+Batch Size: 10
+Audio Batch Size: 122
+LID Assignments: [0, 0, 2, 4, 0, 8, 2, 6, 9, 0]
+Active Experts in Batch: {0, 2, 4, 6, 8, 9}
+
+
+Train step of epoch 1:  40%|████      | 2605/6434 [6:06:52<8:49:04,  8.29s/it, gpt_loss=0.302, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████      | 2606/6434 [6:06:52<8:47:48,  8.27s/it, gpt_loss=0.302, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████      | 2606/6434 [6:07:01<8:47:48,  8.27s/it, gpt_loss=0.25, loss_mean=0.274] [A[A
+
+Train step of epoch 1:  41%|████      | 2607/6434 [6:07:01<9:02:13,  8.50s/it, gpt_loss=0.25, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2607/6434 [6:07:09<9:02:13,  8.50s/it, gpt_loss=0.325, loss_mean=0.279][A[A
+
+Train step of epoch 1:  41%|████      | 2608/6434 [6:07:09<8:56:40,  8.42s/it, gpt_loss=0.325, loss_mean=0.279][A[A
+
+Train step of epoch 1:  41%|████      | 2608/6434 [6:07:17<8:56:40,  8.42s/it, gpt_loss=0.216, loss_mean=0.273][A[A
+
+Train step of epoch 1:  41%|████      | 2609/6434 [6:07:17<8:47:03,  8.27s/it, gpt_loss=0.216, loss_mean=0.273][A[A
+
+Train step of epoch 1:  41%|████      | 2609/6434 [6:07:26<8:47:03,  8.27s/it, gpt_loss=0.247, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  41%|████      | 2610/6434 [6:07:26<8:55:31,  8.40s/it, gpt_loss=0.247, loss_mean=0.27][A[A
+
+Train step of epoch 1:  41%|████      | 2610/6434 [6:07:34<8:55:31,  8.40s/it, gpt_loss=0.293, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2611/6434 [6:07:34<8:54:36,  8.39s/it, gpt_loss=0.293, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2611/6434 [6:07:43<8:54:36,  8.39s/it, gpt_loss=0.285, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2612/6434 [6:07:43<8:53:03,  8.37s/it, gpt_loss=0.285, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2612/6434 [6:07:51<8:53:03,  8.37s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2613/6434 [6:07:51<8:54:43,  8.40s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2613/6434 [6:08:00<8:54:43,  8.40s/it, gpt_loss=0.308, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████      | 2614/6434 [6:08:00<9:08:52,  8.62s/it, gpt_loss=0.308, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████      | 2614/6434 [6:08:08<9:08:52,  8.62s/it, gpt_loss=0.322, loss_mean=0.282][A[A
+
+Train step of epoch 1:  41%|████      | 2615/6434 [6:08:08<8:47:52,  8.29s/it, gpt_loss=0.322, loss_mean=0.282][A[A
+[LID Router Debug] Step: 9050
+Batch Size: 10
+Audio Batch Size: 97
+LID Assignments: [4, 4, 3, 2, 1, 5, 2, 5, 5, 2]
+Active Experts in Batch: {1, 2, 3, 4, 5}
+
+
+Train step of epoch 1:  41%|████      | 2615/6434 [6:08:16<8:47:52,  8.29s/it, gpt_loss=0.266, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  41%|████      | 2616/6434 [6:08:16<8:47:59,  8.30s/it, gpt_loss=0.266, loss_mean=0.28][A[A
+
+Train step of epoch 1:  41%|████      | 2616/6434 [6:08:25<8:47:59,  8.30s/it, gpt_loss=0.244, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████      | 2617/6434 [6:08:25<8:59:37,  8.48s/it, gpt_loss=0.244, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████      | 2617/6434 [6:08:33<8:59:37,  8.48s/it, gpt_loss=0.315, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  41%|████      | 2618/6434 [6:08:33<8:51:43,  8.36s/it, gpt_loss=0.315, loss_mean=0.28][A[A
+
+Train step of epoch 1:  41%|████      | 2618/6434 [6:08:41<8:51:43,  8.36s/it, gpt_loss=0.225, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2619/6434 [6:08:41<8:36:49,  8.13s/it, gpt_loss=0.225, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2619/6434 [6:08:49<8:36:49,  8.13s/it, gpt_loss=0.213, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████      | 2620/6434 [6:08:49<8:48:25,  8.31s/it, gpt_loss=0.213, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████      | 2620/6434 [6:08:58<8:48:25,  8.31s/it, gpt_loss=0.264, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2621/6434 [6:08:58<8:53:00,  8.39s/it, gpt_loss=0.264, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2621/6434 [6:09:06<8:53:00,  8.39s/it, gpt_loss=0.284, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  41%|████      | 2622/6434 [6:09:06<8:48:36,  8.32s/it, gpt_loss=0.284, loss_mean=0.27][A[A
+
+Train step of epoch 1:  41%|████      | 2622/6434 [6:09:14<8:48:36,  8.32s/it, gpt_loss=0.274, loss_mean=0.27][A[A
+
+Train step of epoch 1:  41%|████      | 2623/6434 [6:09:14<8:43:38,  8.24s/it, gpt_loss=0.274, loss_mean=0.27][A[A
+
+Train step of epoch 1:  41%|████      | 2623/6434 [6:09:22<8:43:38,  8.24s/it, gpt_loss=0.246, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2624/6434 [6:09:22<8:41:04,  8.21s/it, gpt_loss=0.246, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2624/6434 [6:09:30<8:41:04,  8.21s/it, gpt_loss=0.356, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████      | 2625/6434 [6:09:30<8:37:48,  8.16s/it, gpt_loss=0.356, loss_mean=0.277][A[A
+[LID Router Debug] Step: 9060
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [0, 1, 6, 0, 5, 5, 2, 1, 3, 4]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  41%|████      | 2625/6434 [6:09:38<8:37:48,  8.16s/it, gpt_loss=0.266, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2626/6434 [6:09:38<8:36:37,  8.14s/it, gpt_loss=0.266, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2626/6434 [6:09:46<8:36:37,  8.14s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2627/6434 [6:09:46<8:26:14,  7.98s/it, gpt_loss=0.262, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2627/6434 [6:09:54<8:26:14,  7.98s/it, gpt_loss=0.244, loss_mean=0.271][A[A
+
+Train step of epoch 1:  41%|████      | 2628/6434 [6:09:54<8:31:01,  8.06s/it, gpt_loss=0.244, loss_mean=0.271][A[A
+
+Train step of epoch 1:  41%|████      | 2628/6434 [6:10:03<8:31:01,  8.06s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2629/6434 [6:10:03<8:43:26,  8.25s/it, gpt_loss=0.276, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2629/6434 [6:10:12<8:43:26,  8.25s/it, gpt_loss=0.25, loss_mean=0.269] [A[A
+
+Train step of epoch 1:  41%|████      | 2630/6434 [6:10:12<8:58:44,  8.50s/it, gpt_loss=0.25, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████      | 2630/6434 [6:10:21<8:58:44,  8.50s/it, gpt_loss=0.229, loss_mean=0.265][A[A
+
+Train step of epoch 1:  41%|████      | 2631/6434 [6:10:21<9:05:48,  8.61s/it, gpt_loss=0.229, loss_mean=0.265][A[A
+
+Train step of epoch 1:  41%|████      | 2631/6434 [6:10:30<9:05:48,  8.61s/it, gpt_loss=0.222, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2632/6434 [6:10:30<9:18:31,  8.81s/it, gpt_loss=0.222, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2632/6434 [6:10:38<9:18:31,  8.81s/it, gpt_loss=0.269, loss_mean=0.262][A[A
+
+Train step of epoch 1:  41%|████      | 2633/6434 [6:10:38<9:06:17,  8.62s/it, gpt_loss=0.269, loss_mean=0.262][A[A
+
+Train step of epoch 1:  41%|████      | 2633/6434 [6:10:47<9:06:17,  8.62s/it, gpt_loss=0.236, loss_mean=0.259][A[A
+
+Train step of epoch 1:  41%|████      | 2634/6434 [6:10:47<9:02:47,  8.57s/it, gpt_loss=0.236, loss_mean=0.259][A[A
+
+Train step of epoch 1:  41%|████      | 2634/6434 [6:10:54<9:02:47,  8.57s/it, gpt_loss=0.281, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2635/6434 [6:10:54<8:40:03,  8.21s/it, gpt_loss=0.281, loss_mean=0.261][A[A
+[LID Router Debug] Step: 9070
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [1, 5, 9, 5, 5, 6, 2, 9, 4, 0]
+Active Experts in Batch: {0, 1, 2, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  41%|████      | 2635/6434 [6:11:02<8:40:03,  8.21s/it, gpt_loss=0.26, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  41%|████      | 2636/6434 [6:11:02<8:36:51,  8.17s/it, gpt_loss=0.26, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2636/6434 [6:11:12<8:36:51,  8.17s/it, gpt_loss=0.255, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2637/6434 [6:11:12<9:05:47,  8.62s/it, gpt_loss=0.255, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████      | 2637/6434 [6:11:20<9:05:47,  8.62s/it, gpt_loss=0.218, loss_mean=0.256][A[A
+
+Train step of epoch 1:  41%|████      | 2638/6434 [6:11:20<8:51:55,  8.41s/it, gpt_loss=0.218, loss_mean=0.256][A[A
+
+Train step of epoch 1:  41%|████      | 2638/6434 [6:11:28<8:51:55,  8.41s/it, gpt_loss=0.276, loss_mean=0.258][A[A
+
+Train step of epoch 1:  41%|████      | 2639/6434 [6:11:28<8:50:30,  8.39s/it, gpt_loss=0.276, loss_mean=0.258][A[A
+
+Train step of epoch 1:  41%|████      | 2639/6434 [6:11:37<8:50:30,  8.39s/it, gpt_loss=0.333, loss_mean=0.266][A[A
+
+Train step of epoch 1:  41%|████      | 2640/6434 [6:11:37<8:52:11,  8.42s/it, gpt_loss=0.333, loss_mean=0.266][A[A
+
+Train step of epoch 1:  41%|████      | 2640/6434 [6:11:45<8:52:11,  8.42s/it, gpt_loss=0.222, loss_mean=0.262][A[A
+
+Train step of epoch 1:  41%|████      | 2641/6434 [6:11:45<8:48:22,  8.36s/it, gpt_loss=0.222, loss_mean=0.262][A[A
+
+Train step of epoch 1:  41%|████      | 2641/6434 [6:11:53<8:48:22,  8.36s/it, gpt_loss=0.242, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  41%|████      | 2642/6434 [6:11:53<8:42:58,  8.27s/it, gpt_loss=0.242, loss_mean=0.26][A[A
+
+Train step of epoch 1:  41%|████      | 2642/6434 [6:12:01<8:42:58,  8.27s/it, gpt_loss=0.3, loss_mean=0.264] [A[A
+
+Train step of epoch 1:  41%|████      | 2643/6434 [6:12:01<8:35:10,  8.15s/it, gpt_loss=0.3, loss_mean=0.264][A[A
+
+Train step of epoch 1:  41%|████      | 2643/6434 [6:12:08<8:35:10,  8.15s/it, gpt_loss=0.347, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2644/6434 [6:12:08<8:23:42,  7.97s/it, gpt_loss=0.347, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2644/6434 [6:12:17<8:23:42,  7.97s/it, gpt_loss=0.249, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  41%|████      | 2645/6434 [6:12:17<8:25:50,  8.01s/it, gpt_loss=0.249, loss_mean=0.27][A[A
+[LID Router Debug] Step: 9080
+Batch Size: 10
+Audio Batch Size: 92
+LID Assignments: [4, 4, 5, 4, 1, 9, 4, 4, 0, 6]
+Active Experts in Batch: {0, 1, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  41%|████      | 2645/6434 [6:12:25<8:25:50,  8.01s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2646/6434 [6:12:25<8:29:23,  8.07s/it, gpt_loss=0.25, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2646/6434 [6:12:33<8:29:23,  8.07s/it, gpt_loss=0.333, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2647/6434 [6:12:33<8:29:20,  8.07s/it, gpt_loss=0.333, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████      | 2647/6434 [6:12:41<8:29:20,  8.07s/it, gpt_loss=0.277, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2648/6434 [6:12:41<8:35:49,  8.17s/it, gpt_loss=0.277, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2648/6434 [6:12:50<8:35:49,  8.17s/it, gpt_loss=0.214, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2649/6434 [6:12:50<8:56:10,  8.50s/it, gpt_loss=0.214, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████      | 2649/6434 [6:12:58<8:56:10,  8.50s/it, gpt_loss=0.334, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2650/6434 [6:12:58<8:30:44,  8.10s/it, gpt_loss=0.334, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████      | 2650/6434 [6:13:05<8:30:44,  8.10s/it, gpt_loss=0.311, loss_mean=0.279][A[A
+
+Train step of epoch 1:  41%|████      | 2651/6434 [6:13:05<8:13:56,  7.83s/it, gpt_loss=0.311, loss_mean=0.279][A[A
+
+Train step of epoch 1:  41%|████      | 2651/6434 [6:13:13<8:13:56,  7.83s/it, gpt_loss=0.216, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2652/6434 [6:13:13<8:28:34,  8.07s/it, gpt_loss=0.216, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2652/6434 [6:13:23<8:28:34,  8.07s/it, gpt_loss=0.265, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2653/6434 [6:13:23<8:52:17,  8.45s/it, gpt_loss=0.265, loss_mean=0.272][A[A
+
+Train step of epoch 1:  41%|████      | 2653/6434 [6:13:30<8:52:17,  8.45s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████      | 2654/6434 [6:13:30<8:31:05,  8.11s/it, gpt_loss=0.247, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████      | 2654/6434 [6:13:40<8:31:05,  8.11s/it, gpt_loss=0.208, loss_mean=0.263][A[A
+
+Train step of epoch 1:  41%|████▏     | 2655/6434 [6:13:40<9:04:01,  8.64s/it, gpt_loss=0.208, loss_mean=0.263][A[A
+[LID Router Debug] Step: 9090
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [5, 5, 5, 9, 3, 5, 1, 9, 3, 4]
+Active Experts in Batch: {1, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  41%|████▏     | 2655/6434 [6:13:48<9:04:01,  8.64s/it, gpt_loss=0.246, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████▏     | 2656/6434 [6:13:48<8:50:38,  8.43s/it, gpt_loss=0.246, loss_mean=0.261][A[A
+
+Train step of epoch 1:  41%|████▏     | 2656/6434 [6:13:57<8:50:38,  8.43s/it, gpt_loss=0.31, loss_mean=0.266] [A[A
+
+Train step of epoch 1:  41%|████▏     | 2657/6434 [6:13:57<8:56:34,  8.52s/it, gpt_loss=0.31, loss_mean=0.266][A[A
+
+Train step of epoch 1:  41%|████▏     | 2657/6434 [6:14:04<8:56:34,  8.52s/it, gpt_loss=0.369, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████▏     | 2658/6434 [6:14:04<8:40:49,  8.28s/it, gpt_loss=0.369, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████▏     | 2658/6434 [6:14:12<8:40:49,  8.28s/it, gpt_loss=0.263, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████▏     | 2659/6434 [6:14:12<8:27:12,  8.06s/it, gpt_loss=0.263, loss_mean=0.275][A[A
+
+Train step of epoch 1:  41%|████▏     | 2659/6434 [6:14:19<8:27:12,  8.06s/it, gpt_loss=0.281, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████▏     | 2660/6434 [6:14:19<8:16:29,  7.89s/it, gpt_loss=0.281, loss_mean=0.276][A[A
+
+Train step of epoch 1:  41%|████▏     | 2660/6434 [6:14:28<8:16:29,  7.89s/it, gpt_loss=0.209, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████▏     | 2661/6434 [6:14:28<8:32:29,  8.15s/it, gpt_loss=0.209, loss_mean=0.269][A[A
+
+Train step of epoch 1:  41%|████▏     | 2661/6434 [6:14:36<8:32:29,  8.15s/it, gpt_loss=0.323, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████▏     | 2662/6434 [6:14:36<8:27:46,  8.08s/it, gpt_loss=0.323, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████▏     | 2662/6434 [6:14:44<8:27:46,  8.08s/it, gpt_loss=0.23, loss_mean=0.27]  [A[A
+
+Train step of epoch 1:  41%|████▏     | 2663/6434 [6:14:44<8:30:35,  8.12s/it, gpt_loss=0.23, loss_mean=0.27][A[A
+
+Train step of epoch 1:  41%|████▏     | 2663/6434 [6:14:52<8:30:35,  8.12s/it, gpt_loss=0.31, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████▏     | 2664/6434 [6:14:52<8:15:54,  7.89s/it, gpt_loss=0.31, loss_mean=0.274][A[A
+
+Train step of epoch 1:  41%|████▏     | 2664/6434 [6:15:00<8:15:54,  7.89s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+
+Train step of epoch 1:  41%|████▏     | 2665/6434 [6:15:00<8:19:07,  7.95s/it, gpt_loss=0.242, loss_mean=0.271][A[A
+[LID Router Debug] Step: 9100
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [0, 1, 9, 1, 9, 5, 5, 0, 9, 3]
+Active Experts in Batch: {0, 1, 3, 5, 9}
+
+
+Train step of epoch 1:  41%|████▏     | 2665/6434 [6:15:08<8:19:07,  7.95s/it, gpt_loss=0.241, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████▏     | 2666/6434 [6:15:08<8:33:31,  8.18s/it, gpt_loss=0.241, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████▏     | 2666/6434 [6:15:17<8:33:31,  8.18s/it, gpt_loss=0.274, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████▏     | 2667/6434 [6:15:17<8:31:25,  8.15s/it, gpt_loss=0.274, loss_mean=0.268][A[A
+
+Train step of epoch 1:  41%|████▏     | 2667/6434 [6:15:25<8:31:25,  8.15s/it, gpt_loss=0.298, loss_mean=0.271][A[A
+
+Train step of epoch 1:  41%|████▏     | 2668/6434 [6:15:25<8:40:14,  8.29s/it, gpt_loss=0.298, loss_mean=0.271][A[A
+
+Train step of epoch 1:  41%|████▏     | 2668/6434 [6:15:33<8:40:14,  8.29s/it, gpt_loss=0.286, loss_mean=0.273][A[A
+
+Train step of epoch 1:  41%|████▏     | 2669/6434 [6:15:33<8:37:00,  8.24s/it, gpt_loss=0.286, loss_mean=0.273][A[A
+
+Train step of epoch 1:  41%|████▏     | 2669/6434 [6:15:41<8:37:00,  8.24s/it, gpt_loss=0.319, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████▏     | 2670/6434 [6:15:41<8:34:52,  8.21s/it, gpt_loss=0.319, loss_mean=0.277][A[A
+
+Train step of epoch 1:  41%|████▏     | 2670/6434 [6:15:49<8:34:52,  8.21s/it, gpt_loss=0.356, loss_mean=0.285][A[A
+
+Train step of epoch 1:  42%|████▏     | 2671/6434 [6:15:49<8:21:49,  8.00s/it, gpt_loss=0.356, loss_mean=0.285][A[A
+
+Train step of epoch 1:  42%|████▏     | 2671/6434 [6:15:58<8:21:49,  8.00s/it, gpt_loss=0.237, loss_mean=0.28] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2672/6434 [6:15:58<8:32:52,  8.18s/it, gpt_loss=0.237, loss_mean=0.28][A[A
+
+Train step of epoch 1:  42%|████▏     | 2672/6434 [6:16:05<8:32:52,  8.18s/it, gpt_loss=0.21, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2673/6434 [6:16:05<8:24:51,  8.05s/it, gpt_loss=0.21, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2673/6434 [6:16:14<8:24:51,  8.05s/it, gpt_loss=0.268, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2674/6434 [6:16:14<8:37:33,  8.26s/it, gpt_loss=0.268, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2674/6434 [6:16:23<8:37:33,  8.26s/it, gpt_loss=0.228, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2675/6434 [6:16:23<8:46:41,  8.41s/it, gpt_loss=0.228, loss_mean=0.268][A[A
+[LID Router Debug] Step: 9110
+Batch Size: 10
+Audio Batch Size: 111
+LID Assignments: [5, 3, 2, 0, 0, 0, 9, 2, 4, 7]
+Active Experts in Batch: {0, 2, 3, 4, 5, 7, 9}
+
+
+Train step of epoch 1:  42%|████▏     | 2675/6434 [6:16:30<8:46:41,  8.41s/it, gpt_loss=0.237, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2676/6434 [6:16:30<8:31:02,  8.16s/it, gpt_loss=0.237, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2676/6434 [6:16:38<8:31:02,  8.16s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2677/6434 [6:16:38<8:18:27,  7.96s/it, gpt_loss=0.235, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2677/6434 [6:16:46<8:18:27,  7.96s/it, gpt_loss=0.269, loss_mean=0.263][A[A
+
+Train step of epoch 1:  42%|████▏     | 2678/6434 [6:16:46<8:28:32,  8.12s/it, gpt_loss=0.269, loss_mean=0.263][A[A
+
+Train step of epoch 1:  42%|████▏     | 2678/6434 [6:16:55<8:28:32,  8.12s/it, gpt_loss=0.365, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2679/6434 [6:16:55<8:34:00,  8.21s/it, gpt_loss=0.365, loss_mean=0.273][A[A
+
+Train step of epoch 1:  42%|████▏     | 2679/6434 [6:17:03<8:34:00,  8.21s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2680/6434 [6:17:03<8:31:03,  8.17s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2680/6434 [6:17:11<8:31:03,  8.17s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2681/6434 [6:17:11<8:35:56,  8.25s/it, gpt_loss=0.277, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2681/6434 [6:17:19<8:35:56,  8.25s/it, gpt_loss=0.307, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2682/6434 [6:17:19<8:22:48,  8.04s/it, gpt_loss=0.307, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2682/6434 [6:17:27<8:22:48,  8.04s/it, gpt_loss=0.284, loss_mean=0.278][A[A
+
+Train step of epoch 1:  42%|████▏     | 2683/6434 [6:17:27<8:17:28,  7.96s/it, gpt_loss=0.284, loss_mean=0.278][A[A
+
+Train step of epoch 1:  42%|████▏     | 2683/6434 [6:17:35<8:17:28,  7.96s/it, gpt_loss=0.309, loss_mean=0.281][A[A
+
+Train step of epoch 1:  42%|████▏     | 2684/6434 [6:17:35<8:32:11,  8.20s/it, gpt_loss=0.309, loss_mean=0.281][A[A
+
+Train step of epoch 1:  42%|████▏     | 2684/6434 [6:17:43<8:32:11,  8.20s/it, gpt_loss=0.216, loss_mean=0.275][A[A
+
+Train step of epoch 1:  42%|████▏     | 2685/6434 [6:17:43<8:26:17,  8.10s/it, gpt_loss=0.216, loss_mean=0.275][A[A
+[LID Router Debug] Step: 9120
+Batch Size: 10
+Audio Batch Size: 139
+LID Assignments: [1, 4, 2, 3, 6, 9, 9, 2, 3, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6, 9}
+
+
+Train step of epoch 1:  42%|████▏     | 2685/6434 [6:17:52<8:26:17,  8.10s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2686/6434 [6:17:52<8:36:40,  8.27s/it, gpt_loss=0.304, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2686/6434 [6:18:01<8:36:40,  8.27s/it, gpt_loss=0.32, loss_mean=0.282] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2687/6434 [6:18:01<8:48:47,  8.47s/it, gpt_loss=0.32, loss_mean=0.282][A[A
+
+Train step of epoch 1:  42%|████▏     | 2687/6434 [6:18:09<8:48:47,  8.47s/it, gpt_loss=0.235, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2688/6434 [6:18:09<8:47:38,  8.45s/it, gpt_loss=0.235, loss_mean=0.277][A[A
+
+Train step of epoch 1:  42%|████▏     | 2688/6434 [6:18:18<8:47:38,  8.45s/it, gpt_loss=0.203, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2689/6434 [6:18:18<8:47:35,  8.45s/it, gpt_loss=0.203, loss_mean=0.27][A[A
+
+Train step of epoch 1:  42%|████▏     | 2689/6434 [6:18:26<8:47:35,  8.45s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2690/6434 [6:18:26<8:52:13,  8.53s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2690/6434 [6:18:35<8:52:13,  8.53s/it, gpt_loss=0.317, loss_mean=0.269][A[A
+
+Train step of epoch 1:  42%|████▏     | 2691/6434 [6:18:35<8:47:39,  8.46s/it, gpt_loss=0.317, loss_mean=0.269][A[A
+
+Train step of epoch 1:  42%|████▏     | 2691/6434 [6:18:42<8:47:39,  8.46s/it, gpt_loss=0.191, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2692/6434 [6:18:42<8:24:16,  8.09s/it, gpt_loss=0.191, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2692/6434 [6:18:52<8:24:16,  8.09s/it, gpt_loss=0.292, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2693/6434 [6:18:52<8:52:44,  8.54s/it, gpt_loss=0.292, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2693/6434 [6:18:59<8:52:44,  8.54s/it, gpt_loss=0.301, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2694/6434 [6:18:59<8:29:01,  8.17s/it, gpt_loss=0.301, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2694/6434 [6:19:07<8:29:01,  8.17s/it, gpt_loss=0.247, loss_mean=0.266][A[A
+
+Train step of epoch 1:  42%|████▏     | 2695/6434 [6:19:07<8:38:30,  8.32s/it, gpt_loss=0.247, loss_mean=0.266][A[A
+[LID Router Debug] Step: 9130
+Batch Size: 10
+Audio Batch Size: 94
+LID Assignments: [6, 4, 2, 1, 0, 3, 1, 5, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  42%|████▏     | 2695/6434 [6:19:17<8:38:30,  8.32s/it, gpt_loss=0.256, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2696/6434 [6:19:17<9:09:14,  8.82s/it, gpt_loss=0.256, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2696/6434 [6:19:27<9:09:14,  8.82s/it, gpt_loss=0.227, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2697/6434 [6:19:27<9:13:23,  8.88s/it, gpt_loss=0.227, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2697/6434 [6:19:35<9:13:23,  8.88s/it, gpt_loss=0.252, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2698/6434 [6:19:35<9:03:52,  8.73s/it, gpt_loss=0.252, loss_mean=0.26][A[A
+
+Train step of epoch 1:  42%|████▏     | 2698/6434 [6:19:44<9:03:52,  8.73s/it, gpt_loss=0.245, loss_mean=0.259][A[A
+
+Train step of epoch 1:  42%|████▏     | 2699/6434 [6:19:44<9:04:43,  8.75s/it, gpt_loss=0.245, loss_mean=0.259][A[A
+
+Train step of epoch 1:  42%|████▏     | 2699/6434 [6:19:53<9:04:43,  8.75s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  42%|████▏     | 2700/6434 [6:19:53<9:11:47,  8.87s/it, gpt_loss=0.235, loss_mean=0.256][A[A
+
+Train step of epoch 1:  42%|████▏     | 2700/6434 [6:20:01<9:11:47,  8.87s/it, gpt_loss=0.271, loss_mean=0.258][A[A
+
+Train step of epoch 1:  42%|████▏     | 2701/6434 [6:20:01<8:52:35,  8.56s/it, gpt_loss=0.271, loss_mean=0.258][A[A
+
+Train step of epoch 1:  42%|████▏     | 2701/6434 [6:20:09<8:52:35,  8.56s/it, gpt_loss=0.21, loss_mean=0.253] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2702/6434 [6:20:09<8:50:33,  8.53s/it, gpt_loss=0.21, loss_mean=0.253][A[A
+
+Train step of epoch 1:  42%|████▏     | 2702/6434 [6:20:18<8:50:33,  8.53s/it, gpt_loss=0.208, loss_mean=0.249][A[A
+
+Train step of epoch 1:  42%|████▏     | 2703/6434 [6:20:18<9:02:18,  8.72s/it, gpt_loss=0.208, loss_mean=0.249][A[A
+
+Train step of epoch 1:  42%|████▏     | 2703/6434 [6:20:27<9:02:18,  8.72s/it, gpt_loss=0.32, loss_mean=0.256] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2704/6434 [6:20:27<9:06:47,  8.80s/it, gpt_loss=0.32, loss_mean=0.256][A[A
+
+Train step of epoch 1:  42%|████▏     | 2704/6434 [6:20:35<9:06:47,  8.80s/it, gpt_loss=0.332, loss_mean=0.263][A[A
+
+Train step of epoch 1:  42%|████▏     | 2705/6434 [6:20:35<8:44:25,  8.44s/it, gpt_loss=0.332, loss_mean=0.263][A[A
+[LID Router Debug] Step: 9140
+Batch Size: 10
+Audio Batch Size: 90
+LID Assignments: [5, 2, 1, 6, 3, 5, 5, 1, 1, 9]
+Active Experts in Batch: {1, 2, 3, 5, 6, 9}
+
+
+Train step of epoch 1:  42%|████▏     | 2705/6434 [6:20:44<8:44:25,  8.44s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2706/6434 [6:20:44<8:53:35,  8.59s/it, gpt_loss=0.249, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2706/6434 [6:20:52<8:53:35,  8.59s/it, gpt_loss=0.29, loss_mean=0.265] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2707/6434 [6:20:52<8:49:18,  8.52s/it, gpt_loss=0.29, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2707/6434 [6:21:01<8:49:18,  8.52s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2708/6434 [6:21:01<8:53:56,  8.60s/it, gpt_loss=0.272, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2708/6434 [6:21:09<8:53:56,  8.60s/it, gpt_loss=0.297, loss_mean=0.269][A[A
+
+Train step of epoch 1:  42%|████▏     | 2709/6434 [6:21:09<8:47:53,  8.50s/it, gpt_loss=0.297, loss_mean=0.269][A[A
+
+Train step of epoch 1:  42%|████▏     | 2709/6434 [6:21:18<8:47:53,  8.50s/it, gpt_loss=0.223, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2710/6434 [6:21:18<8:49:07,  8.53s/it, gpt_loss=0.223, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2710/6434 [6:21:26<8:49:07,  8.53s/it, gpt_loss=0.273, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2711/6434 [6:21:26<8:51:41,  8.57s/it, gpt_loss=0.273, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2711/6434 [6:21:35<8:51:41,  8.57s/it, gpt_loss=0.245, loss_mean=0.263][A[A
+
+Train step of epoch 1:  42%|████▏     | 2712/6434 [6:21:35<8:42:26,  8.42s/it, gpt_loss=0.245, loss_mean=0.263][A[A
+
+Train step of epoch 1:  42%|████▏     | 2712/6434 [6:21:42<8:42:26,  8.42s/it, gpt_loss=0.345, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2713/6434 [6:21:42<8:24:09,  8.13s/it, gpt_loss=0.345, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2713/6434 [6:21:49<8:24:09,  8.13s/it, gpt_loss=0.242, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2714/6434 [6:21:49<8:06:11,  7.84s/it, gpt_loss=0.242, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2714/6434 [6:21:57<8:06:11,  7.84s/it, gpt_loss=0.245, loss_mean=0.266][A[A
+
+Train step of epoch 1:  42%|████▏     | 2715/6434 [6:21:57<8:08:39,  7.88s/it, gpt_loss=0.245, loss_mean=0.266][A[A
+[LID Router Debug] Step: 9150
+Batch Size: 10
+Audio Batch Size: 156
+LID Assignments: [5, 3, 2, 3, 3, 3, 9, 2, 9, 3]
+Active Experts in Batch: {9, 2, 3, 5}
+
+
+Train step of epoch 1:  42%|████▏     | 2715/6434 [6:22:07<8:08:39,  7.88s/it, gpt_loss=0.231, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2716/6434 [6:22:07<8:45:24,  8.48s/it, gpt_loss=0.231, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2716/6434 [6:22:16<8:45:24,  8.48s/it, gpt_loss=0.238, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2717/6434 [6:22:16<9:00:03,  8.72s/it, gpt_loss=0.238, loss_mean=0.26][A[A
+
+Train step of epoch 1:  42%|████▏     | 2717/6434 [6:22:26<9:00:03,  8.72s/it, gpt_loss=0.306, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2718/6434 [6:22:26<9:18:43,  9.02s/it, gpt_loss=0.306, loss_mean=0.265][A[A
+
+Train step of epoch 1:  42%|████▏     | 2718/6434 [6:22:34<9:18:43,  9.02s/it, gpt_loss=0.223, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2719/6434 [6:22:34<9:05:45,  8.81s/it, gpt_loss=0.223, loss_mean=0.261][A[A
+
+Train step of epoch 1:  42%|████▏     | 2719/6434 [6:22:43<9:05:45,  8.81s/it, gpt_loss=0.369, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2720/6434 [6:22:43<8:55:58,  8.66s/it, gpt_loss=0.369, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2720/6434 [6:22:50<8:55:58,  8.66s/it, gpt_loss=0.225, loss_mean=0.267][A[A
+
+Train step of epoch 1:  42%|████▏     | 2721/6434 [6:22:50<8:31:22,  8.26s/it, gpt_loss=0.225, loss_mean=0.267][A[A
+
+Train step of epoch 1:  42%|████▏     | 2721/6434 [6:22:58<8:31:22,  8.26s/it, gpt_loss=0.216, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2722/6434 [6:22:58<8:30:50,  8.26s/it, gpt_loss=0.216, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2722/6434 [6:23:07<8:30:50,  8.26s/it, gpt_loss=0.31, loss_mean=0.267] [A[A
+
+Train step of epoch 1:  42%|████▏     | 2723/6434 [6:23:07<8:31:55,  8.28s/it, gpt_loss=0.31, loss_mean=0.267][A[A
+
+Train step of epoch 1:  42%|████▏     | 2723/6434 [6:23:14<8:31:55,  8.28s/it, gpt_loss=0.35, loss_mean=0.275][A[A
+
+Train step of epoch 1:  42%|████▏     | 2724/6434 [6:23:14<8:16:30,  8.03s/it, gpt_loss=0.35, loss_mean=0.275][A[A
+
+Train step of epoch 1:  42%|████▏     | 2724/6434 [6:23:22<8:16:30,  8.03s/it, gpt_loss=0.202, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2725/6434 [6:23:22<8:24:15,  8.16s/it, gpt_loss=0.202, loss_mean=0.268][A[A
+[LID Router Debug] Step: 9160
+Batch Size: 10
+Audio Batch Size: 103
+LID Assignments: [1, 3, 8, 9, 4, 1, 5, 5, 2, 0]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 8, 9}
+
+
+Train step of epoch 1:  42%|████▏     | 2725/6434 [6:23:31<8:24:15,  8.16s/it, gpt_loss=0.235, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2726/6434 [6:23:31<8:23:26,  8.15s/it, gpt_loss=0.235, loss_mean=0.264][A[A
+
+Train step of epoch 1:  42%|████▏     | 2726/6434 [6:23:38<8:23:26,  8.15s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2727/6434 [6:23:38<8:16:41,  8.04s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  42%|████▏     | 2727/6434 [6:23:46<8:16:41,  8.04s/it, gpt_loss=0.259, loss_mean=0.267][A[A
+
+Train step of epoch 1:  42%|████▏     | 2728/6434 [6:23:46<8:00:40,  7.78s/it, gpt_loss=0.259, loss_mean=0.267][A[A
+
+Train step of epoch 1:  42%|████▏     | 2728/6434 [6:23:54<8:00:40,  7.78s/it, gpt_loss=0.219, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2729/6434 [6:23:54<8:21:38,  8.12s/it, gpt_loss=0.219, loss_mean=0.262][A[A
+
+Train step of epoch 1:  42%|████▏     | 2729/6434 [6:24:03<8:21:38,  8.12s/it, gpt_loss=0.362, loss_mean=0.272][A[A
+
+Train step of epoch 1:  42%|████▏     | 2730/6434 [6:24:03<8:22:01,  8.13s/it, gpt_loss=0.362, loss_mean=0.272][A[A
+
+Train step of epoch 1:  42%|████▏     | 2730/6434 [6:24:10<8:22:01,  8.13s/it, gpt_loss=0.296, loss_mean=0.275][A[A
+
+Train step of epoch 1:  42%|████▏     | 2731/6434 [6:24:10<8:08:29,  7.92s/it, gpt_loss=0.296, loss_mean=0.275][A[A
+
+Train step of epoch 1:  42%|████▏     | 2731/6434 [6:24:19<8:08:29,  7.92s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2732/6434 [6:24:19<8:26:28,  8.21s/it, gpt_loss=0.266, loss_mean=0.274][A[A
+
+Train step of epoch 1:  42%|████▏     | 2732/6434 [6:24:27<8:26:28,  8.21s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2733/6434 [6:24:27<8:32:01,  8.30s/it, gpt_loss=0.241, loss_mean=0.271][A[A
+
+Train step of epoch 1:  42%|████▏     | 2733/6434 [6:24:35<8:32:01,  8.30s/it, gpt_loss=0.326, loss_mean=0.276][A[A
+
+Train step of epoch 1:  42%|████▏     | 2734/6434 [6:24:35<8:16:12,  8.05s/it, gpt_loss=0.326, loss_mean=0.276][A[A
+
+Train step of epoch 1:  42%|████▏     | 2734/6434 [6:24:43<8:16:12,  8.05s/it, gpt_loss=0.212, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2735/6434 [6:24:43<8:18:35,  8.09s/it, gpt_loss=0.212, loss_mean=0.27][A[A
+[LID Router Debug] Step: 9170
+Batch Size: 10
+Audio Batch Size: 145
+LID Assignments: [6, 5, 4, 3, 3, 3, 5, 2, 9, 2]
+Active Experts in Batch: {2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  43%|████▎     | 2735/6434 [6:24:52<8:18:35,  8.09s/it, gpt_loss=0.229, loss_mean=0.266][A[A
+
+Train step of epoch 1:  43%|████▎     | 2736/6434 [6:24:52<8:40:16,  8.44s/it, gpt_loss=0.229, loss_mean=0.266][A[A
+
+Train step of epoch 1:  43%|████▎     | 2736/6434 [6:25:01<8:40:16,  8.44s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2737/6434 [6:25:01<8:49:41,  8.60s/it, gpt_loss=0.248, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2737/6434 [6:25:11<8:49:41,  8.60s/it, gpt_loss=0.216, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2738/6434 [6:25:11<9:10:54,  8.94s/it, gpt_loss=0.216, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2738/6434 [6:25:20<9:10:54,  8.94s/it, gpt_loss=0.208, loss_mean=0.254][A[A
+
+Train step of epoch 1:  43%|████▎     | 2739/6434 [6:25:20<9:01:37,  8.80s/it, gpt_loss=0.208, loss_mean=0.254][A[A
+
+Train step of epoch 1:  43%|████▎     | 2739/6434 [6:25:29<9:01:37,  8.80s/it, gpt_loss=0.253, loss_mean=0.254][A[A
+
+Train step of epoch 1:  43%|████▎     | 2740/6434 [6:25:29<9:10:41,  8.94s/it, gpt_loss=0.253, loss_mean=0.254][A[A
+
+Train step of epoch 1:  43%|████▎     | 2740/6434 [6:25:38<9:10:41,  8.94s/it, gpt_loss=0.192, loss_mean=0.248][A[A
+
+Train step of epoch 1:  43%|████▎     | 2741/6434 [6:25:38<9:15:18,  9.02s/it, gpt_loss=0.192, loss_mean=0.248][A[A
+
+Train step of epoch 1:  43%|████▎     | 2741/6434 [6:25:46<9:15:18,  9.02s/it, gpt_loss=0.189, loss_mean=0.242][A[A
+
+Train step of epoch 1:  43%|████▎     | 2742/6434 [6:25:46<8:57:44,  8.74s/it, gpt_loss=0.189, loss_mean=0.242][A[A
+
+Train step of epoch 1:  43%|████▎     | 2742/6434 [6:25:55<8:57:44,  8.74s/it, gpt_loss=0.268, loss_mean=0.244][A[A
+
+Train step of epoch 1:  43%|████▎     | 2743/6434 [6:25:55<8:59:40,  8.77s/it, gpt_loss=0.268, loss_mean=0.244][A[A
+
+Train step of epoch 1:  43%|████▎     | 2743/6434 [6:26:04<8:59:40,  8.77s/it, gpt_loss=0.241, loss_mean=0.244][A[A
+
+Train step of epoch 1:  43%|████▎     | 2744/6434 [6:26:04<9:07:57,  8.91s/it, gpt_loss=0.241, loss_mean=0.244][A[A
+
+Train step of epoch 1:  43%|████▎     | 2744/6434 [6:26:13<9:07:57,  8.91s/it, gpt_loss=0.259, loss_mean=0.246][A[A
+
+Train step of epoch 1:  43%|████▎     | 2745/6434 [6:26:13<9:03:08,  8.83s/it, gpt_loss=0.259, loss_mean=0.246][A[A
+[LID Router Debug] Step: 9180
+Batch Size: 10
+Audio Batch Size: 127
+LID Assignments: [0, 9, 9, 9, 9, 3, 0, 9, 1, 3]
+Active Experts in Batch: {0, 9, 3, 1}
+
+
+Train step of epoch 1:  43%|████▎     | 2745/6434 [6:26:21<9:03:08,  8.83s/it, gpt_loss=0.291, loss_mean=0.25] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2746/6434 [6:26:21<8:54:16,  8.69s/it, gpt_loss=0.291, loss_mean=0.25][A[A
+
+Train step of epoch 1:  43%|████▎     | 2746/6434 [6:26:30<8:54:16,  8.69s/it, gpt_loss=0.245, loss_mean=0.25][A[A
+
+Train step of epoch 1:  43%|████▎     | 2747/6434 [6:26:30<8:50:03,  8.63s/it, gpt_loss=0.245, loss_mean=0.25][A[A
+
+Train step of epoch 1:  43%|████▎     | 2747/6434 [6:26:38<8:50:03,  8.63s/it, gpt_loss=0.257, loss_mean=0.25][A[A
+
+Train step of epoch 1:  43%|████▎     | 2748/6434 [6:26:38<8:35:53,  8.40s/it, gpt_loss=0.257, loss_mean=0.25][A[A
+
+Train step of epoch 1:  43%|████▎     | 2748/6434 [6:26:46<8:35:53,  8.40s/it, gpt_loss=0.205, loss_mean=0.246][A[A
+
+Train step of epoch 1:  43%|████▎     | 2749/6434 [6:26:46<8:42:10,  8.50s/it, gpt_loss=0.205, loss_mean=0.246][A[A
+
+Train step of epoch 1:  43%|████▎     | 2749/6434 [6:26:54<8:42:10,  8.50s/it, gpt_loss=0.312, loss_mean=0.252][A[A
+
+Train step of epoch 1:  43%|████▎     | 2750/6434 [6:26:54<8:27:34,  8.27s/it, gpt_loss=0.312, loss_mean=0.252][A[A
+
+Train step of epoch 1:  43%|████▎     | 2750/6434 [6:27:02<8:27:34,  8.27s/it, gpt_loss=0.316, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2751/6434 [6:27:02<8:28:54,  8.29s/it, gpt_loss=0.316, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2751/6434 [6:27:11<8:28:54,  8.29s/it, gpt_loss=0.245, loss_mean=0.257][A[A
+
+Train step of epoch 1:  43%|████▎     | 2752/6434 [6:27:11<8:28:22,  8.28s/it, gpt_loss=0.245, loss_mean=0.257][A[A
+
+Train step of epoch 1:  43%|████▎     | 2752/6434 [6:27:20<8:28:22,  8.28s/it, gpt_loss=0.214, loss_mean=0.253][A[A
+
+Train step of epoch 1:  43%|████▎     | 2753/6434 [6:27:20<8:39:49,  8.47s/it, gpt_loss=0.214, loss_mean=0.253][A[A
+
+Train step of epoch 1:  43%|████▎     | 2753/6434 [6:27:28<8:39:49,  8.47s/it, gpt_loss=0.311, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2754/6434 [6:27:28<8:35:24,  8.40s/it, gpt_loss=0.311, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2754/6434 [6:27:36<8:35:24,  8.40s/it, gpt_loss=0.213, loss_mean=0.254][A[A
+
+Train step of epoch 1:  43%|████▎     | 2755/6434 [6:27:36<8:24:01,  8.22s/it, gpt_loss=0.213, loss_mean=0.254][A[A
+[LID Router Debug] Step: 9190
+Batch Size: 10
+Audio Batch Size: 96
+LID Assignments: [4, 6, 5, 5, 4, 1, 0, 2, 3, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6}
+
+
+Train step of epoch 1:  43%|████▎     | 2755/6434 [6:27:45<8:24:01,  8.22s/it, gpt_loss=0.233, loss_mean=0.252][A[A
+
+Train step of epoch 1:  43%|████▎     | 2756/6434 [6:27:45<8:40:07,  8.48s/it, gpt_loss=0.233, loss_mean=0.252][A[A
+
+Train step of epoch 1:  43%|████▎     | 2756/6434 [6:27:53<8:40:07,  8.48s/it, gpt_loss=0.261, loss_mean=0.253][A[A
+
+Train step of epoch 1:  43%|████▎     | 2757/6434 [6:27:53<8:45:05,  8.57s/it, gpt_loss=0.261, loss_mean=0.253][A[A
+
+Train step of epoch 1:  43%|████▎     | 2757/6434 [6:28:01<8:45:05,  8.57s/it, gpt_loss=0.296, loss_mean=0.257][A[A
+
+Train step of epoch 1:  43%|████▎     | 2758/6434 [6:28:01<8:33:42,  8.38s/it, gpt_loss=0.296, loss_mean=0.257][A[A
+
+Train step of epoch 1:  43%|████▎     | 2758/6434 [6:28:09<8:33:42,  8.38s/it, gpt_loss=0.29, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2759/6434 [6:28:09<8:19:11,  8.15s/it, gpt_loss=0.29, loss_mean=0.261][A[A
+
+Train step of epoch 1:  43%|████▎     | 2759/6434 [6:28:19<8:19:11,  8.15s/it, gpt_loss=0.287, loss_mean=0.263][A[A
+
+Train step of epoch 1:  43%|████▎     | 2760/6434 [6:28:19<8:47:03,  8.61s/it, gpt_loss=0.287, loss_mean=0.263][A[A
+
+Train step of epoch 1:  43%|████▎     | 2760/6434 [6:28:26<8:47:03,  8.61s/it, gpt_loss=0.281, loss_mean=0.265][A[A
+
+Train step of epoch 1:  43%|████▎     | 2761/6434 [6:28:26<8:23:05,  8.22s/it, gpt_loss=0.281, loss_mean=0.265][A[A
+
+Train step of epoch 1:  43%|████▎     | 2761/6434 [6:28:34<8:23:05,  8.22s/it, gpt_loss=0.302, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2762/6434 [6:28:34<8:21:51,  8.20s/it, gpt_loss=0.302, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2762/6434 [6:28:43<8:21:51,  8.20s/it, gpt_loss=0.242, loss_mean=0.266][A[A
+
+Train step of epoch 1:  43%|████▎     | 2763/6434 [6:28:43<8:36:37,  8.44s/it, gpt_loss=0.242, loss_mean=0.266][A[A
+
+Train step of epoch 1:  43%|████▎     | 2763/6434 [6:28:52<8:36:37,  8.44s/it, gpt_loss=0.351, loss_mean=0.275][A[A
+
+Train step of epoch 1:  43%|████▎     | 2764/6434 [6:28:52<8:37:27,  8.46s/it, gpt_loss=0.351, loss_mean=0.275][A[A
+
+Train step of epoch 1:  43%|████▎     | 2764/6434 [6:28:59<8:37:27,  8.46s/it, gpt_loss=0.405, loss_mean=0.288][A[A
+
+Train step of epoch 1:  43%|████▎     | 2765/6434 [6:28:59<8:22:03,  8.21s/it, gpt_loss=0.405, loss_mean=0.288][A[A
+[LID Router Debug] Step: 9200
+Batch Size: 10
+Audio Batch Size: 115
+LID Assignments: [3, 0, 2, 4, 9, 5, 3, 2, 9, 9]
+Active Experts in Batch: {0, 2, 3, 4, 5, 9}
+[2026-02-07 13:30:55,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=0, lr=[1.0899761620459559e-05, 1.0899761620459559e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2026-02-07 13:30:55,299] [INFO] [timer.py:260:stop] epoch=0/micro_step=9200/global_step=4600, RunningAvgSamplesPerSec=4.747300850601012, CurrSamplesPerSec=4.826313908712758, MemAllocated=12.65GB, MaxMemAllocated=49.73GB
+
+
+Train step of epoch 1:  43%|████▎     | 2765/6434 [6:29:08<8:22:03,  8.21s/it, gpt_loss=0.25, loss_mean=0.284] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2766/6434 [6:29:08<8:35:58,  8.44s/it, gpt_loss=0.25, loss_mean=0.284][A[A
+
+Train step of epoch 1:  43%|████▎     | 2766/6434 [6:29:17<8:35:58,  8.44s/it, gpt_loss=0.253, loss_mean=0.281][A[A
+
+Train step of epoch 1:  43%|████▎     | 2767/6434 [6:29:17<8:37:14,  8.46s/it, gpt_loss=0.253, loss_mean=0.281][A[A
+
+Train step of epoch 1:  43%|████▎     | 2767/6434 [6:29:25<8:37:14,  8.46s/it, gpt_loss=0.254, loss_mean=0.278][A[A
+
+Train step of epoch 1:  43%|████▎     | 2768/6434 [6:29:25<8:27:26,  8.31s/it, gpt_loss=0.254, loss_mean=0.278][A[A
+
+Train step of epoch 1:  43%|████▎     | 2768/6434 [6:29:34<8:27:26,  8.31s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  43%|████▎     | 2769/6434 [6:29:34<8:41:12,  8.53s/it, gpt_loss=0.279, loss_mean=0.278][A[A
+
+Train step of epoch 1:  43%|████▎     | 2769/6434 [6:29:42<8:41:12,  8.53s/it, gpt_loss=0.29, loss_mean=0.279] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2770/6434 [6:29:42<8:42:09,  8.55s/it, gpt_loss=0.29, loss_mean=0.279][A[A
+
+Train step of epoch 1:  43%|████▎     | 2770/6434 [6:29:51<8:42:09,  8.55s/it, gpt_loss=0.344, loss_mean=0.286][A[A
+
+Train step of epoch 1:  43%|████▎     | 2771/6434 [6:29:51<8:45:24,  8.61s/it, gpt_loss=0.344, loss_mean=0.286][A[A
+
+Train step of epoch 1:  43%|████▎     | 2771/6434 [6:29:59<8:45:24,  8.61s/it, gpt_loss=0.299, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2772/6434 [6:29:59<8:30:24,  8.36s/it, gpt_loss=0.299, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2772/6434 [6:30:06<8:30:24,  8.36s/it, gpt_loss=0.29, loss_mean=0.287] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2773/6434 [6:30:06<8:10:35,  8.04s/it, gpt_loss=0.29, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2773/6434 [6:30:15<8:10:35,  8.04s/it, gpt_loss=0.234, loss_mean=0.282][A[A
+
+Train step of epoch 1:  43%|████▎     | 2774/6434 [6:30:15<8:30:31,  8.37s/it, gpt_loss=0.234, loss_mean=0.282][A[A
+
+Train step of epoch 1:  43%|████▎     | 2774/6434 [6:30:24<8:30:31,  8.37s/it, gpt_loss=0.437, loss_mean=0.298][A[A
+
+Train step of epoch 1:  43%|████▎     | 2775/6434 [6:30:24<8:38:03,  8.50s/it, gpt_loss=0.437, loss_mean=0.298][A[A
+[LID Router Debug] Step: 9210
+Batch Size: 10
+Audio Batch Size: 91
+LID Assignments: [1, 1, 5, 5, 5, 5, 3, 2, 9, 4]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  43%|████▎     | 2775/6434 [6:30:32<8:38:03,  8.50s/it, gpt_loss=0.323, loss_mean=0.3]  [A[A
+
+Train step of epoch 1:  43%|████▎     | 2776/6434 [6:30:32<8:18:53,  8.18s/it, gpt_loss=0.323, loss_mean=0.3][A[A
+
+Train step of epoch 1:  43%|████▎     | 2776/6434 [6:30:40<8:18:53,  8.18s/it, gpt_loss=0.266, loss_mean=0.297][A[A
+
+Train step of epoch 1:  43%|████▎     | 2777/6434 [6:30:40<8:31:19,  8.39s/it, gpt_loss=0.266, loss_mean=0.297][A[A
+
+Train step of epoch 1:  43%|████▎     | 2777/6434 [6:30:48<8:31:19,  8.39s/it, gpt_loss=0.204, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2778/6434 [6:30:48<8:09:16,  8.03s/it, gpt_loss=0.204, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2778/6434 [6:30:56<8:09:16,  8.03s/it, gpt_loss=0.288, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2779/6434 [6:30:56<8:13:46,  8.11s/it, gpt_loss=0.288, loss_mean=0.287][A[A
+
+Train step of epoch 1:  43%|████▎     | 2779/6434 [6:31:04<8:13:46,  8.11s/it, gpt_loss=0.272, loss_mean=0.286][A[A
+
+Train step of epoch 1:  43%|████▎     | 2780/6434 [6:31:04<8:20:35,  8.22s/it, gpt_loss=0.272, loss_mean=0.286][A[A
+
+Train step of epoch 1:  43%|████▎     | 2780/6434 [6:31:12<8:20:35,  8.22s/it, gpt_loss=0.213, loss_mean=0.279][A[A
+
+Train step of epoch 1:  43%|████▎     | 2781/6434 [6:31:12<8:12:51,  8.10s/it, gpt_loss=0.213, loss_mean=0.279][A[A
+
+Train step of epoch 1:  43%|████▎     | 2781/6434 [6:31:21<8:12:51,  8.10s/it, gpt_loss=0.197, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2782/6434 [6:31:21<8:34:53,  8.46s/it, gpt_loss=0.197, loss_mean=0.27][A[A
+
+Train step of epoch 1:  43%|████▎     | 2782/6434 [6:31:30<8:34:53,  8.46s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  43%|████▎     | 2783/6434 [6:31:30<8:44:02,  8.61s/it, gpt_loss=0.216, loss_mean=0.265][A[A
+
+Train step of epoch 1:  43%|████▎     | 2783/6434 [6:31:39<8:44:02,  8.61s/it, gpt_loss=0.217, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2784/6434 [6:31:39<8:41:04,  8.57s/it, gpt_loss=0.217, loss_mean=0.26][A[A
+
+Train step of epoch 1:  43%|████▎     | 2784/6434 [6:31:48<8:41:04,  8.57s/it, gpt_loss=0.248, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2785/6434 [6:31:48<8:59:39,  8.87s/it, gpt_loss=0.248, loss_mean=0.259][A[A
+[LID Router Debug] Step: 9220
+Batch Size: 10
+Audio Batch Size: 95
+LID Assignments: [9, 4, 6, 0, 3, 1, 2, 9, 2, 5]
+Active Experts in Batch: {0, 1, 2, 3, 4, 5, 6, 9}
+
+
+Train step of epoch 1:  43%|████▎     | 2785/6434 [6:31:58<8:59:39,  8.87s/it, gpt_loss=0.28, loss_mean=0.261] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2786/6434 [6:31:58<9:10:03,  9.05s/it, gpt_loss=0.28, loss_mean=0.261][A[A
+
+Train step of epoch 1:  43%|████▎     | 2786/6434 [6:32:06<9:10:03,  9.05s/it, gpt_loss=0.278, loss_mean=0.263][A[A
+
+Train step of epoch 1:  43%|████▎     | 2787/6434 [6:32:06<9:00:02,  8.88s/it, gpt_loss=0.278, loss_mean=0.263][A[A
+
+Train step of epoch 1:  43%|████▎     | 2787/6434 [6:32:14<9:00:02,  8.88s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2788/6434 [6:32:14<8:43:22,  8.61s/it, gpt_loss=0.229, loss_mean=0.259][A[A
+
+Train step of epoch 1:  43%|████▎     | 2788/6434 [6:32:22<8:43:22,  8.61s/it, gpt_loss=0.268, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2789/6434 [6:32:22<8:32:39,  8.44s/it, gpt_loss=0.268, loss_mean=0.26][A[A
+
+Train step of epoch 1:  43%|████▎     | 2789/6434 [6:32:30<8:32:39,  8.44s/it, gpt_loss=0.265, loss_mean=0.261][A[A
+
+Train step of epoch 1:  43%|████▎     | 2790/6434 [6:32:30<8:17:35,  8.19s/it, gpt_loss=0.265, loss_mean=0.261][A[A
+
+Train step of epoch 1:  43%|████▎     | 2790/6434 [6:32:38<8:17:35,  8.19s/it, gpt_loss=0.344, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2791/6434 [6:32:38<8:15:49,  8.17s/it, gpt_loss=0.344, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2791/6434 [6:32:46<8:15:49,  8.17s/it, gpt_loss=0.274, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2792/6434 [6:32:46<8:08:47,  8.05s/it, gpt_loss=0.274, loss_mean=0.27][A[A
+
+Train step of epoch 1:  43%|████▎     | 2792/6434 [6:32:54<8:08:47,  8.05s/it, gpt_loss=0.28, loss_mean=0.271][A[A
+
+Train step of epoch 1:  43%|████▎     | 2793/6434 [6:32:54<8:10:36,  8.08s/it, gpt_loss=0.28, loss_mean=0.271][A[A
+
+Train step of epoch 1:  43%|████▎     | 2793/6434 [6:33:02<8:10:36,  8.08s/it, gpt_loss=0.239, loss_mean=0.267][A[A
+
+Train step of epoch 1:  43%|████▎     | 2794/6434 [6:33:02<8:09:39,  8.07s/it, gpt_loss=0.239, loss_mean=0.267][A[A
+
+Train step of epoch 1:  43%|████▎     | 2794/6434 [6:33:11<8:09:39,  8.07s/it, gpt_loss=0.291, loss_mean=0.27] [A[A
+
+Train step of epoch 1:  43%|████▎     | 2795/6434 [6:33:11<8:23:38,  8.30s/it, gpt_loss=0.291, loss_mean=0.27][A[A
+[LID Router Debug] Step: 9230
+Batch Size: 10
+Audio Batch Size: 132
+LID Assignments: [2, 0, 9, 4, 3, 2, 1, 0, 2, 9]
+Active Experts in Batch: {0, 1, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  43%|████▎     | 2795/6434 [6:33:20<8:23:38,  8.30s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2796/6434 [6:33:20<8:30:59,  8.43s/it, gpt_loss=0.214, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2796/6434 [6:33:27<8:30:59,  8.43s/it, gpt_loss=0.308, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2797/6434 [6:33:27<8:18:17,  8.22s/it, gpt_loss=0.308, loss_mean=0.269][A[A
+
+Train step of epoch 1:  43%|████▎     | 2797/6434 [6:33:35<8:18:17,  8.22s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2798/6434 [6:33:35<8:10:12,  8.09s/it, gpt_loss=0.218, loss_mean=0.264][A[A
+
+Train step of epoch 1:  43%|████▎     | 2798/6434 [6:33:43<8:10:12,  8.09s/it, gpt_loss=0.254, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▎     | 2799/6434 [6:33:43<7:54:44,  7.84s/it, gpt_loss=0.254, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▎     | 2799/6434 [6:33:51<7:54:44,  7.84s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2800/6434 [6:33:51<8:13:51,  8.15s/it, gpt_loss=0.247, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2800/6434 [6:34:00<8:13:51,  8.15s/it, gpt_loss=0.196, loss_mean=0.255][A[A
+
+Train step of epoch 1:  44%|████▎     | 2801/6434 [6:34:00<8:18:32,  8.23s/it, gpt_loss=0.196, loss_mean=0.255][A[A
+
+Train step of epoch 1:  44%|████▎     | 2801/6434 [6:34:09<8:18:32,  8.23s/it, gpt_loss=0.273, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▎     | 2802/6434 [6:34:09<8:31:10,  8.44s/it, gpt_loss=0.273, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▎     | 2802/6434 [6:34:17<8:31:10,  8.44s/it, gpt_loss=0.357, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▎     | 2803/6434 [6:34:17<8:28:31,  8.40s/it, gpt_loss=0.357, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▎     | 2803/6434 [6:34:25<8:28:31,  8.40s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▎     | 2804/6434 [6:34:25<8:25:45,  8.36s/it, gpt_loss=0.244, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▎     | 2804/6434 [6:34:34<8:25:45,  8.36s/it, gpt_loss=0.244, loss_mean=0.262][A[A
+
+Train step of epoch 1:  44%|████▎     | 2805/6434 [6:34:34<8:32:40,  8.48s/it, gpt_loss=0.244, loss_mean=0.262][A[A
+[LID Router Debug] Step: 9240
+Batch Size: 10
+Audio Batch Size: 113
+LID Assignments: [0, 1, 9, 9, 2, 9, 1, 4, 5, 9]
+Active Experts in Batch: {0, 1, 2, 4, 5, 9}
+
+
+Train step of epoch 1:  44%|████▎     | 2805/6434 [6:34:43<8:32:40,  8.48s/it, gpt_loss=0.237, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  44%|████▎     | 2806/6434 [6:34:43<8:40:42,  8.61s/it, gpt_loss=0.237, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▎     | 2806/6434 [6:34:52<8:40:42,  8.61s/it, gpt_loss=0.268, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2807/6434 [6:34:52<8:43:45,  8.66s/it, gpt_loss=0.268, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2807/6434 [6:35:00<8:43:45,  8.66s/it, gpt_loss=0.261, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2808/6434 [6:35:00<8:39:02,  8.59s/it, gpt_loss=0.261, loss_mean=0.261][A[A
+
+Train step of epoch 1:  44%|████▎     | 2808/6434 [6:35:08<8:39:02,  8.59s/it, gpt_loss=0.296, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▎     | 2809/6434 [6:35:08<8:29:17,  8.43s/it, gpt_loss=0.296, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▎     | 2809/6434 [6:35:16<8:29:17,  8.43s/it, gpt_loss=0.3, loss_mean=0.268]  [A[A
+
+Train step of epoch 1:  44%|████▎     | 2810/6434 [6:35:16<8:11:05,  8.13s/it, gpt_loss=0.3, loss_mean=0.268][A[A
+
+Train step of epoch 1:  44%|████▎     | 2810/6434 [6:35:24<8:11:05,  8.13s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▎     | 2811/6434 [6:35:24<8:16:23,  8.22s/it, gpt_loss=0.219, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▎     | 2811/6434 [6:35:32<8:16:23,  8.22s/it, gpt_loss=0.377, loss_mean=0.274][A[A
+
+Train step of epoch 1:  44%|████▎     | 2812/6434 [6:35:32<8:16:30,  8.22s/it, gpt_loss=0.377, loss_mean=0.274][A[A
+
+Train step of epoch 1:  44%|████▎     | 2812/6434 [6:35:41<8:16:30,  8.22s/it, gpt_loss=0.283, loss_mean=0.275][A[A
+
+Train step of epoch 1:  44%|████▎     | 2813/6434 [6:35:41<8:17:10,  8.24s/it, gpt_loss=0.283, loss_mean=0.275][A[A
+
+Train step of epoch 1:  44%|████▎     | 2813/6434 [6:35:48<8:17:10,  8.24s/it, gpt_loss=0.195, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▎     | 2814/6434 [6:35:48<8:06:10,  8.06s/it, gpt_loss=0.195, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▎     | 2814/6434 [6:35:57<8:06:10,  8.06s/it, gpt_loss=0.225, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▍     | 2815/6434 [6:35:57<8:26:59,  8.41s/it, gpt_loss=0.225, loss_mean=0.263][A[A
+[LID Router Debug] Step: 9250
+Batch Size: 10
+Audio Batch Size: 77
+LID Assignments: [4, 2, 1, 0, 4, 0, 2, 1, 0, 6]
+Active Experts in Batch: {0, 1, 2, 4, 6}
+
+
+Train step of epoch 1:  44%|████▍     | 2815/6434 [6:36:06<8:26:59,  8.41s/it, gpt_loss=0.232, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2816/6434 [6:36:06<8:26:34,  8.40s/it, gpt_loss=0.232, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2816/6434 [6:36:15<8:26:34,  8.40s/it, gpt_loss=0.244, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2817/6434 [6:36:15<8:35:16,  8.55s/it, gpt_loss=0.244, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2817/6434 [6:36:23<8:35:16,  8.55s/it, gpt_loss=0.31, loss_mean=0.263] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2818/6434 [6:36:23<8:37:09,  8.58s/it, gpt_loss=0.31, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▍     | 2818/6434 [6:36:32<8:37:09,  8.58s/it, gpt_loss=0.272, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2819/6434 [6:36:32<8:44:55,  8.71s/it, gpt_loss=0.272, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2819/6434 [6:36:40<8:44:55,  8.71s/it, gpt_loss=0.285, loss_mean=0.266][A[A
+
+Train step of epoch 1:  44%|████▍     | 2820/6434 [6:36:40<8:31:13,  8.49s/it, gpt_loss=0.285, loss_mean=0.266][A[A
+
+Train step of epoch 1:  44%|████▍     | 2820/6434 [6:36:49<8:31:13,  8.49s/it, gpt_loss=0.2, loss_mean=0.26]   [A[A
+
+Train step of epoch 1:  44%|████▍     | 2821/6434 [6:36:49<8:33:10,  8.52s/it, gpt_loss=0.2, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2821/6434 [6:36:57<8:33:10,  8.52s/it, gpt_loss=0.247, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2822/6434 [6:36:57<8:31:45,  8.50s/it, gpt_loss=0.247, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2822/6434 [6:37:06<8:31:45,  8.50s/it, gpt_loss=0.274, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2823/6434 [6:37:06<8:26:30,  8.42s/it, gpt_loss=0.274, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2823/6434 [6:37:14<8:26:30,  8.42s/it, gpt_loss=0.225, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2824/6434 [6:37:14<8:22:33,  8.35s/it, gpt_loss=0.225, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2824/6434 [6:37:23<8:22:33,  8.35s/it, gpt_loss=0.296, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2825/6434 [6:37:23<8:28:09,  8.45s/it, gpt_loss=0.296, loss_mean=0.26][A[A
+[LID Router Debug] Step: 9260
+Batch Size: 10
+Audio Batch Size: 86
+LID Assignments: [7, 5, 3, 6, 6, 4, 2, 1, 1, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 6, 7}
+
+
+Train step of epoch 1:  44%|████▍     | 2825/6434 [6:37:31<8:28:09,  8.45s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2826/6434 [6:37:31<8:21:53,  8.35s/it, gpt_loss=0.254, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2826/6434 [6:37:40<8:21:53,  8.35s/it, gpt_loss=0.253, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2827/6434 [6:37:40<8:41:55,  8.68s/it, gpt_loss=0.253, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2827/6434 [6:37:48<8:41:55,  8.68s/it, gpt_loss=0.24, loss_mean=0.257] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2828/6434 [6:37:48<8:30:46,  8.50s/it, gpt_loss=0.24, loss_mean=0.257][A[A
+
+Train step of epoch 1:  44%|████▍     | 2828/6434 [6:37:56<8:30:46,  8.50s/it, gpt_loss=0.214, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2829/6434 [6:37:56<8:19:04,  8.31s/it, gpt_loss=0.214, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2829/6434 [6:38:04<8:19:04,  8.31s/it, gpt_loss=0.266, loss_mean=0.254][A[A
+
+Train step of epoch 1:  44%|████▍     | 2830/6434 [6:38:04<8:12:12,  8.19s/it, gpt_loss=0.266, loss_mean=0.254][A[A
+
+Train step of epoch 1:  44%|████▍     | 2830/6434 [6:38:12<8:12:12,  8.19s/it, gpt_loss=0.288, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2831/6434 [6:38:12<8:10:20,  8.17s/it, gpt_loss=0.288, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2831/6434 [6:38:20<8:10:20,  8.17s/it, gpt_loss=0.28, loss_mean=0.26]  [A[A
+
+Train step of epoch 1:  44%|████▍     | 2832/6434 [6:38:20<7:59:32,  7.99s/it, gpt_loss=0.28, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2832/6434 [6:38:29<7:59:32,  7.99s/it, gpt_loss=0.333, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▍     | 2833/6434 [6:38:29<8:17:27,  8.29s/it, gpt_loss=0.333, loss_mean=0.267][A[A
+
+Train step of epoch 1:  44%|████▍     | 2833/6434 [6:38:37<8:17:27,  8.29s/it, gpt_loss=0.243, loss_mean=0.265][A[A
+
+Train step of epoch 1:  44%|████▍     | 2834/6434 [6:38:37<8:11:41,  8.19s/it, gpt_loss=0.243, loss_mean=0.265][A[A
+
+Train step of epoch 1:  44%|████▍     | 2834/6434 [6:38:44<8:11:41,  8.19s/it, gpt_loss=0.275, loss_mean=0.266][A[A
+
+Train step of epoch 1:  44%|████▍     | 2835/6434 [6:38:44<7:51:11,  7.86s/it, gpt_loss=0.275, loss_mean=0.266][A[A
+[LID Router Debug] Step: 9270
+Batch Size: 10
+Audio Batch Size: 150
+LID Assignments: [3, 4, 6, 6, 3, 4, 3, 8, 1, 9]
+Active Experts in Batch: {1, 3, 4, 6, 8, 9}
+
+
+Train step of epoch 1:  44%|████▍     | 2835/6434 [6:38:53<7:51:11,  7.86s/it, gpt_loss=0.253, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2836/6434 [6:38:53<8:21:52,  8.37s/it, gpt_loss=0.253, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2836/6434 [6:39:01<8:21:52,  8.37s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  44%|████▍     | 2837/6434 [6:39:01<8:12:13,  8.21s/it, gpt_loss=0.302, loss_mean=0.268][A[A
+
+Train step of epoch 1:  44%|████▍     | 2837/6434 [6:39:09<8:12:13,  8.21s/it, gpt_loss=0.205, loss_mean=0.262][A[A
+
+Train step of epoch 1:  44%|████▍     | 2838/6434 [6:39:09<8:11:23,  8.20s/it, gpt_loss=0.205, loss_mean=0.262][A[A
+
+Train step of epoch 1:  44%|████▍     | 2838/6434 [6:39:18<8:11:23,  8.20s/it, gpt_loss=0.246, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2839/6434 [6:39:18<8:28:09,  8.48s/it, gpt_loss=0.246, loss_mean=0.26][A[A
+
+Train step of epoch 1:  44%|████▍     | 2839/6434 [6:39:26<8:28:09,  8.48s/it, gpt_loss=0.242, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2840/6434 [6:39:26<8:14:14,  8.25s/it, gpt_loss=0.242, loss_mean=0.258][A[A
+
+Train step of epoch 1:  44%|████▍     | 2840/6434 [6:39:34<8:14:14,  8.25s/it, gpt_loss=0.211, loss_mean=0.254][A[A
+
+Train step of epoch 1:  44%|████▍     | 2841/6434 [6:39:34<8:14:58,  8.27s/it, gpt_loss=0.211, loss_mean=0.254][A[A
+
+Train step of epoch 1:  44%|████▍     | 2841/6434 [6:39:42<8:14:58,  8.27s/it, gpt_loss=0.342, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▍     | 2842/6434 [6:39:42<8:07:17,  8.14s/it, gpt_loss=0.342, loss_mean=0.263][A[A
+
+Train step of epoch 1:  44%|████▍     | 2842/6434 [6:39:50<8:07:17,  8.14s/it, gpt_loss=0.23, loss_mean=0.259] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2843/6434 [6:39:50<8:03:42,  8.08s/it, gpt_loss=0.23, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2843/6434 [6:39:59<8:03:42,  8.08s/it, gpt_loss=0.228, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2844/6434 [6:39:59<8:12:25,  8.23s/it, gpt_loss=0.228, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2844/6434 [6:40:08<8:12:25,  8.23s/it, gpt_loss=0.214, loss_mean=0.252][A[A
+
+Train step of epoch 1:  44%|████▍     | 2845/6434 [6:40:08<8:28:54,  8.51s/it, gpt_loss=0.214, loss_mean=0.252][A[A
+[LID Router Debug] Step: 9280
+Batch Size: 10
+Audio Batch Size: 119
+LID Assignments: [0, 3, 9, 3, 2, 0, 0, 4, 4, 4]
+Active Experts in Batch: {0, 2, 3, 4, 9}
+
+
+Train step of epoch 1:  44%|████▍     | 2845/6434 [6:40:16<8:28:54,  8.51s/it, gpt_loss=0.181, loss_mean=0.245][A[A
+
+Train step of epoch 1:  44%|████▍     | 2846/6434 [6:40:16<8:16:13,  8.30s/it, gpt_loss=0.181, loss_mean=0.245][A[A
+
+Train step of epoch 1:  44%|████▍     | 2846/6434 [6:40:24<8:16:13,  8.30s/it, gpt_loss=0.279, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2847/6434 [6:40:24<8:13:39,  8.26s/it, gpt_loss=0.279, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2847/6434 [6:40:32<8:13:39,  8.26s/it, gpt_loss=0.239, loss_mean=0.247][A[A
+
+Train step of epoch 1:  44%|████▍     | 2848/6434 [6:40:32<8:12:14,  8.24s/it, gpt_loss=0.239, loss_mean=0.247][A[A
+
+Train step of epoch 1:  44%|████▍     | 2848/6434 [6:40:42<8:12:14,  8.24s/it, gpt_loss=0.259, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2849/6434 [6:40:42<8:33:34,  8.60s/it, gpt_loss=0.259, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2849/6434 [6:40:50<8:33:34,  8.60s/it, gpt_loss=0.215, loss_mean=0.245][A[A
+
+Train step of epoch 1:  44%|████▍     | 2850/6434 [6:40:50<8:32:19,  8.58s/it, gpt_loss=0.215, loss_mean=0.245][A[A
+
+Train step of epoch 1:  44%|████▍     | 2850/6434 [6:40:58<8:32:19,  8.58s/it, gpt_loss=0.328, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2851/6434 [6:40:58<8:22:34,  8.42s/it, gpt_loss=0.328, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2851/6434 [6:41:08<8:22:34,  8.42s/it, gpt_loss=0.276, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2852/6434 [6:41:08<8:43:25,  8.77s/it, gpt_loss=0.276, loss_mean=0.256][A[A
+
+Train step of epoch 1:  44%|████▍     | 2852/6434 [6:41:16<8:43:25,  8.77s/it, gpt_loss=0.2, loss_mean=0.25]   [A[A
+
+Train step of epoch 1:  44%|████▍     | 2853/6434 [6:41:16<8:26:35,  8.49s/it, gpt_loss=0.2, loss_mean=0.25][A[A
+
+Train step of epoch 1:  44%|████▍     | 2853/6434 [6:41:24<8:26:35,  8.49s/it, gpt_loss=0.229, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2854/6434 [6:41:24<8:20:23,  8.39s/it, gpt_loss=0.229, loss_mean=0.248][A[A
+
+Train step of epoch 1:  44%|████▍     | 2854/6434 [6:41:32<8:20:23,  8.39s/it, gpt_loss=0.23, loss_mean=0.246] [A[A
+
+Train step of epoch 1:  44%|████▍     | 2855/6434 [6:41:32<8:18:13,  8.35s/it, gpt_loss=0.23, loss_mean=0.246][A[A
+[LID Router Debug] Step: 9290
+Batch Size: 10
+Audio Batch Size: 105
+LID Assignments: [3, 3, 4, 4, 5, 4, 2, 9, 4, 1]
+Active Experts in Batch: {1, 2, 3, 4, 5, 9}
+
+
+Train step of epoch 1:  44%|████▍     | 2855/6434 [6:41:39<8:18:13,  8.35s/it, gpt_loss=0.288, loss_mean=0.25][A[A
+
+Train step of epoch 1:  44%|████▍     | 2856/6434 [6:41:39<8:02:32,  8.09s/it, gpt_loss=0.288, loss_mean=0.25][A[A
+
+Train step of epoch 1:  44%|████▍     | 2856/6434 [6:41:48<8:02:32,  8.09s/it, gpt_loss=0.277, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2857/6434 [6:41:48<8:11:43,  8.25s/it, gpt_loss=0.277, loss_mean=0.253][A[A
+
+Train step of epoch 1:  44%|████▍     | 2857/6434 [6:41:55<8:11:43,  8.25s/it, gpt_loss=0.287, loss_mean=0.257][A[A
+
+Train step of epoch 1:  44%|████▍     | 2858/6434 [6:41:55<7:53:12,  7.94s/it, gpt_loss=0.287, loss_mean=0.257][A[A
+
+Train step of epoch 1:  44%|████▍     | 2858/6434 [6:42:03<7:53:12,  7.94s/it, gpt_loss=0.283, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2859/6434 [6:42:03<7:53:51,  7.95s/it, gpt_loss=0.283, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2859/6434 [6:42:12<7:53:51,  7.95s/it, gpt_loss=0.303, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2860/6434 [6:42:12<8:10:33,  8.24s/it, gpt_loss=0.303, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2860/6434 [6:42:22<8:10:33,  8.24s/it, gpt_loss=0.223, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2861/6434 [6:42:22<8:37:46,  8.69s/it, gpt_loss=0.223, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2861/6434 [6:42:29<8:37:46,  8.69s/it, gpt_loss=0.308, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2862/6434 [6:42:29<8:15:28,  8.32s/it, gpt_loss=0.308, loss_mean=0.264][A[A
+
+Train step of epoch 1:  44%|████▍     | 2862/6434 [6:42:38<8:15:28,  8.32s/it, gpt_loss=0.212, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2863/6434 [6:42:38<8:20:51,  8.42s/it, gpt_loss=0.212, loss_mean=0.259][A[A
+
+Train step of epoch 1:  44%|████▍     | 2863/6434 [6:42:47<8:20:51,  8.42s/it, gpt_loss=0.259, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2864/6434 [6:42:47<8:37:11,  8.69s/it, gpt_loss=0.259, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2864/6434 [6:42:56<8:37:11,  8.69s/it, gpt_loss=0.283, loss_mean=0.261][A[A
+
+Train step of epoch 1:  45%|████▍     | 2865/6434 [6:42:56<8:30:44,  8.59s/it, gpt_loss=0.283, loss_mean=0.261][A[A
+[LID Router Debug] Step: 9300
+Batch Size: 10
+Audio Batch Size: 123
+LID Assignments: [9, 3, 3, 1, 5, 9, 11, 5, 2, 2]
+Active Experts in Batch: {1, 2, 3, 5, 9, 11}
+
+
+Train step of epoch 1:  45%|████▍     | 2865/6434 [6:43:04<8:30:44,  8.59s/it, gpt_loss=0.331, loss_mean=0.268][A[A
+
+Train step of epoch 1:  45%|████▍     | 2866/6434 [6:43:04<8:25:37,  8.50s/it, gpt_loss=0.331, loss_mean=0.268][A[A
+
+Train step of epoch 1:  45%|████▍     | 2866/6434 [6:43:12<8:25:37,  8.50s/it, gpt_loss=0.238, loss_mean=0.265][A[A
+
+Train step of epoch 1:  45%|████▍     | 2867/6434 [6:43:12<8:08:29,  8.22s/it, gpt_loss=0.238, loss_mean=0.265][A[A
+
+Train step of epoch 1:  45%|████▍     | 2867/6434 [6:43:19<8:08:29,  8.22s/it, gpt_loss=0.189, loss_mean=0.258][A[A
+
+Train step of epoch 1:  45%|████▍     | 2868/6434 [6:43:19<7:59:43,  8.07s/it, gpt_loss=0.189, loss_mean=0.258][A[A
+
+Train step of epoch 1:  45%|████▍     | 2868/6434 [6:43:27<7:59:43,  8.07s/it, gpt_loss=0.242, loss_mean=0.256][A[A
+
+Train step of epoch 1:  45%|████▍     | 2869/6434 [6:43:27<7:58:42,  8.06s/it, gpt_loss=0.242, loss_mean=0.256][A[A
+
+Train step of epoch 1:  45%|████▍     | 2869/6434 [6:43:36<7:58:42,  8.06s/it, gpt_loss=0.324, loss_mean=0.263][A[A
+
+Train step of epoch 1:  45%|████▍     | 2870/6434 [6:43:36<8:04:25,  8.16s/it, gpt_loss=0.324, loss_mean=0.263][A[A
+
+Train step of epoch 1:  45%|████▍     | 2870/6434 [6:43:44<8:04:25,  8.16s/it, gpt_loss=0.201, loss_mean=0.257][A[A
+
+Train step of epoch 1:  45%|████▍     | 2871/6434 [6:43:44<8:01:42,  8.11s/it, gpt_loss=0.201, loss_mean=0.257][A[A
+
+Train step of epoch 1:  45%|████▍     | 2871/6434 [6:43:53<8:01:42,  8.11s/it, gpt_loss=0.238, loss_mean=0.255][A[A
+
+Train step of epoch 1:  45%|████▍     | 2872/6434 [6:43:53<8:22:03,  8.46s/it, gpt_loss=0.238, loss_mean=0.255][A[A
+
+Train step of epoch 1:  45%|████▍     | 2872/6434 [6:44:02<8:22:03,  8.46s/it, gpt_loss=0.299, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2873/6434 [6:44:02<8:28:19,  8.56s/it, gpt_loss=0.299, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2873/6434 [6:44:10<8:28:19,  8.56s/it, gpt_loss=0.238, loss_mean=0.257][A[A
+
+Train step of epoch 1:  45%|████▍     | 2874/6434 [6:44:10<8:31:05,  8.61s/it, gpt_loss=0.238, loss_mean=0.257][A[A
+
+Train step of epoch 1:  45%|████▍     | 2874/6434 [6:44:20<8:31:05,  8.61s/it, gpt_loss=0.312, loss_mean=0.263][A[A
+
+Train step of epoch 1:  45%|████▍     | 2875/6434 [6:44:20<8:40:41,  8.78s/it, gpt_loss=0.312, loss_mean=0.263][A[A
+[LID Router Debug] Step: 9310
+Batch Size: 10
+Audio Batch Size: 110
+LID Assignments: [0, 4, 4, 4, 3, 1, 2, 1, 3, 6]
+Active Experts in Batch: {0, 1, 2, 3, 4, 6}
+
+
+Train step of epoch 1:  45%|████▍     | 2875/6434 [6:44:28<8:40:41,  8.78s/it, gpt_loss=0.222, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2876/6434 [6:44:28<8:39:46,  8.77s/it, gpt_loss=0.222, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2876/6434 [6:44:38<8:39:46,  8.77s/it, gpt_loss=0.252, loss_mean=0.258][A[A
+
+Train step of epoch 1:  45%|████▍     | 2877/6434 [6:44:38<8:53:01,  8.99s/it, gpt_loss=0.252, loss_mean=0.258][A[A
+
+Train step of epoch 1:  45%|████▍     | 2877/6434 [6:44:46<8:53:01,  8.99s/it, gpt_loss=0.221, loss_mean=0.254][A[A
+
+Train step of epoch 1:  45%|████▍     | 2878/6434 [6:44:46<8:42:28,  8.82s/it, gpt_loss=0.221, loss_mean=0.254][A[A
+
+Train step of epoch 1:  45%|████▍     | 2878/6434 [6:44:56<8:42:28,  8.82s/it, gpt_loss=0.309, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  45%|████▍     | 2879/6434 [6:44:56<8:52:45,  8.99s/it, gpt_loss=0.309, loss_mean=0.26][A[A
+
+Train step of epoch 1:  45%|████▍     | 2879/6434 [6:45:05<8:52:45,  8.99s/it, gpt_loss=0.289, loss_mean=0.263][A[A
+
+Train step of epoch 1:  45%|████▍     | 2880/6434 [6:45:05<9:03:09,  9.17s/it, gpt_loss=0.289, loss_mean=0.263][A[A
+
+Train step of epoch 1:  45%|████▍     | 2880/6434 [6:45:14<9:03:09,  9.17s/it, gpt_loss=0.236, loss_mean=0.26] [A[A
+
+Train step of epoch 1:  45%|████▍     | 2881/6434 [6:45:14<8:46:53,  8.90s/it, gpt_loss=0.236, loss_mean=0.26][A[A
+
+Train step of epoch 1:  45%|████▍     | 2881/6434 [6:45:23<8:46:53,  8.90s/it, gpt_loss=0.25, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2882/6434 [6:45:23<9:04:58,  9.21s/it, gpt_loss=0.25, loss_mean=0.259][A[A
+
+Train step of epoch 1:  45%|████▍     | 2882/6434 [6:45:31<9:04:58,  9.21s/it, gpt_loss=0.25, loss_mean=0.258][A[A
+
+Train step of epoch 1:  45%|████▍     | 2883/6434 [6:45:31<8:30:16,  8.62s/it, gpt_loss=0.25, loss_mean=0.258][A[A[2026-02-07 13:47:26,166] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 707055
+[2026-02-07 13:47:33,525] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 707056
+[2026-02-07 13:47:33,527] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 707057
+[2026-02-07 13:47:33,528] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 707058
+[2026-02-07 13:47:33,529] [INFO] [launch.py:325:sigkill_handler] Main process received SIGTERM, exiting
diff --git a/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs/events.out.tfevents.1770393363.t-20260205111348-dcgpv-worker-0.707055.0 b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs/events.out.tfevents.1770393363.t-20260205111348-dcgpv-worker-0.707055.0
new file mode 100644
index 0000000000000000000000000000000000000000..c692cec5744ebf069954071c5cd112c686e48809
--- /dev/null
+++ b/ckpts/qwen3-1.7b-whisper-260205_12x1000h_lite30h_zipper_lora_independent_audio_init_baseline_with_lid_embedding_non_chunked/tb_logs/events.out.tfevents.1770393363.t-20260205111348-dcgpv-worker-0.707055.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:698f5a12bdb2e9594719f624b004bb29f78673d3359ac593bc01f7ce6085c2c5
+size 884901