diff --git a/flame/__pycache__/__init__.cpython-312.pyc b/flame/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6161808beae3801061789786255e3c5efb746474 Binary files /dev/null and b/flame/__pycache__/__init__.cpython-312.pyc differ diff --git a/flame/__pycache__/config_manager.cpython-312.pyc b/flame/__pycache__/config_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c95c518012bc7f934d64bfdbf5b206aa56c2e715 Binary files /dev/null and b/flame/__pycache__/config_manager.cpython-312.pyc differ diff --git a/flame/components/__pycache__/checkpoint.cpython-312.pyc b/flame/components/__pycache__/checkpoint.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b3c78561a8b7cf755c1b850ccd73aa2adfdccaa Binary files /dev/null and b/flame/components/__pycache__/checkpoint.cpython-312.pyc differ diff --git a/flame/components/checkpoint.py b/flame/components/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a21fbbcaf9503c1f0f8d965acce420b223201b --- /dev/null +++ b/flame/components/checkpoint.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from dataclasses import dataclass, field +from datetime import timedelta +from io import BytesIO +from typing import Any, Dict, List + +import torch +from torch.distributed.checkpoint.stateful import Stateful + + +@dataclass +class TrainState(Stateful): + step: int = 0 + skipped_step: int = 0 + token: int = 0 + elapsed: timedelta = timedelta(0) + global_avg_losses: List[float] = field(default_factory=list) + global_max_losses: List[float] = field(default_factory=list) + log_steps: List[int] = field(default_factory=list) + + def state_dict(self) -> Dict[str, Any]: + # Only checkpoint global_avg_losses and global_max_losses per log frequency + # to avoid sync overhead in every iteration. + global_avg_losses_bytes = BytesIO() + torch.save(self.global_avg_losses, global_avg_losses_bytes) + global_max_losses_bytes = BytesIO() + torch.save(self.global_max_losses, global_max_losses_bytes) + log_steps_bytes = BytesIO() + torch.save(self.log_steps, log_steps_bytes) + return { + "step": torch.tensor(self.step, dtype=torch.int32), + "skipped_step": torch.tensor(self.skipped_step, dtype=torch.int32), + "token": torch.tensor(self.token, dtype=torch.int64), + "elapsed": self.elapsed, + "global_avg_losses": global_avg_losses_bytes, + "global_max_losses": global_max_losses_bytes, + "log_steps": log_steps_bytes, + } + + def load_state_dict(self, state_dict) -> None: + self.step = state_dict["step"].item() + self.skipped_step = state_dict.get("skipped_step", 0).item() + self.token = state_dict["token"].item() + self.elapsed = state_dict["elapsed"] + state_dict["global_avg_losses"].seek(0) + self.global_avg_losses = torch.load( + state_dict["global_avg_losses"], weights_only=False + ) + state_dict["global_max_losses"].seek(0) + self.global_max_losses = torch.load( + state_dict["global_max_losses"], weights_only=False + ) + state_dict["log_steps"].seek(0) + self.log_steps = torch.load(state_dict["log_steps"], weights_only=False) diff --git a/flame/models/__init__.py 
b/flame/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/flame/models/__pycache__/__init__.cpython-312.pyc b/flame/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6edf19a299d3f61335a04fdd8bb14d9a0b077c69 Binary files /dev/null and b/flame/models/__pycache__/__init__.cpython-312.pyc differ diff --git a/flame/models/fla.toml b/flame/models/fla.toml new file mode 100644 index 0000000000000000000000000000000000000000..afd3a212bbef8206c10714307c6df738051e83db --- /dev/null +++ b/flame/models/fla.toml @@ -0,0 +1,67 @@ +[model] +config = "fla-hub/transformer-1.3B-100B" +tokenizer_path = "fla-hub/transformer-1.3B-100B" + +[job] +dump_folder = "exp" +print_args = true + +[training] +batch_size = 32 +seq_len = 2048 +context_len = 2048 +gradient_accumulation_steps = 1 +steps = 20480 +max_norm = 1.0 +skip_nan_inf = true +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = -1 +tensor_parallel_degree = 1 +compile = false +dataset = "HuggingFaceFW/fineweb-edu" +dataset_name = "default" +num_workers = 32 +pin_memory = false +persistent_workers = false +prefetch_factor = 2 +seed = 42 +varlen = false + +[optimizer] +name = "AdamW" +eps = 1e-15 +lr = 3e-4 + +[lr_scheduler] +warmup_steps = 1024 +decay_type = "cosine" +lr_min = 0.1 + +[checkpoint] +enable_checkpoint = true +folder = "checkpoint" +interval_type = "steps" +interval = 2048 +model_weights_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 512 + +[metrics] +log_freq = 32 +enable_wandb = true + +[experimental] +context_parallel_degree = 1 +pipeline_parallel_degree = 1 + +[float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false + +[activation_checkpoint] +mode = 
"none" \ No newline at end of file diff --git a/flame/models/parallelize_fla.py b/flame/models/parallelize_fla.py new file mode 100644 index 0000000000000000000000000000000000000000..37178af1bf365b3f5179cefc62000bf8f2f4ded3 --- /dev/null +++ b/flame/models/parallelize_fla.py @@ -0,0 +1,550 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file applies the PT-D parallelisms (except pipeline parallelism) and various +# training techniques (e.g. activation checkpointing and compile) to the Llama model. + +from collections import defaultdict + +import torch +import torch.nn as nn +from torch.distributed import DeviceMesh +from torch.distributed._composable.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard +from torch.distributed._composable.replicate import replicate +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper as ptd_checkpoint_wrapper +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + PrepareModuleOutput, + RowwiseParallel, + SequenceParallel, + parallelize_module +) + +from fla.modules.fused_linear_cross_entropy import LinearLossParallel +from fla.modules.mlp import SwiGLULinearParallel +from fla.modules.parallel import PrepareModuleWeight +from torchtitan.config_manager import TORCH_DTYPE_MAP, JobConfig +from torchtitan.distributed.parallel_dims import ParallelDims +from torchtitan.tools.logging import logger + + +def parallelize_fla( + model: nn.Module, + world_mesh: DeviceMesh, + parallel_dims: ParallelDims, + job_config: JobConfig, +): + """ + Apply tensor parallelism, activation checkpointing, torch.compile, and data + parallelism to the model. + + NOTE: The passed-in model preferably should be on meta device. 
Otherwise, + the model must fit on GPU or CPU memory. + """ + + if parallel_dims.tp_enabled: + if ( + job_config.experimental.enable_async_tensor_parallel + and not job_config.training.compile + ): + raise RuntimeError("Async TP requires --training.compile") + enable_float8_linear = "float8" in job_config.model.converters + apply_tp( + model, + world_mesh["tp"], + loss_parallel=parallel_dims.loss_parallel_enabled, + enable_float8=enable_float8_linear, + enable_async_tp=job_config.experimental.enable_async_tensor_parallel, + ) + + if job_config.activation_checkpoint.mode != "none": + apply_ac(model, job_config.activation_checkpoint) + + # turn on per-block compile after AC wrapping and before FSDP + if job_config.training.compile: + apply_compile(model) + + if ( + parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled + ): # apply FSDP or HSDP, potentially with Context Parallel + if parallel_dims.dp_replicate_enabled: + dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp") + else: + dp_mesh_dim_names = ("dp_shard_cp",) + + apply_fsdp( + model, + world_mesh[tuple(dp_mesh_dim_names)], + param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], + reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce], + pp_enabled=parallel_dims.pp_enabled, + cpu_offload=job_config.training.enable_cpu_offload, + reshard_after_forward_policy=job_config.training.fsdp_reshard_after_forward, + ) + + if parallel_dims.dp_replicate_enabled: + logger.info("Applied HSDP to the model") + else: + logger.info("Applied FSDP to the model") + + if parallel_dims.cp_enabled: + logger.info("Applied Context Parallel to the model") + + if job_config.training.enable_cpu_offload: + logger.info("Applied CPU Offloading to the model") + elif parallel_dims.dp_replicate_enabled: + if world_mesh.ndim > 1: + raise RuntimeError("DDP has not supported > 1D parallelism") + apply_ddp( + model, + world_mesh, + enable_compile=job_config.training.compile, + 
enable_compiled_autograd=job_config.experimental.enable_compiled_autograd, + ) + + +class TPPlan: + def __init__( + self, + model=None, + loss_parallel=False, + enable_float8=False, + ): + self.model = model + self.loss_parallel = loss_parallel + self.enable_float8 = enable_float8 + self.base_model_prefix = getattr(model, "base_model_prefix", "model") + + # TODO(vkuzo): once float8 configuration supports delayed scaling, + # add a check here to enforce supported float8 all-gather configurations + # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there + try: + from torchao.float8.float8_tensor_parallel import ( + Float8ColwiseParallel, + Float8RowwiseParallel, + PrepareFloat8ModuleInput + ) + except ImportError: + Float8ColwiseParallel = None + Float8RowwiseParallel = None + PrepareFloat8ModuleInput = None + if self.enable_float8 and Float8ColwiseParallel is not None: + self.rowwise_parallel = Float8RowwiseParallel + self.colwise_parallel = Float8ColwiseParallel + self.prepare_module_input = PrepareFloat8ModuleInput + self.prepare_module_output = PrepareModuleOutput + else: + self.rowwise_parallel = RowwiseParallel + self.colwise_parallel = ColwiseParallel + self.prepare_module_input = PrepareModuleInput + self.prepare_module_output = PrepareModuleOutput + + @property + def model_plan(self): + plans = { + f"{self.base_model_prefix}.embeddings": RowwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(1), + ), + f"{self.base_model_prefix}.norm": SequenceParallel(), + } + if self.loss_parallel: + plans.update( + { + "lm_head": ColwiseParallel( + input_layouts=Shard(1), + output_layouts=Shard(-1) if self.loss_parallel else Replicate(), + use_local_output=not self.loss_parallel, + ), + } + ) + else: + plans.update( + { + "lm_head": PrepareModuleWeight(layouts=Replicate()), + "criterion": LinearLossParallel(), + } + ) + return plans + + @property + def layer_plan(self): + return { + "attn_norm": SequenceParallel(), + 
**self.attn_plan, + "mlp_norm": SequenceParallel(), + **self.mlp_plan, + } + + @property + def attn_plan(self): + raise NotImplementedError( + f"TP plans for token mixing layers of {self.model.config.model_type} not implemented" + ) + + @property + def mlp_plan(self): + return { + "mlp": self.prepare_module_input( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "mlp.gate_proj": self.colwise_parallel(), + "mlp.up_proj": self.colwise_parallel(), + "mlp.down_proj": self.rowwise_parallel(output_layouts=Shard(1)), + "mlp.swiglu_linear": SwiGLULinearParallel(output_layouts=Shard(1)), + } + + +class TransformerTPPlan(TPPlan): + + @property + def attn_plan(self): + return { + "attn": self.prepare_module_input( + input_kwarg_layouts={"hidden_states": Shard(1)}, + desired_input_kwarg_layouts={"hidden_states": Replicate()}, + ), + "attn.q_proj": self.colwise_parallel(), + "attn.k_proj": self.colwise_parallel(), + "attn.v_proj": self.colwise_parallel(), + "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)), + } + + +class GLATPPlan(TPPlan): + + @property + def attn_plan(self): + return { + "attn": self.prepare_module_input( + input_kwarg_layouts={"hidden_states": Shard(1)}, + desired_input_kwarg_layouts={"hidden_states": Replicate()}, + ), + "attn.q_proj": self.colwise_parallel(), + "attn.k_proj": self.colwise_parallel(), + "attn.v_proj": self.colwise_parallel(), + "attn.g_proj": self.colwise_parallel(), + "attn.gk_proj.0": PrepareModuleWeight(layouts=Replicate()), + "attn.gk_proj.1": self.colwise_parallel(), + "attn.g_norm": SequenceParallel(sequence_dim=-1), + "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)), + } + + +TP_PLAN_MAP = {"transformer": TransformerTPPlan, "gla": GLATPPlan} + + +def apply_tp( + model: nn.Module, + tp_mesh: DeviceMesh, + loss_parallel: bool, + enable_float8: bool, + enable_async_tp: bool, +): + """Apply tensor parallelism.""" + # 1. 
Parallelize the embedding and shard its outputs (which are the first + # transformer block's inputs) + # 2. Parallelize the root norm layer over the sequence dim + # 3. Parallelize the final linear output layer + tp_plan = TP_PLAN_MAP[model.config.model_type]( + model, loss_parallel=loss_parallel, enable_float8=enable_float8 + ) + parallelize_module(model, tp_mesh, tp_plan.model_plan) + + blocks = get_blocks(model) + if blocks is None: + logger.warning("No block found for tensor parallelism") + else: + for _, block in enumerate(blocks): + parallelize_module( + module=block, + device_mesh=tp_mesh, + parallelize_plan=tp_plan.layer_plan, + ) + + if enable_async_tp: + from torch.distributed._symmetric_memory import enable_symm_mem_for_group + + torch._inductor.config._micro_pipeline_tp = True + enable_symm_mem_for_group(tp_mesh.get_group().group_name) + + logger.info( + f"Applied {'Float8 ' if enable_float8 else ''}{'Async ' if enable_async_tp else ''}" + "Tensor Parallelism to the model" + ) + + +# for selective op activation checkpointing +_save_list = { + torch.ops.aten.mm.default, + torch.ops.aten._scaled_dot_product_efficient_attention.default, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops._c10d_functional.reduce_scatter_tensor.default, + # for low precision training, it's useful to always save + # the result of max, since the absolute maximum is + # used to compute the scaling factor for quantization. + torch.ops.aten.max.default, +} + + +def _apply_ac_to_block(module: nn.Module, ac_config): + valid_ac_modes = ("full", "selective") + if ac_config.mode not in valid_ac_modes: + raise ValueError( + f"Invalid AC mode: {ac_config.mode}. 
Valid modes: {valid_ac_modes}" + ) + + if ac_config.mode == "full": + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + + assert ac_config.mode == "selective", f"{ac_config.mode}" + use_op_sac = ac_config.selective_ac_option == "op" + use_layer_sac = ac_config.selective_ac_option.isdigit() + if not use_op_sac and not use_layer_sac: + raise ValueError( + f"Invalid selective AC option: {ac_config.selective_ac_option}. " + f"Valid options: 'op' or a positive int representing layer frequency" + ) + if use_op_sac: + from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts + + def _get_custom_policy(meta): + def _custom_policy(ctx, func, *args, **kwargs): + mode = "recompute" if ctx.is_recompute else "forward" + mm_count_key = f"{mode}_mm_count" + if func == torch.ops.aten.mm.default: + meta[mm_count_key] += 1 + # Saves output of all compute ops, except every second mm + to_save = func in _save_list and not ( + func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0 + ) + return ( + CheckpointPolicy.MUST_SAVE + if to_save + else CheckpointPolicy.PREFER_RECOMPUTE + ) + + return _custom_policy + + def selective_checkpointing_context_fn(): + meta = defaultdict(int) + return create_selective_checkpoint_contexts(_get_custom_policy(meta)) + + return ptd_checkpoint_wrapper( + module, + context_fn=selective_checkpointing_context_fn, + preserve_rng_state=False, + ) + elif use_layer_sac: + # Checkpoint every `ac_freq` of the modules passed to this function + ac_freq = int(ac_config.selective_ac_option) + ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0) + ptd_checkpoint_wrapper._count += 1 + if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0: + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + else: + return module + + +def apply_ac(model: nn.Module, ac_config): + """Apply activation checkpointing to the model.""" + blocks = get_blocks(model) + if blocks is None: + logger.warning("No block 
found for activation checkpointing") + return + + for layer_id, block in blocks.named_children(): + block = _apply_ac_to_block(block, ac_config) + blocks.register_module(layer_id, block) + + logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") + + +def apply_compile(model: nn.Module): + """ + Apply torch.compile to each block, which makes compilation efficient due to + repeated structure. Alternatively one can compile the whole model (after applying DP). + """ + + blocks = get_blocks(model) + if blocks is None: + logger.warning("No block found for torch.compile") + else: + for layer_id, block in blocks.named_children(): + block = torch.compile(block) + blocks.register_module(layer_id, block) + logger.info("Compiling each block with torch.compile") + + real_model = get_model(model) + + logger.info("Compiling the embedding, norm, and lm_head layers with torch.compile") + embeddings_key = get_components_name(real_model, "tok_embeddings") + if embeddings_key is not None: + embeddings = torch.compile(getattr(real_model, embeddings_key), fullgraph=True) + real_model.register_module(embeddings_key, embeddings) + + norm_key = get_components_name(real_model, "norm") + if norm_key is not None: + norm = torch.compile(getattr(real_model, norm_key), fullgraph=True) + real_model.register_module(norm_key, norm) + + lm_head_key = get_components_name(model, "lm_head") + if lm_head_key is not None: + lm_head = torch.compile(getattr(model, lm_head_key), fullgraph=True) + model.register_module(lm_head_key, lm_head) + + logger.info("Compiling the entire model with torch.compile") + model = torch.compile(model) + + +def apply_fsdp( + model: nn.Module, + dp_mesh: DeviceMesh, + param_dtype: torch.dtype, + reduce_dtype: torch.dtype, + pp_enabled: bool, + cpu_offload: bool = False, + reshard_after_forward_policy: str = "default", +): + """ + Apply data parallelism (via FSDP2) to the model. + + Args: + model (nn.Module): The model to apply data parallelism to. 
+ dp_mesh (DeviceMesh): The device mesh to use for data parallelism. + param_dtype (torch.dtype): The data type to use for model parameters. + reduce_dtype (torch.dtype): The data type to use for reduction operations. + pp_enabled (bool): Whether pipeline parallelism is enabled. + cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. + reshard_after_forward_policy (str, optional): + The policy to use for resharding after forward pass. Defaults to "default". + Other options: "never", "always". + - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. + - "always" will enable `reshard_after_forward` for all forward passes. + - "never" will disable `reshard_after_forward` for all forward passes. + + """ + mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + if cpu_offload: + fsdp_config["offload_policy"] = CPUOffloadPolicy() + + blocks = get_blocks(model) + if blocks is None: + logger.warning("No block found for FSDP") + else: + total_blocks = len(blocks) + for layer_id, block in enumerate(blocks): + if reshard_after_forward_policy == "always": + reshard_after_forward = True + elif reshard_after_forward_policy == "never": + reshard_after_forward = False + elif reshard_after_forward_policy == "default": + if pp_enabled: + # For PP, do not reshard after forward to avoid per-microbatch + # all-gathers, which can be expensive and non-overlapped + reshard_after_forward = False + else: + # As an optimization, do not reshard after forward for the last + # transformer block since FSDP would prefetch it immediately + reshard_after_forward = int(layer_id) < total_blocks - 1 + else: + raise ValueError( + f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." 
+ ) + fully_shard( + block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled) + + +def apply_ddp( + model: nn.Module, + dp_mesh: DeviceMesh, + enable_compile: bool, + enable_compiled_autograd: bool, +): + if enable_compile: + if enable_compiled_autograd: + torch._dynamo.config.optimize_ddp = ( + "python_reducer_without_compiled_forward" + ) + else: + torch._dynamo.config.optimize_ddp = "ddp_optimizer" + + replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100) + + logger.info("Applied DDP to the model") + + +def get_model(model): + base_model_prefix = getattr(model, "base_model_prefix", "model") + if not hasattr(model, base_model_prefix): + return None + model = getattr(model, base_model_prefix) + return model + + +def get_blocks(model): + # TODO[flame]: adapt for network not using 'layers' attribute + model = get_model(model) + if not hasattr(model, "layers"): + logger.warning('no "layers" in model can be found') + return None + return model.layers + + +def get_components_name(model, component_name): + """ + We try to catch tok_embeddings, norm layers and lm_head layers + We do not catch the layer names in the blocks, for blocks see `get_blocks` + We assume the model has the following structure: + LlamaForCausalLM: + Model: + embed_tokens, + layers, + norm, + lm_head + *** + so, to search 'tok_embeddings' and 'norm' we need to pass `get_model(model)` + and for 'lm_head' we need to pass `model` + *** + """ + + if component_name == "tok_embeddings": + if hasattr(model, "tok_embeddings"): + return "tok_embeddings" + elif hasattr(model, "embed_tokens"): + return "embed_tokens" + elif hasattr(model, "embeddings"): + return "embeddings" + else: + logger.warning("No tok_embeddings found in model") + return None + + elif component_name == "norm": + if hasattr(model, "norm"): + return "norm" + elif hasattr(model, "norms"): + return "norms" + elif hasattr(model, "layernorm"): + 
return "layernorm" + else: + logger.warning("No norm found in model") + return None + + elif component_name == "lm_head": + if hasattr(model, "lm_head"): + return "lm_head" + else: + logger.warning("No lm_head found in model") + return None diff --git a/flame/models/pipeline_fla.py b/flame/models/pipeline_fla.py new file mode 100644 index 0000000000000000000000000000000000000000..7f2b29f521c25b607ec49b04bb240f59c61641f6 --- /dev/null +++ b/flame/models/pipeline_fla.py @@ -0,0 +1,162 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file applies the PT-D pipeline parallelism to the Llama model. + +import copy +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn +from torch.distributed import DeviceMesh +from torch.distributed.pipelining import PipelineStage +from torch.distributed.pipelining.schedules import ScheduleZBVZeroBubble, _PipelineSchedule, get_schedule_class +from transformers import PretrainedConfig + +from flame.models.parallelize_fla import get_blocks, get_components_name, get_model +from torchtitan.config_manager import JobConfig +from torchtitan.distributed.parallel_dims import ParallelDims +from torchtitan.distributed.pipeline import build_pipeline_schedule, generate_split_points, stage_ids_this_rank +from torchtitan.tools.logging import logger + +DeviceType = Union[int, str, torch.device] + + +def pipeline_fla( + model: nn.Module, + pp_mesh: DeviceMesh, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: DeviceType, + model_config: PretrainedConfig, + loss_fn: Callable[..., torch.Tensor], +) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: + stages, models = pipeline_fla_manual_split( + model, pp_mesh, parallel_dims, job_config, device, model_config + ) + + pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) 
+ + # This is used in the train loop to determine whether to pass in the input_ids and labels + has_first_stage = False + has_last_stage = False + for stage in stages: + if stage.is_first: + has_first_stage = True + if stage.is_last: + has_last_stage = True + + return pp_schedule, models, has_first_stage, has_last_stage + + +def pipeline_fla_manual_split( + whole_model: nn.Module, + pp_mesh: DeviceMesh, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: DeviceType, + model_config: PretrainedConfig, +) -> tuple[list[PipelineStage], list[nn.Module]]: + """ + This API extracts one torch.nn.Module objects for the part of the model configured to run inside this stage. + + It wraps the model chunk in a ManualPipelineStage object and returns both the stage and model objects. + + The stage object is used to create a pipeline schedule, and the model object can be used for applying SPMD + parallelism. + """ + pp_rank = pp_mesh.get_local_rank() + pp_size = pp_mesh.size() + + splits = ( + job_config.experimental.pipeline_parallel_split_points + or generate_split_points( + job_config, parallel_dims.pp, model_config.num_hidden_layers + ) + ) + + def _build_stage( + stage_idx: int, + start_layer: Optional[str], + stop_layer: Optional[str], + is_first: bool = False, + is_last: bool = False, + ) -> tuple[PipelineStage, nn.Module]: + model = copy.deepcopy(whole_model) + if not is_first: + # we do `model.tok_embeddings = None` here + real_model = get_model(model) + tok_embeddings_name = get_components_name(real_model, "tok_embeddings") + setattr(real_model, tok_embeddings_name, None) + + drop_layers = start_layer is not None + # Get module dictionary from get_blocks(model) + # and Create a list of keys before modifying dictionary + module_dict = get_blocks(model)._modules # Store reference + layer_names = list(module_dict.keys()) + + # Iterate over the list of keys instead of `_modules.items()` + for name in layer_names: + # Dynamically determine prefix (blocks.* or 
layers.*) + prefix = start_layer.split(".")[0] if start_layer else "layers" + layer_name = f"{prefix}.{name}" # Construct the correct name format + + # Ensure `drop_layers` activation is based on actual naming + if layer_name == start_layer: + drop_layers = False + if layer_name == stop_layer: + drop_layers = True + + # Delete layer if drop_layers is active + if drop_layers: + del module_dict[name] # Safe deletion from stored dictionary + + if not is_last: + # we do `model.norm = None` and `model.output = None` + real_model = get_model(model) + norm_name = get_components_name(real_model, "norm") + setattr(real_model, norm_name, None) + + head_name = get_components_name(model, "lm_head") + setattr(model, head_name, None) + + stage = PipelineStage( + model, + stage_idx, + num_stages, + device, + group=pp_mesh.get_group("pp"), + ) + return stage, model + + num_stages = len(splits) + 1 + stage_idx = pp_rank + + stages = [] + models = [] + + schedule_class = get_schedule_class( + job_config.experimental.pipeline_parallel_schedule + ) + style = "v" if schedule_class == ScheduleZBVZeroBubble else "loop" + + for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style): + start_layer = splits[stage_idx - 1] if stage_idx > 0 else None + stop_layer = splits[stage_idx] if stage_idx < num_stages - 1 else None + stage, model_chunk = _build_stage( + stage_idx, + start_layer, + stop_layer, + is_first=stage_idx == 0, + is_last=stage_idx == num_stages - 1, + ) + logger.info( + f"PP rank {pp_rank} is building stage_idx {stage_idx}" + f" with start_layer {start_layer}, stop_layer {stop_layer}" + ) + stages.append(stage) + models.append(model_chunk) + return stages, models diff --git a/flame/tools/__pycache__/utils.cpython-312.pyc b/flame/tools/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45b78019f2a639492b3de888a09ebf17c6a341b2 Binary files /dev/null and b/flame/tools/__pycache__/utils.cpython-312.pyc differ 
def get_nparams_and_flops(model: nn.Module, model_config, seq_len: int) -> tuple[int, int]:
    """Count parameters and estimate training FLOPs per token.

    Args:
        model: The module whose parameters are counted.
        model_config: Object exposing ``num_hidden_layers``, ``hidden_size`` and
            either ``num_heads`` or ``num_attention_heads`` (falls back to 1
            head with a warning when neither attribute exists).
        seq_len: Sequence length used for the attention term.

    Returns:
        ``(nparams, num_flops_per_token)`` where ``nparams`` counts every
        parameter of ``model`` and the FLOPs estimate is
        ``6 * (nparams - nparams_embedding) + 12 * layers * heads * head_dim * seq_len``.
    """
    nparams = sum(p.numel() for p in model.parameters())

    # Walk the whole module tree: ``model.children()`` only sees direct
    # children, so an ``nn.Embedding`` held by a nested sub-module was never
    # excluded from the FLOPs estimate. Key by parameter identity so a weight
    # shared between embedding modules is subtracted only once, matching how
    # ``model.parameters()`` de-duplicates shared parameters.
    embedding_param_sizes = {
        id(param): param.numel()
        for module in model.modules()
        if isinstance(module, nn.Embedding)
        for param in module.parameters()
    }
    nparams_embedding = sum(embedding_param_sizes.values())

    if hasattr(model_config, "num_heads"):
        num_heads = model_config.num_heads
    elif hasattr(model_config, "num_attention_heads"):
        num_heads = model_config.num_attention_heads
    else:
        num_heads = 1
        logger.warning("num_heads not found in model_config, defaulting to 1. ")

    num_layers = model_config.num_hidden_layers
    head_dim = model_config.hidden_size // num_heads

    # Reasoning behind the factor of 12 for the self-attention part of the formula:
    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
    # 2. the flash attention does 1 more matmul recomputation in the backward
    #    but recomputation should not be counted in calculating MFU           (+0)
    # 3. each matmul performs 1 multiplication and 1 addition                 (*2)
    # 4. we follow the convention and do not account for sparsity in causal attention
    num_flops_per_token = (
        6 * (nparams - nparams_embedding)
        + 12 * num_layers * num_heads * head_dim * seq_len
    )

    return nparams, num_flops_per_token
config.save_pretrained(path) + logger.info(f"Loading the tokenizer from {tokenizer}") + tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True) + logger.info(f"Saving the tokenizer to {path}") + tokenizer.save_pretrained(path) + + with tempfile.TemporaryDirectory() as tmpdir: + # base_checkpoint_dir = os.path.dirname(path) + base_checkpoint_dir = path + checkpoint = os.path.join(base_checkpoint_dir, f'checkpoint/step-{step}') + checkpoint_path = os.path.join(tmpdir, 'checkpoint.pt') + logger.info(f"Saving the distributed checkpoint to {checkpoint_path}") + dcp_to_torch_save(checkpoint, checkpoint_path) + + logger.info(f"Initializing the model from config\n{config}") + model = AutoModelForCausalLM.from_config(config) + logger.info(model) + logger.info("Loading state dict from the checkpoint") + + # Add datetime.timedelta and io.BytesIO to safe globals + torch.serialization.add_safe_globals([timedelta, io.BytesIO]) + # torch.load now with default weights_only=True will work + model.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['model']) + + logger.info(f"Saving the model to {path}") + model.save_pretrained(path) + + +if __name__ == "__main__": + init_logger() + parser = argparse.ArgumentParser("Convert DCP format model weights to huggingface-style.") + parser.add_argument("--path", type=str, required=True) + parser.add_argument("--step", type=int, required=True) + parser.add_argument("--config", type=str, required=True) + parser.add_argument("--tokenizer", type=str, required=True) + args = parser.parse_args() + save_pretrained(args.path, args.step, args.config, args.tokenizer) diff --git a/flame/utils/convert_hf_to_dcp.py b/flame/utils/convert_hf_to_dcp.py new file mode 100644 index 0000000000000000000000000000000000000000..bab94ebf80ea8822139b851e0c64b95854c2e78b --- /dev/null +++ b/flame/utils/convert_hf_to_dcp.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import argparse 
+from pathlib import Path + +import torch +import torch.distributed.checkpoint as DCP +from transformers import AutoModelForCausalLM + +import fla # noqa +from torchtitan.tools.logging import init_logger, logger + + +@torch.inference_mode() +def convert_hf_weights(model: str, checkpoint: str): + logger.info(f"Loading model from {model}") + model = AutoModelForCausalLM.from_pretrained(model) + state_dict = model.state_dict() + + logger.info(f"Writing to DCP at '{checkpoint}'") + checkpoint.mkdir(parents=True, exist_ok=True) + storage_writer = DCP.filesystem.FileSystemWriter(checkpoint, thread_count=8) + DCP.save({"model": state_dict}, storage_writer=storage_writer) + + +if __name__ == "__main__": + init_logger() + parser = argparse.ArgumentParser(description="Convert huggingface-style model weights to DCP format.") + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--checkpoint", type=Path, required=True) + args = parser.parse_args() + + convert_hf_weights(args.model, args.checkpoint) diff --git a/flame/utils/hf_utils.py b/flame/utils/hf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c8954965dbde4c33131bdf05811fdf803c247168 --- /dev/null +++ b/flame/utils/hf_utils.py @@ -0,0 +1,77 @@ +import os +import re +from huggingface_hub import HfApi, HfFolder, logging as hf_logging, create_repo +from torchtitan.tools.logging import logger + +def upload_checkpoint_to_hf( + local_path: str, + step: int, + hf_repo_id_for_run: str, + hf_keep_latest_k: int, + upload_format: str +): + """Uploads a checkpoint directory to HF Hub and manages retention.""" + if not os.path.isdir(local_path): + logger.error(f"Local path for upload does not exist or is not a directory: {local_path}") + return + + api = HfApi() + token = HfFolder.get_token() + if not token: + logger.warning("Hugging Face Hub token not found. Skipping upload. 
Login via `huggingface-cli login` or set HF_TOKEN.") + return + + # --- Ensure the specific repository for this run exists --- + try: + logger.info(f"Ensuring repository {hf_repo_id_for_run} exists...") + # Use create_repo which handles creation only if it doesn't exist + create_repo(repo_id=hf_repo_id_for_run, token=token, repo_type="model", exist_ok=True) + logger.info(f"Repository {hf_repo_id_for_run} ensured.") + except Exception as e: + logger.error(f"Failed to create or ensure repository {hf_repo_id_for_run}: {e}", exc_info=True) + return # Stop if repo interaction fails + + commit_message = f"Upload {upload_format.upper()} checkpoint step {step}" + path_in_repo = f"step-{step}" + + logger.info(f"Uploading {local_path} to {hf_repo_id_for_run}/{path_in_repo} on Hugging Face Hub...") + try: + api.upload_folder( + folder_path=local_path, + path_in_repo=path_in_repo, + repo_id=hf_repo_id_for_run, + repo_type="model", + commit_message=commit_message, + token=token, + ) + logger.info(f"Successfully uploaded step {step} to {hf_repo_id_for_run}.") + except Exception as e: + logger.error(f"Failed to upload checkpoint step {step} to {hf_repo_id_for_run}: {e}", exc_info=True) + if hf_keep_latest_k > 0: + logger.info(f"Cleaning up old checkpoints on {hf_repo_id_for_run}, keeping latest {hf_keep_latest_k}") + try: + repo_files = api.list_repo_tree(hf_repo_id_for_run, repo_type="model", token=token, recursive=False) + step_folders = [ + item.path for item in repo_files + if item.path.startswith("step-") and item.path[5:].isdigit() + ] + + step_folders.sort(key=lambda x: int(x.split('-')[1]), reverse=True) + + if len(step_folders) > hf_keep_latest_k: + folders_to_delete = step_folders[hf_keep_latest_k:] + logger.info(f"Found {len(step_folders)} checkpoints on Hub. 
Deleting {len(folders_to_delete)} older ones: {folders_to_delete}") + for folder in folders_to_delete: + # Deleting requires repo_id, path_in_repo, and token + api.delete_folder( + repo_id=hf_repo_id_for_run, + path_in_repo=folder, + repo_type="model", + commit_message=f"Delete old checkpoint {folder}", + token=token + ) + logger.info("Hub cleanup complete.") + else: + logger.info("No old checkpoints found on Hub to delete.") + except Exception as e: + logger.error(f"Error during Hub checkpoint cleanup for {hf_repo_id_for_run}: {e}", exc_info=True) \ No newline at end of file diff --git a/logs/none_g37i6vbo/attempt_0/6/stderr.log b/logs/none_g37i6vbo/attempt_0/6/stderr.log new file mode 100644 index 0000000000000000000000000000000000000000..9e1e19b2cb578471910479625d2d526a36567190 --- /dev/null +++ b/logs/none_g37i6vbo/attempt_0/6/stderr.log @@ -0,0 +1,4497 @@ +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +2025-09-10 06:23:04.673016: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +2025-09-10 06:23:04.739255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. +2025-09-10 06:23:06.106526: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +wandb: Appending key for api.wandb.ai to your netrc file: /home/cvm/.netrc +wandb: Currently logged in as: zaydzuhri to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured. +[titan] 2025-09-10 06:23:08,131 - root - INFO - Starting job: default job +[titan] 2025-09-10 06:23:08,131 - root - INFO - { + "activation_checkpoint": { + "mode": "none", + "selective_ac_option": "2" + }, + "activation_offload": { + "mode": "none" + }, + "checkpoint": { + "async_mode": "disabled", + "convert_to_hf_on_save": false, + "create_seed_checkpoint": false, + "enable_checkpoint": true, + "exclude_from_loading": [], + "export_dtype": "float32", + "folder": "checkpoint", + "hf_repo_base_name": "zaydzuhri/top-code-7B-4096-batch8x2-steps40000", + "hf_upload_enabled": true, + "hf_upload_format": "dcp", + "interval": 5000, + "interval_type": "steps", + "keep_latest_k": 0, + "load_step": -1, + "model_weights_only": false + }, + "comm": { + "init_timeout_seconds": 6000, + "trace_buf_size": 20000, + "train_timeout_seconds": 6000 + }, + "experimental": { + "context_parallel_degree": 1, + "context_parallel_rotate_method": "allgather", + "custom_model_path": "", + "enable_async_tensor_parallel": false, + "enable_compiled_autograd": false, + "pipeline_parallel_degree": 1, + "pipeline_parallel_microbatches": null, + "pipeline_parallel_schedule": "1F1B", + "pipeline_parallel_schedule_csv": "", + "pipeline_parallel_split_points": [] + }, + "fault_tolerance": { + "enable": false, + "group_size": 0, + "min_replica_size": 1, + "replica_id": 0 + }, + "float8": { + "enable_fsdp_float8_all_gather": false, + "force_recompute_fp8_weight_in_bwd": false, + 
"precompute_float8_dynamic_scale_for_fsdp": false, + "recipe_name": null + }, + "job": { + "config_file": "flame/models/fla.toml", + "description": "default job", + "dump_folder": "exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine", + "print_args": true, + "use_for_integration_test": false + }, + "lr_scheduler": { + "decay_ratio": null, + "decay_type": "cosine", + "lr_min": 0.1, + "warmup_steps": 400 + }, + "memory_estimation": { + "disable_fake_mode": false, + "enabled": false + }, + "metrics": { + "disable_color_printing": false, + "enable_tensorboard": false, + "enable_wandb": true, + "log_freq": 5, + "save_for_all_ranks": false, + "save_tb_folder": "tb" + }, + "model": { + "config": "configs/top_transformer_7B.json", + "converters": [], + "name": "fla", + "print_after_conversion": false, + "tokenizer_path": "fla-hub/transformer-1.3B-100B" + }, + "optimizer": { + "early_step_in_backward": false, + "eps": 1e-15, + "implementation": "fused", + "lr": 2e-05, + "name": "AdamW" + }, + "profiling": { + "enable_memory_snapshot": false, + "enable_profiling": true, + "profile_freq": 512, + "save_memory_snapshot_folder": "memory_snapshot", + "save_traces_folder": "profile_trace" + }, + "training": { + "batch_size": 8, + "compile": true, + "context_len": 4096, + "data_dir": null, + "data_files": null, + "data_parallel_replicate_degree": 1, + "data_parallel_shard_degree": -1, + "data_probs": null, + "dataset": "/home/cvm/.cache/zaydzuhri___stack-edu-python/default", + "dataset_name": "default", + "dataset_split": "train", + "deterministic": false, + "disable_loss_parallel": false, + "enable_cpu_offload": false, + "fsdp_reshard_after_forward": "default", + "gc_freq": 50, + "gradient_accumulation_steps": 2, + "max_norm": 1.0, + "mixed_precision_param": "bfloat16", + "mixed_precision_reduce": "float32", + "num_workers": 32, + "persistent_workers": false, + "pin_memory": false, + "prefetch_factor": 2, + "seed": 79, + "seq_len": 4096, + 
"skip_nan_inf": true, + "steps": 40000, + "streaming": false, + "tensor_parallel_degree": 1, + "varlen": false + } +} +[titan] 2025-09-10 06:23:08,131 - root - INFO - [GC] Initial GC collection. 0.00 seconds. +[titan] 2025-09-10 06:23:28,571 - root - INFO - Target Hugging Face repository for this run: zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250910-062328 +[titan] 2025-09-10 06:23:28,572 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[titan] 2025-09-10 06:23:28,574 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory +[titan] 2025-09-10 06:23:28,644 - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[titan] 2025-09-10 06:23:28,644 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8] +[titan] 2025-09-10 06:23:36,548 - root - INFO - Loading tokenizer... +[titan] 2025-09-10 06:23:36,720 - root - INFO - LlamaTokenizerFast(name_or_path='fla-hub/transformer-1.3B-100B', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': ''}, clean_up_tokenization_spaces=False, added_tokens_decoder={ + 0: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 1: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 2: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} +) +[titan] 2025-09-10 06:23:36,720 - root - INFO - Loading dataset /home/cvm/.cache/zaydzuhri___stack-edu-python/default:default +[titan] 2025-09-10 06:23:38,496 - root - INFO - Dataset({ + features: ['blob_id', 'language', 'repo_name', 'path', 'src_encoding', 'length_bytes', 'score', 'int_score', 'detected_licenses', 'license_type', 'text', 'download_success'], + num_rows: 25286012 +}) +[titan] 2025-09-10 06:23:38,496 - root - INFO - Shuffling the dataset with seed 79 +[titan] 
2025-09-10 06:23:59,115 - root - INFO - Loading model config from configs/top_transformer_7B.json +[titan] 2025-09-10 06:23:59,118 - root - INFO - Building dataloader... +[titan] 2025-09-10 06:23:59,120 - root - INFO - Building model from the config +TOPTransformerConfig { + "attention_bias": false, + "bos_token_id": 1, + "elementwise_affine": true, + "eos_token_id": 2, + "fuse_cross_entropy": true, + "fuse_norm": true, + "fuse_swiglu": true, + "hidden_act": "swish", + "hidden_ratio": 4, + "hidden_size": 4096, + "initializer_range": 0.006, + "intermediate_size": 14336, + "max_position_embeddings": 2048, + "model_type": "top_transformer", + "norm_eps": 1e-06, + "num_heads": 32, + "num_hidden_layers": 30, + "num_kv_heads": 8, + "qk_norm": false, + "qkv_bias": false, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "top_window_size": 4096, + "transformers_version": "4.51.3", + "use_cache": true, + "use_top_loss": true, + "vocab_size": 32000, + "window_size": null +} + +[titan] 2025-09-10 06:23:59,247 - root - INFO -  +TOPTransformerForCausalLM( + (model): TOPTransformerModel( + (embeddings): Embedding(32000, 4096) + (layers): ModuleList( + (0-29): 30 x TOPTransformerBlock( + (attn_norm): RMSNorm(4096, eps=1e-06) + (attn): Attention( + (q_proj): Linear(in_features=4096, out_features=4096, bias=False) + (k_proj): Linear(in_features=4096, out_features=1024, bias=False) + (v_proj): Linear(in_features=4096, out_features=1024, bias=False) + (o_proj): Linear(in_features=4096, out_features=4096, bias=False) + (rotary): RotaryEmbedding(dim=128, base=10000.0, interleaved=False, pos_idx_in_fp32=True) + ) + (mlp_norm): RMSNorm(4096, eps=1e-06) + (mlp): GatedMLP( + (gate_proj): Linear(in_features=4096, out_features=14336, bias=False) + (up_proj): Linear(in_features=4096, out_features=14336, bias=False) + (down_proj): Linear(in_features=14336, out_features=4096, bias=False) + (swiglu_linear): SwiGLULinear() + ) + ) + ) + (norm): RMSNorm(4096, eps=1e-06) + ) + (lm_head): 
Linear(in_features=4096, out_features=32000, bias=False) + (top_head): Linear(in_features=4096, out_features=32000, bias=False) + (top_criterion): FusedLinearListNetLoss() + (criterion): FusedLinearCrossEntropyLoss() +) + +[titan] 2025-09-10 06:23:59,276 - root - INFO - Compiling each block with torch.compile +[titan] 2025-09-10 06:23:59,276 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile +[titan] 2025-09-10 06:23:59,277 - root - INFO - Compiling the entire model with torch.compile +[titan] 2025-09-10 06:23:59,352 - root - INFO - Applied FSDP to the model +[titan] 2025-09-10 06:23:59,555 - root - INFO - CUDA memory usage for model: 3.24GiB(2.33%) +[titan] 2025-09-10 06:23:59,575 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint +[titan] 2025-09-10 06:23:59,576 - root - INFO - Loading the checkpoint at step 30000. +[titan] 2025-09-10 06:24:32,913 - root - INFO - [GC] GC collection for checkpoint loading. 1.09 seconds. +[titan] 2025-09-10 06:24:32,913 - root - INFO - Finished loading the checkpoint in 33.34 seconds. +[titan] 2025-09-10 06:24:33,033 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory +[titan] 2025-09-10 06:24:38,686 - root - INFO - ***** Running training ***** +[titan] 2025-09-10 06:24:38,686 - root - INFO -  Training starts at step 30001 +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Number of tokens per sequence = 4,096 +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Gradient Accumulation steps = 2 +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Instantaneous batch size (per device) = 8 +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Global batch size (w. 
parallel, distributed & accumulation) = 128 (524,288 tokens) +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Total optimization steps = 40,000 (20,971,520,000 tokens) +[titan] 2025-09-10 06:24:38,690 - root - INFO -  Warmup steps = 400 (209,715,200 tokens) +[titan] 2025-09-10 06:24:38,691 - root - INFO -  Number of parameters = 6,936,580,096  +[titan] 2025-09-10 06:24:38,691 - root - INFO - Profiling active. Traces will be saved at exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/profile_trace +[titan] 2025-09-10 06:25:33,239 - root - INFO - step: 30005 loss: 2.6666 memory: 122.03GiB(87.57%) tps: 5,445 tflops: 259.53 mfu: 26.24% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9249 +[titan] 2025-09-10 06:25:33,239 - root - INFO - lr: 4.6841e-06 gnorm: 0.39 [2 days, 6:49:05<18:15:37] +[titan] 2025-09-10 06:26:03,165 - root - INFO - step: 30010 loss: 2.8472 memory: 122.03GiB(87.57%) tps: 10,950 tflops: 521.86 mfu: 52.77% global_avg_ntp_loss: 0.8447 global_avg_top_loss: 2.0025 +[titan] 2025-09-10 06:26:03,165 - root - INFO - lr: 4.6815e-06 gnorm: 0.39 [2 days, 6:49:35<18:15:03] +[titan] 2025-09-10 06:26:33,132 - root - INFO - step: 30015 loss: 2.6568 memory: 122.03GiB(87.57%) tps: 10,935 tflops: 521.14 mfu: 52.69% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9218 +[titan] 2025-09-10 06:26:33,133 - root - INFO - lr: 4.6790e-06 gnorm: 0.40 [2 days, 6:50:04<18:14:30] +[titan] 2025-09-10 06:27:03,206 - root - INFO - step: 30020 loss: 2.6787 memory: 122.03GiB(87.57%) tps: 10,896 tflops: 519.30 mfu: 52.51% global_avg_ntp_loss: 0.7466 global_avg_top_loss: 1.9320 +[titan] 2025-09-10 06:27:03,206 - root - INFO - lr: 4.6764e-06 gnorm: 0.40 [2 days, 6:50:35<18:13:56] +[titan] 2025-09-10 06:27:33,420 - root - INFO - step: 30025 loss: 2.7179 memory: 122.03GiB(87.57%) tps: 10,846 tflops: 516.89 mfu: 52.26% global_avg_ntp_loss: 0.7615 global_avg_top_loss: 1.9564 +[titan] 2025-09-10 06:27:33,420 - root - INFO - lr: 4.6739e-06 gnorm: 
0.52 [2 days, 6:51:05<18:13:22] +[titan] 2025-09-10 06:28:03,765 - root - INFO - step: 30030 loss: 2.7003 memory: 122.03GiB(87.57%) tps: 10,799 tflops: 514.65 mfu: 52.04% global_avg_ntp_loss: 0.7546 global_avg_top_loss: 1.9457 +[titan] 2025-09-10 06:28:03,765 - root - INFO - lr: 4.6714e-06 gnorm: 0.40 [2 days, 6:51:35<18:12:48] +[titan] 2025-09-10 06:28:34,277 - root - INFO - step: 30035 loss: 2.7101 memory: 122.03GiB(87.57%) tps: 10,739 tflops: 511.83 mfu: 51.75% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9522 +[titan] 2025-09-10 06:28:34,277 - root - INFO - lr: 4.6688e-06 gnorm: 0.42 [2 days, 6:52:06<18:12:15] +[titan] 2025-09-10 06:29:05,194 - root - INFO - step: 30040 loss: 3.1154 memory: 122.03GiB(87.57%) tps: 10,599 tflops: 505.13 mfu: 51.07% global_avg_ntp_loss: 0.9926 global_avg_top_loss: 2.1228 +[titan] 2025-09-10 06:29:05,195 - root - INFO - lr: 4.6663e-06 gnorm: 0.39 [2 days, 6:52:37<18:11:41] +[titan] 2025-09-10 06:29:36,544 - root - INFO - step: 30045 loss: 3.1783 memory: 122.03GiB(87.57%) tps: 10,453 tflops: 498.16 mfu: 50.37% global_avg_ntp_loss: 1.0308 global_avg_top_loss: 2.1475 +[titan] 2025-09-10 06:29:36,544 - root - INFO - lr: 4.6637e-06 gnorm: 0.38 [2 days, 6:53:08<18:11:08] +[titan] 2025-09-10 06:30:01,635 - root - INFO - [GC] Peforming periodical GC collection. 0.12 seconds. 
+[titan] 2025-09-10 06:30:07,830 - root - INFO - step: 30050 loss: 2.8363 memory: 122.03GiB(87.57%) tps: 10,474 tflops: 499.17 mfu: 50.47% global_avg_ntp_loss: 0.8317 global_avg_top_loss: 2.0046 +[titan] 2025-09-10 06:30:07,831 - root - INFO - lr: 4.6612e-06 gnorm: 0.39 [2 days, 6:53:39<18:10:34] +[titan] 2025-09-10 06:30:39,327 - root - INFO - step: 30055 loss: 2.9715 memory: 122.03GiB(87.57%) tps: 10,404 tflops: 495.85 mfu: 50.14% global_avg_ntp_loss: 0.9349 global_avg_top_loss: 2.0366 +[titan] 2025-09-10 06:30:39,327 - root - INFO - lr: 4.6587e-06 gnorm: 0.42 [2 days, 6:54:11<18:10:01] +[titan] 2025-09-10 06:31:10,695 - root - INFO - step: 30060 loss: 2.6743 memory: 122.03GiB(87.57%) tps: 10,446 tflops: 497.86 mfu: 50.34% global_avg_ntp_loss: 0.7452 global_avg_top_loss: 1.9291 +[titan] 2025-09-10 06:31:10,696 - root - INFO - lr: 4.6561e-06 gnorm: 0.39 [2 days, 6:54:42<18:09:28] +[titan] 2025-09-10 06:31:42,321 - root - INFO - step: 30065 loss: 2.6293 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.82 mfu: 49.93% global_avg_ntp_loss: 0.7266 global_avg_top_loss: 1.9027 +[titan] 2025-09-10 06:31:42,321 - root - INFO - lr: 4.6536e-06 gnorm: 0.41 [2 days, 6:55:14<18:08:54] +[titan] 2025-09-10 06:32:14,044 - root - INFO - step: 30070 loss: 3.0732 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.30 mfu: 49.78% global_avg_ntp_loss: 0.9706 global_avg_top_loss: 2.1026 +[titan] 2025-09-10 06:32:14,044 - root - INFO - lr: 4.6511e-06 gnorm: 0.38 [2 days, 6:55:45<18:08:21] +[titan] 2025-09-10 06:32:45,692 - root - INFO - step: 30075 loss: 2.7283 memory: 122.03GiB(87.57%) tps: 10,354 tflops: 493.46 mfu: 49.89% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9571 +[titan] 2025-09-10 06:32:45,692 - root - INFO - lr: 4.6485e-06 gnorm: 0.39 [2 days, 6:56:17<18:07:48] +[titan] 2025-09-10 06:33:17,256 - root - INFO - step: 30080 loss: 2.8993 memory: 122.03GiB(87.57%) tps: 10,382 tflops: 494.79 mfu: 50.03% global_avg_ntp_loss: 0.8759 global_avg_top_loss: 2.0235 +[titan] 
2025-09-10 06:33:17,256 - root - INFO - lr: 4.6460e-06 gnorm: 0.39 [2 days, 6:56:49<18:07:14] +[titan] 2025-09-10 06:33:48,951 - root - INFO - step: 30085 loss: 2.6503 memory: 122.03GiB(87.57%) tps: 10,339 tflops: 492.73 mfu: 49.82% global_avg_ntp_loss: 0.7383 global_avg_top_loss: 1.9120 +[titan] 2025-09-10 06:33:48,951 - root - INFO - lr: 4.6435e-06 gnorm: 0.40 [2 days, 6:57:20<18:06:41] +[titan] 2025-09-10 06:34:21,012 - root - INFO - step: 30090 loss: 2.6677 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7417 global_avg_top_loss: 1.9261 +[titan] 2025-09-10 06:34:21,012 - root - INFO - lr: 4.6410e-06 gnorm: 0.42 [2 days, 6:57:52<18:06:08] +[titan] 2025-09-10 06:34:53,004 - root - INFO - step: 30095 loss: 2.6542 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9187 +[titan] 2025-09-10 06:34:53,004 - root - INFO - lr: 4.6384e-06 gnorm: 0.40 [2 days, 6:58:24<18:05:35] +[titan] 2025-09-10 06:35:18,405 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 06:35:24,763 - root - INFO - step: 30100 loss: 2.7121 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.74 mfu: 49.72% global_avg_ntp_loss: 0.7628 global_avg_top_loss: 1.9493 +[titan] 2025-09-10 06:35:24,763 - root - INFO - lr: 4.6359e-06 gnorm: 0.40 [2 days, 6:58:56<18:05:02] +[titan] 2025-09-10 06:35:56,458 - root - INFO - step: 30105 loss: 2.6589 memory: 122.03GiB(87.57%) tps: 10,339 tflops: 492.74 mfu: 49.82% global_avg_ntp_loss: 0.7383 global_avg_top_loss: 1.9207 +[titan] 2025-09-10 06:35:56,458 - root - INFO - lr: 4.6334e-06 gnorm: 0.50 [2 days, 6:59:28<18:04:28] +[titan] 2025-09-10 06:36:28,386 - root - INFO - step: 30110 loss: 2.6447 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7300 global_avg_top_loss: 1.9147 +[titan] 2025-09-10 06:36:28,387 - root - INFO - lr: 4.6309e-06 gnorm: 0.40 [2 days, 7:00:00<18:03:55] +[titan] 2025-09-10 06:37:00,323 - root - INFO - step: 30115 loss: 2.6461 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7324 global_avg_top_loss: 1.9137 +[titan] 2025-09-10 06:37:00,323 - root - INFO - lr: 4.6283e-06 gnorm: 0.41 [2 days, 7:00:32<18:03:22] +[titan] 2025-09-10 06:37:32,129 - root - INFO - step: 30120 loss: 2.7429 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7903 global_avg_top_loss: 1.9526 +[titan] 2025-09-10 06:37:32,130 - root - INFO - lr: 4.6258e-06 gnorm: 0.39 [2 days, 7:01:03<18:02:49] +[titan] 2025-09-10 06:38:04,410 - root - INFO - step: 30125 loss: 3.5323 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.80 mfu: 48.92% global_avg_ntp_loss: 1.2317 global_avg_top_loss: 2.3005 +[titan] 2025-09-10 06:38:04,410 - root - INFO - lr: 4.6233e-06 gnorm: 0.39 [2 days, 7:01:36<18:02:16] +[titan] 2025-09-10 06:38:36,416 - root - INFO - step: 30130 loss: 2.6354 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.94 mfu: 49.34% global_avg_ntp_loss: 0.7288 global_avg_top_loss: 1.9066 +[titan] 
2025-09-10 06:38:36,417 - root - INFO - lr: 4.6208e-06 gnorm: 0.39 [2 days, 7:02:08<18:01:42] +[titan] 2025-09-10 06:39:08,483 - root - INFO - step: 30135 loss: 3.5982 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 1.2671 global_avg_top_loss: 2.3311 +[titan] 2025-09-10 06:39:08,483 - root - INFO - lr: 4.6183e-06 gnorm: 0.40 [2 days, 7:02:40<18:01:09] +[titan] 2025-09-10 06:39:40,442 - root - INFO - step: 30140 loss: 2.6324 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7230 global_avg_top_loss: 1.9094 +[titan] 2025-09-10 06:39:40,442 - root - INFO - lr: 4.6158e-06 gnorm: 0.39 [2 days, 7:03:12<18:00:36] +[titan] 2025-09-10 06:40:12,474 - root - INFO - step: 30145 loss: 2.6919 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.7533 global_avg_top_loss: 1.9386 +[titan] 2025-09-10 06:40:12,474 - root - INFO - lr: 4.6132e-06 gnorm: 0.38 [2 days, 7:03:44<18:00:03] +[titan] 2025-09-10 06:40:38,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 06:40:44,423 - root - INFO - step: 30150 loss: 3.1151 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.9989 global_avg_top_loss: 2.1161 +[titan] 2025-09-10 06:40:44,423 - root - INFO - lr: 4.6107e-06 gnorm: 0.40 [2 days, 7:04:16<17:59:30] +[titan] 2025-09-10 06:41:16,520 - root - INFO - step: 30155 loss: 2.6841 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7487 global_avg_top_loss: 1.9354 +[titan] 2025-09-10 06:41:16,520 - root - INFO - lr: 4.6082e-06 gnorm: 0.41 [2 days, 7:04:48<17:58:57] +[titan] 2025-09-10 06:41:48,599 - root - INFO - step: 30160 loss: 2.6828 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9354 +[titan] 2025-09-10 06:41:48,599 - root - INFO - lr: 4.6057e-06 gnorm: 0.40 [2 days, 7:05:20<17:58:24] +[titan] 2025-09-10 06:42:20,442 - root - INFO - step: 30165 loss: 2.6292 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.7270 global_avg_top_loss: 1.9023 +[titan] 2025-09-10 06:42:20,442 - root - INFO - lr: 4.6032e-06 gnorm: 0.45 [2 days, 7:05:52<17:57:50] +[titan] 2025-09-10 06:42:52,688 - root - INFO - step: 30170 loss: 2.6798 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.32 mfu: 48.97% global_avg_ntp_loss: 0.7464 global_avg_top_loss: 1.9334 +[titan] 2025-09-10 06:42:52,688 - root - INFO - lr: 4.6007e-06 gnorm: 0.41 [2 days, 7:06:24<17:57:17] +[titan] 2025-09-10 06:43:25,006 - root - INFO - step: 30175 loss: 2.7166 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.24 mfu: 48.86% global_avg_ntp_loss: 0.7640 global_avg_top_loss: 1.9525 +[titan] 2025-09-10 06:43:25,006 - root - INFO - lr: 4.5982e-06 gnorm: 0.40 [2 days, 7:06:56<17:56:44] +[titan] 2025-09-10 06:43:56,799 - root - INFO - step: 30180 loss: 2.6327 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.20 mfu: 49.67% global_avg_ntp_loss: 0.7275 global_avg_top_loss: 1.9052 +[titan] 
2025-09-10 06:43:56,800 - root - INFO - lr: 4.5957e-06 gnorm: 0.38 [2 days, 7:07:28<17:56:11] +[titan] 2025-09-10 06:44:28,960 - root - INFO - step: 30185 loss: 2.5632 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.60 mfu: 49.10% global_avg_ntp_loss: 0.6910 global_avg_top_loss: 1.8722 +[titan] 2025-09-10 06:44:28,960 - root - INFO - lr: 4.5931e-06 gnorm: 0.43 [2 days, 7:08:00<17:55:38] +[titan] 2025-09-10 06:45:00,919 - root - INFO - step: 30190 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7709 global_avg_top_loss: 1.9580 +[titan] 2025-09-10 06:45:00,920 - root - INFO - lr: 4.5906e-06 gnorm: 0.40 [2 days, 7:08:32<17:55:05] +[titan] 2025-09-10 06:45:32,966 - root - INFO - step: 30195 loss: 2.6531 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7348 global_avg_top_loss: 1.9183 +[titan] 2025-09-10 06:45:32,966 - root - INFO - lr: 4.5881e-06 gnorm: 0.40 [2 days, 7:09:04<17:54:31] +[titan] 2025-09-10 06:45:58,838 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 06:46:05,255 - root - INFO - step: 30200 loss: 2.8259 memory: 122.03GiB(87.57%) tps: 10,148 tflops: 483.67 mfu: 48.90% global_avg_ntp_loss: 0.8269 global_avg_top_loss: 1.9990 +[titan] 2025-09-10 06:46:05,255 - root - INFO - lr: 4.5856e-06 gnorm: 0.39 [2 days, 7:09:37<17:53:58] +[titan] 2025-09-10 06:46:37,824 - root - INFO - step: 30205 loss: 2.7635 memory: 122.03GiB(87.57%) tps: 10,061 tflops: 479.51 mfu: 48.48% global_avg_ntp_loss: 0.7865 global_avg_top_loss: 1.9769 +[titan] 2025-09-10 06:46:37,824 - root - INFO - lr: 4.5831e-06 gnorm: 0.39 [2 days, 7:10:09<17:53:25] +[titan] 2025-09-10 06:46:57,252 - root - INFO - Dumping profiler traces at step 30208 +[titan] 2025-09-10 06:46:57,315 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-10 06:47:10,052 - root - INFO - step: 30210 loss: 2.6300 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.59 mfu: 49.00% global_avg_ntp_loss: 0.7225 global_avg_top_loss: 1.9075 +[titan] 2025-09-10 06:47:10,052 - root - INFO - lr: 4.5806e-06 gnorm: 0.39 [2 days, 7:10:41<17:52:52] +[titan] 2025-09-10 06:47:41,832 - root - INFO - step: 30215 loss: 3.5210 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.41 mfu: 49.69% global_avg_ntp_loss: 1.2346 global_avg_top_loss: 2.2865 +[titan] 2025-09-10 06:47:41,833 - root - INFO - lr: 4.5781e-06 gnorm: 0.39 [2 days, 7:11:13<17:52:19] +[titan] 2025-09-10 06:48:13,623 - root - INFO - step: 30220 loss: 2.6326 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.7272 global_avg_top_loss: 1.9053 +[titan] 2025-09-10 06:48:13,623 - root - INFO - lr: 4.5756e-06 gnorm: 0.39 [2 days, 7:11:45<17:51:46] +[titan] 2025-09-10 06:48:45,704 - root - INFO - step: 30225 loss: 2.6769 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.81 mfu: 49.22% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9301 +[titan] 2025-09-10 06:48:45,704 - root - INFO - lr: 4.5731e-06 gnorm: 0.41 [2 days, 7:12:17<17:51:13] +[titan] 2025-09-10 
06:49:17,760 - root - INFO - step: 30230 loss: 3.0909 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.9841 global_avg_top_loss: 2.1068 +[titan] 2025-09-10 06:49:17,760 - root - INFO - lr: 4.5706e-06 gnorm: 0.40 [2 days, 7:12:49<17:50:40] +[titan] 2025-09-10 06:49:49,666 - root - INFO - step: 30235 loss: 2.5889 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7078 global_avg_top_loss: 1.8812 +[titan] 2025-09-10 06:49:49,666 - root - INFO - lr: 4.5681e-06 gnorm: 0.39 [2 days, 7:13:21<17:50:06] +[titan] 2025-09-10 06:50:21,615 - root - INFO - step: 30240 loss: 2.6379 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7287 global_avg_top_loss: 1.9093 +[titan] 2025-09-10 06:50:21,616 - root - INFO - lr: 4.5656e-06 gnorm: 0.40 [2 days, 7:13:53<17:49:33] +[titan] 2025-09-10 06:50:53,619 - root - INFO - step: 30245 loss: 2.7021 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7568 global_avg_top_loss: 1.9453 +[titan] 2025-09-10 06:50:53,619 - root - INFO - lr: 4.5631e-06 gnorm: 0.40 [2 days, 7:14:25<17:49:00] +[titan] 2025-09-10 06:51:19,208 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 06:51:25,648 - root - INFO - step: 30250 loss: 2.7076 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.7618 global_avg_top_loss: 1.9458 +[titan] 2025-09-10 06:51:25,649 - root - INFO - lr: 4.5606e-06 gnorm: 0.41 [2 days, 7:14:57<17:48:27] +[titan] 2025-09-10 06:51:57,457 - root - INFO - step: 30255 loss: 2.9892 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 0.9201 global_avg_top_loss: 2.0691 +[titan] 2025-09-10 06:51:57,457 - root - INFO - lr: 4.5581e-06 gnorm: 0.40 [2 days, 7:15:29<17:47:54] +[titan] 2025-09-10 06:52:29,530 - root - INFO - step: 30260 loss: 2.6457 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9140 +[titan] 2025-09-10 06:52:29,530 - root - INFO - lr: 4.5557e-06 gnorm: 0.41 [2 days, 7:16:01<17:47:21] +[titan] 2025-09-10 06:53:01,618 - root - INFO - step: 30265 loss: 2.5979 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.7098 global_avg_top_loss: 1.8881 +[titan] 2025-09-10 06:53:01,618 - root - INFO - lr: 4.5532e-06 gnorm: 0.40 [2 days, 7:16:33<17:46:47] +[titan] 2025-09-10 06:53:33,568 - root - INFO - step: 30270 loss: 2.7667 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7824 global_avg_top_loss: 1.9843 +[titan] 2025-09-10 06:53:33,568 - root - INFO - lr: 4.5507e-06 gnorm: 0.44 [2 days, 7:17:05<17:46:14] +[titan] 2025-09-10 06:54:05,570 - root - INFO - step: 30275 loss: 2.7202 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.7645 global_avg_top_loss: 1.9558 +[titan] 2025-09-10 06:54:05,571 - root - INFO - lr: 4.5482e-06 gnorm: 0.41 [2 days, 7:17:37<17:45:41] +[titan] 2025-09-10 06:54:37,785 - root - INFO - step: 30280 loss: 2.6577 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.78 mfu: 49.02% global_avg_ntp_loss: 0.7379 global_avg_top_loss: 1.9198 +[titan] 
2025-09-10 06:54:37,786 - root - INFO - lr: 4.5457e-06 gnorm: 0.39 [2 days, 7:18:09<17:45:08] +[titan] 2025-09-10 06:55:09,476 - root - INFO - step: 30285 loss: 2.7255 memory: 122.03GiB(87.57%) tps: 10,340 tflops: 492.80 mfu: 49.83% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9575 +[titan] 2025-09-10 06:55:09,477 - root - INFO - lr: 4.5432e-06 gnorm: 0.40 [2 days, 7:18:41<17:44:35] +[titan] 2025-09-10 06:55:41,576 - root - INFO - step: 30290 loss: 2.6519 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.7328 global_avg_top_loss: 1.9191 +[titan] 2025-09-10 06:55:41,576 - root - INFO - lr: 4.5407e-06 gnorm: 0.39 [2 days, 7:19:13<17:44:02] +[titan] 2025-09-10 06:56:13,694 - root - INFO - step: 30295 loss: 3.1760 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.24 mfu: 49.16% global_avg_ntp_loss: 1.0245 global_avg_top_loss: 2.1515 +[titan] 2025-09-10 06:56:13,695 - root - INFO - lr: 4.5382e-06 gnorm: 0.40 [2 days, 7:19:45<17:43:29] +[titan] 2025-09-10 06:56:39,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 06:56:46,070 - root - INFO - step: 30300 loss: 2.7107 memory: 122.03GiB(87.57%) tps: 10,121 tflops: 482.38 mfu: 48.77% global_avg_ntp_loss: 0.7598 global_avg_top_loss: 1.9509 +[titan] 2025-09-10 06:56:46,070 - root - INFO - lr: 4.5357e-06 gnorm: 0.42 [2 days, 7:20:17<17:42:56] +[titan] 2025-09-10 06:57:17,955 - root - INFO - step: 30305 loss: 2.7111 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.53% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9412 +[titan] 2025-09-10 06:57:17,955 - root - INFO - lr: 4.5333e-06 gnorm: 0.41 [2 days, 7:20:49<17:42:22] +[titan] 2025-09-10 06:57:49,784 - root - INFO - step: 30310 loss: 3.6190 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 1.2739 global_avg_top_loss: 2.3451 +[titan] 2025-09-10 06:57:49,785 - root - INFO - lr: 4.5308e-06 gnorm: 0.42 [2 days, 7:21:21<17:41:49] +[titan] 2025-09-10 06:58:21,990 - root - INFO - step: 30315 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.92 mfu: 49.03% global_avg_ntp_loss: 0.7904 global_avg_top_loss: 1.9693 +[titan] 2025-09-10 06:58:21,990 - root - INFO - lr: 4.5283e-06 gnorm: 0.40 [2 days, 7:21:53<17:41:16] +[titan] 2025-09-10 06:58:53,943 - root - INFO - step: 30320 loss: 2.6753 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.7461 global_avg_top_loss: 1.9291 +[titan] 2025-09-10 06:58:53,944 - root - INFO - lr: 4.5258e-06 gnorm: 0.43 [2 days, 7:22:25<17:40:43] +[titan] 2025-09-10 06:59:26,092 - root - INFO - step: 30325 loss: 2.5605 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.78 mfu: 49.12% global_avg_ntp_loss: 0.6957 global_avg_top_loss: 1.8648 +[titan] 2025-09-10 06:59:26,092 - root - INFO - lr: 4.5233e-06 gnorm: 0.39 [2 days, 7:22:57<17:40:10] +[titan] 2025-09-10 06:59:58,252 - root - INFO - step: 30330 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.60 mfu: 49.10% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9554 +[titan] 
2025-09-10 06:59:58,253 - root - INFO - lr: 4.5209e-06 gnorm: 0.42 [2 days, 7:23:30<17:39:37] +[titan] 2025-09-10 07:00:30,330 - root - INFO - step: 30335 loss: 2.6979 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.7584 global_avg_top_loss: 1.9395 +[titan] 2025-09-10 07:00:30,330 - root - INFO - lr: 4.5184e-06 gnorm: 0.39 [2 days, 7:24:02<17:39:04] +[titan] 2025-09-10 07:01:02,542 - root - INFO - step: 30340 loss: 2.7269 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.82 mfu: 49.02% global_avg_ntp_loss: 0.7709 global_avg_top_loss: 1.9560 +[titan] 2025-09-10 07:01:02,542 - root - INFO - lr: 4.5159e-06 gnorm: 0.42 [2 days, 7:24:34<17:38:30] +[titan] 2025-09-10 07:01:34,478 - root - INFO - step: 30345 loss: 2.5374 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.6834 global_avg_top_loss: 1.8541 +[titan] 2025-09-10 07:01:34,479 - root - INFO - lr: 4.5134e-06 gnorm: 0.44 [2 days, 7:25:06<17:37:57] +[titan] 2025-09-10 07:01:59,995 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:02:06,369 - root - INFO - step: 30350 loss: 2.6601 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.7326 global_avg_top_loss: 1.9275 +[titan] 2025-09-10 07:02:06,370 - root - INFO - lr: 4.5110e-06 gnorm: 0.45 [2 days, 7:25:38<17:37:24] +[titan] 2025-09-10 07:02:38,391 - root - INFO - step: 30355 loss: 2.6395 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.7275 global_avg_top_loss: 1.9120 +[titan] 2025-09-10 07:02:38,392 - root - INFO - lr: 4.5085e-06 gnorm: 0.42 [2 days, 7:26:10<17:36:51] +[titan] 2025-09-10 07:03:10,346 - root - INFO - step: 30360 loss: 2.5856 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7046 global_avg_top_loss: 1.8810 +[titan] 2025-09-10 07:03:10,347 - root - INFO - lr: 4.5060e-06 gnorm: 0.39 [2 days, 7:26:42<17:36:18] +[titan] 2025-09-10 07:03:42,319 - root - INFO - step: 30365 loss: 2.6548 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7406 global_avg_top_loss: 1.9143 +[titan] 2025-09-10 07:03:42,320 - root - INFO - lr: 4.5035e-06 gnorm: 0.39 [2 days, 7:27:14<17:35:45] +[titan] 2025-09-10 07:04:14,313 - root - INFO - step: 30370 loss: 2.6628 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7368 global_avg_top_loss: 1.9260 +[titan] 2025-09-10 07:04:14,313 - root - INFO - lr: 4.5011e-06 gnorm: 0.41 [2 days, 7:27:46<17:35:11] +[titan] 2025-09-10 07:04:46,534 - root - INFO - step: 30375 loss: 3.1675 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 1.0213 global_avg_top_loss: 2.1462 +[titan] 2025-09-10 07:04:46,535 - root - INFO - lr: 4.4986e-06 gnorm: 0.39 [2 days, 7:28:18<17:34:38] +[titan] 2025-09-10 07:05:18,538 - root - INFO - step: 30380 loss: 2.7450 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7778 global_avg_top_loss: 1.9672 +[titan] 
2025-09-10 07:05:18,538 - root - INFO - lr: 4.4961e-06 gnorm: 0.41 [2 days, 7:28:50<17:34:05] +[titan] 2025-09-10 07:05:50,598 - root - INFO - step: 30385 loss: 2.6289 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7251 global_avg_top_loss: 1.9038 +[titan] 2025-09-10 07:05:50,599 - root - INFO - lr: 4.4937e-06 gnorm: 0.41 [2 days, 7:29:22<17:33:32] +[titan] 2025-09-10 07:06:22,545 - root - INFO - step: 30390 loss: 3.5852 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 1.2605 global_avg_top_loss: 2.3247 +[titan] 2025-09-10 07:06:22,545 - root - INFO - lr: 4.4912e-06 gnorm: 0.41 [2 days, 7:29:54<17:32:59] +[titan] 2025-09-10 07:06:54,531 - root - INFO - step: 30395 loss: 2.7101 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9508 +[titan] 2025-09-10 07:06:54,531 - root - INFO - lr: 4.4887e-06 gnorm: 0.40 [2 days, 7:30:26<17:32:26] +[titan] 2025-09-10 07:07:20,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:07:26,819 - root - INFO - step: 30400 loss: 2.6694 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.68 mfu: 48.91% global_avg_ntp_loss: 0.7439 global_avg_top_loss: 1.9255 +[titan] 2025-09-10 07:07:26,819 - root - INFO - lr: 4.4863e-06 gnorm: 0.41 [2 days, 7:30:58<17:31:53] +[titan] 2025-09-10 07:07:58,840 - root - INFO - step: 30405 loss: 2.7025 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9438 +[titan] 2025-09-10 07:07:58,840 - root - INFO - lr: 4.4838e-06 gnorm: 0.42 [2 days, 7:31:30<17:31:20] +[titan] 2025-09-10 07:08:30,867 - root - INFO - step: 30410 loss: 2.7442 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.7769 global_avg_top_loss: 1.9673 +[titan] 2025-09-10 07:08:30,868 - root - INFO - lr: 4.4813e-06 gnorm: 0.44 [2 days, 7:32:02<17:30:46] +[titan] 2025-09-10 07:09:02,925 - root - INFO - step: 30415 loss: 2.6337 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.16 mfu: 49.26% global_avg_ntp_loss: 0.7266 global_avg_top_loss: 1.9071 +[titan] 2025-09-10 07:09:02,925 - root - INFO - lr: 4.4789e-06 gnorm: 0.39 [2 days, 7:32:34<17:30:13] +[titan] 2025-09-10 07:09:34,850 - root - INFO - step: 30420 loss: 2.7088 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9430 +[titan] 2025-09-10 07:09:34,850 - root - INFO - lr: 4.4764e-06 gnorm: 0.40 [2 days, 7:33:06<17:29:40] +[titan] 2025-09-10 07:10:07,078 - root - INFO - step: 30425 loss: 2.5999 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.57 mfu: 49.00% global_avg_ntp_loss: 0.7080 global_avg_top_loss: 1.8919 +[titan] 2025-09-10 07:10:07,079 - root - INFO - lr: 4.4740e-06 gnorm: 0.43 [2 days, 7:33:38<17:29:07] +[titan] 2025-09-10 07:10:39,319 - root - INFO - step: 30430 loss: 2.6333 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.39 mfu: 48.98% global_avg_ntp_loss: 0.7220 global_avg_top_loss: 1.9114 +[titan] 
2025-09-10 07:10:39,319 - root - INFO - lr: 4.4715e-06 gnorm: 0.44 [2 days, 7:34:11<17:28:34] +[titan] 2025-09-10 07:11:11,384 - root - INFO - step: 30435 loss: 2.7468 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7766 global_avg_top_loss: 1.9702 +[titan] 2025-09-10 07:11:11,384 - root - INFO - lr: 4.4691e-06 gnorm: 0.42 [2 days, 7:34:43<17:28:01] +[titan] 2025-09-10 07:11:43,566 - root - INFO - step: 30440 loss: 2.5618 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.6931 global_avg_top_loss: 1.8687 +[titan] 2025-09-10 07:11:43,566 - root - INFO - lr: 4.4666e-06 gnorm: 0.40 [2 days, 7:35:15<17:27:28] +[titan] 2025-09-10 07:12:15,565 - root - INFO - step: 30445 loss: 2.8001 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.8127 global_avg_top_loss: 1.9874 +[titan] 2025-09-10 07:12:15,565 - root - INFO - lr: 4.4641e-06 gnorm: 0.41 [2 days, 7:35:47<17:26:55] +[titan] 2025-09-10 07:12:41,606 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:12:47,972 - root - INFO - step: 30450 loss: 2.6505 memory: 122.03GiB(87.57%) tps: 10,112 tflops: 481.92 mfu: 48.73% global_avg_ntp_loss: 0.7326 global_avg_top_loss: 1.9179 +[titan] 2025-09-10 07:12:47,972 - root - INFO - lr: 4.4617e-06 gnorm: 0.42 [2 days, 7:36:19<17:26:22] +[titan] 2025-09-10 07:13:19,993 - root - INFO - step: 30455 loss: 3.1180 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.9966 global_avg_top_loss: 2.1214 +[titan] 2025-09-10 07:13:19,993 - root - INFO - lr: 4.4592e-06 gnorm: 0.39 [2 days, 7:36:51<17:25:49] +[titan] 2025-09-10 07:13:52,102 - root - INFO - step: 30460 loss: 2.6747 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.7435 global_avg_top_loss: 1.9312 +[titan] 2025-09-10 07:13:52,103 - root - INFO - lr: 4.4568e-06 gnorm: 0.40 [2 days, 7:37:23<17:25:15] +[titan] 2025-09-10 07:14:24,146 - root - INFO - step: 30465 loss: 2.6823 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9319 +[titan] 2025-09-10 07:14:24,146 - root - INFO - lr: 4.4543e-06 gnorm: 0.41 [2 days, 7:37:55<17:24:42] +[titan] 2025-09-10 07:14:56,167 - root - INFO - step: 30470 loss: 3.2387 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.0558 global_avg_top_loss: 2.1829 +[titan] 2025-09-10 07:14:56,167 - root - INFO - lr: 4.4519e-06 gnorm: 0.44 [2 days, 7:38:27<17:24:09] +[titan] 2025-09-10 07:15:28,165 - root - INFO - step: 30475 loss: 2.6390 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7273 global_avg_top_loss: 1.9116 +[titan] 2025-09-10 07:15:28,165 - root - INFO - lr: 4.4494e-06 gnorm: 0.41 [2 days, 7:38:59<17:23:36] +[titan] 2025-09-10 07:16:00,282 - root - INFO - step: 30480 loss: 2.6737 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.7441 global_avg_top_loss: 1.9296 +[titan] 
2025-09-10 07:16:00,282 - root - INFO - lr: 4.4470e-06 gnorm: 0.41 [2 days, 7:39:32<17:23:03] +[titan] 2025-09-10 07:16:32,304 - root - INFO - step: 30485 loss: 2.7111 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.7607 global_avg_top_loss: 1.9505 +[titan] 2025-09-10 07:16:32,304 - root - INFO - lr: 4.4445e-06 gnorm: 0.47 [2 days, 7:40:04<17:22:30] +[titan] 2025-09-10 07:17:04,220 - root - INFO - step: 30490 loss: 2.6541 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.48% global_avg_ntp_loss: 0.7328 global_avg_top_loss: 1.9212 +[titan] 2025-09-10 07:17:04,221 - root - INFO - lr: 4.4421e-06 gnorm: 0.41 [2 days, 7:40:36<17:21:57] +[titan] 2025-09-10 07:17:36,330 - root - INFO - step: 30495 loss: 2.6601 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.7394 global_avg_top_loss: 1.9207 +[titan] 2025-09-10 07:17:36,330 - root - INFO - lr: 4.4397e-06 gnorm: 0.43 [2 days, 7:41:08<17:21:23] +[titan] 2025-09-10 07:18:01,831 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:18:08,300 - root - INFO - step: 30500 loss: 2.6800 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9332 +[titan] 2025-09-10 07:18:08,300 - root - INFO - lr: 4.4372e-06 gnorm: 0.47 [2 days, 7:41:40<17:20:50] +[titan] 2025-09-10 07:18:40,254 - root - INFO - step: 30505 loss: 2.6446 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7340 global_avg_top_loss: 1.9105 +[titan] 2025-09-10 07:18:40,254 - root - INFO - lr: 4.4348e-06 gnorm: 0.40 [2 days, 7:42:12<17:20:17] +[titan] 2025-09-10 07:19:12,423 - root - INFO - step: 30510 loss: 2.5863 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.46 mfu: 49.09% global_avg_ntp_loss: 0.7035 global_avg_top_loss: 1.8828 +[titan] 2025-09-10 07:19:12,424 - root - INFO - lr: 4.4323e-06 gnorm: 0.49 [2 days, 7:42:44<17:19:44] +[titan] 2025-09-10 07:19:44,438 - root - INFO - step: 30515 loss: 2.6806 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7451 global_avg_top_loss: 1.9355 +[titan] 2025-09-10 07:19:44,439 - root - INFO - lr: 4.4299e-06 gnorm: 0.46 [2 days, 7:43:16<17:19:11] +[titan] 2025-09-10 07:20:16,593 - root - INFO - step: 30520 loss: 2.7271 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.70 mfu: 49.11% global_avg_ntp_loss: 0.7659 global_avg_top_loss: 1.9612 +[titan] 2025-09-10 07:20:16,593 - root - INFO - lr: 4.4274e-06 gnorm: 0.43 [2 days, 7:43:48<17:18:38] +[titan] 2025-09-10 07:20:49,009 - root - INFO - step: 30525 loss: 2.7575 memory: 122.03GiB(87.57%) tps: 10,109 tflops: 481.77 mfu: 48.71% global_avg_ntp_loss: 0.7820 global_avg_top_loss: 1.9756 +[titan] 2025-09-10 07:20:49,009 - root - INFO - lr: 4.4250e-06 gnorm: 0.40 [2 days, 7:44:20<17:18:05] +[titan] 2025-09-10 07:21:21,255 - root - INFO - step: 30530 loss: 2.6685 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.32 mfu: 48.97% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9263 +[titan] 
2025-09-10 07:21:21,255 - root - INFO - lr: 4.4226e-06 gnorm: 0.41 [2 days, 7:44:53<17:17:32] +[titan] 2025-09-10 07:21:53,126 - root - INFO - step: 30535 loss: 2.5856 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7020 global_avg_top_loss: 1.8836 +[titan] 2025-09-10 07:21:53,126 - root - INFO - lr: 4.4201e-06 gnorm: 0.41 [2 days, 7:45:24<17:16:59] +[titan] 2025-09-10 07:22:25,179 - root - INFO - step: 30540 loss: 2.7014 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.27% global_avg_ntp_loss: 0.7590 global_avg_top_loss: 1.9424 +[titan] 2025-09-10 07:22:25,179 - root - INFO - lr: 4.4177e-06 gnorm: 0.42 [2 days, 7:45:56<17:16:26] +[titan] 2025-09-10 07:22:57,240 - root - INFO - step: 30545 loss: 2.7349 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9637 +[titan] 2025-09-10 07:22:57,240 - root - INFO - lr: 4.4153e-06 gnorm: 0.40 [2 days, 7:46:29<17:15:52] +[titan] 2025-09-10 07:23:23,141 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:23:29,737 - root - INFO - step: 30550 loss: 2.7104 memory: 122.03GiB(87.57%) tps: 10,083 tflops: 480.56 mfu: 48.59% global_avg_ntp_loss: 0.7645 global_avg_top_loss: 1.9458 +[titan] 2025-09-10 07:23:29,738 - root - INFO - lr: 4.4128e-06 gnorm: 0.42 [2 days, 7:47:01<17:15:19] +[titan] 2025-09-10 07:24:01,978 - root - INFO - step: 30555 loss: 2.5773 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.40 mfu: 48.98% global_avg_ntp_loss: 0.7049 global_avg_top_loss: 1.8725 +[titan] 2025-09-10 07:24:01,978 - root - INFO - lr: 4.4104e-06 gnorm: 0.40 [2 days, 7:47:33<17:14:46] +[titan] 2025-09-10 07:24:34,110 - root - INFO - step: 30560 loss: 2.6094 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.7145 global_avg_top_loss: 1.8948 +[titan] 2025-09-10 07:24:34,111 - root - INFO - lr: 4.4080e-06 gnorm: 0.41 [2 days, 7:48:05<17:14:13] +[titan] 2025-09-10 07:25:06,329 - root - INFO - step: 30565 loss: 2.7287 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.72 mfu: 49.01% global_avg_ntp_loss: 0.7661 global_avg_top_loss: 1.9626 +[titan] 2025-09-10 07:25:06,329 - root - INFO - lr: 4.4055e-06 gnorm: 0.42 [2 days, 7:48:38<17:13:40] +[titan] 2025-09-10 07:25:38,683 - root - INFO - step: 30570 loss: 2.7107 memory: 122.03GiB(87.57%) tps: 10,128 tflops: 482.70 mfu: 48.81% global_avg_ntp_loss: 0.7657 global_avg_top_loss: 1.9449 +[titan] 2025-09-10 07:25:38,684 - root - INFO - lr: 4.4031e-06 gnorm: 0.42 [2 days, 7:49:10<17:13:07] +[titan] 2025-09-10 07:26:10,740 - root - INFO - step: 30575 loss: 2.6381 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.17 mfu: 49.26% global_avg_ntp_loss: 0.7283 global_avg_top_loss: 1.9098 +[titan] 2025-09-10 07:26:10,740 - root - INFO - lr: 4.4007e-06 gnorm: 0.39 [2 days, 7:49:42<17:12:34] +[titan] 2025-09-10 07:26:42,972 - root - INFO - step: 30580 loss: 2.7007 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.53 mfu: 48.99% global_avg_ntp_loss: 0.7530 global_avg_top_loss: 1.9477 +[titan] 
2025-09-10 07:26:42,972 - root - INFO - lr: 4.3983e-06 gnorm: 0.41 [2 days, 7:50:14<17:12:01] +[titan] 2025-09-10 07:27:15,157 - root - INFO - step: 30585 loss: 2.5989 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.7100 global_avg_top_loss: 1.8889 +[titan] 2025-09-10 07:27:15,158 - root - INFO - lr: 4.3958e-06 gnorm: 0.42 [2 days, 7:50:46<17:11:28] +[titan] 2025-09-10 07:27:47,117 - root - INFO - step: 30590 loss: 2.5543 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.6887 global_avg_top_loss: 1.8656 +[titan] 2025-09-10 07:27:47,118 - root - INFO - lr: 4.3934e-06 gnorm: 0.41 [2 days, 7:51:18<17:10:55] +[titan] 2025-09-10 07:28:19,163 - root - INFO - step: 30595 loss: 2.6301 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7236 global_avg_top_loss: 1.9065 +[titan] 2025-09-10 07:28:19,163 - root - INFO - lr: 4.3910e-06 gnorm: 0.43 [2 days, 7:51:50<17:10:22] +[titan] 2025-09-10 07:28:44,726 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:28:51,100 - root - INFO - step: 30600 loss: 2.6645 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7380 global_avg_top_loss: 1.9265 +[titan] 2025-09-10 07:28:51,100 - root - INFO - lr: 4.3886e-06 gnorm: 0.41 [2 days, 7:52:22<17:09:48] +[titan] 2025-09-10 07:29:23,392 - root - INFO - step: 30605 loss: 2.6768 memory: 122.03GiB(87.57%) tps: 10,147 tflops: 483.62 mfu: 48.90% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9295 +[titan] 2025-09-10 07:29:23,392 - root - INFO - lr: 4.3861e-06 gnorm: 0.40 [2 days, 7:52:55<17:09:15] +[titan] 2025-09-10 07:29:55,499 - root - INFO - step: 30610 loss: 2.6058 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.7104 global_avg_top_loss: 1.8954 +[titan] 2025-09-10 07:29:55,500 - root - INFO - lr: 4.3837e-06 gnorm: 0.42 [2 days, 7:53:27<17:08:42] +[titan] 2025-09-10 07:30:27,561 - root - INFO - step: 30615 loss: 2.6302 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.09 mfu: 49.25% global_avg_ntp_loss: 0.7228 global_avg_top_loss: 1.9074 +[titan] 2025-09-10 07:30:27,562 - root - INFO - lr: 4.3813e-06 gnorm: 0.40 [2 days, 7:53:59<17:08:09] +[titan] 2025-09-10 07:30:59,607 - root - INFO - step: 30620 loss: 2.6508 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9201 +[titan] 2025-09-10 07:30:59,607 - root - INFO - lr: 4.3789e-06 gnorm: 0.40 [2 days, 7:54:31<17:07:36] +[titan] 2025-09-10 07:31:31,583 - root - INFO - step: 30625 loss: 2.6327 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7224 global_avg_top_loss: 1.9103 +[titan] 2025-09-10 07:31:31,583 - root - INFO - lr: 4.3765e-06 gnorm: 0.42 [2 days, 7:55:03<17:07:03] +[titan] 2025-09-10 07:32:03,635 - root - INFO - step: 30630 loss: 2.6634 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.7412 global_avg_top_loss: 1.9222 +[titan] 
2025-09-10 07:32:03,635 - root - INFO - lr: 4.3740e-06 gnorm: 0.40 [2 days, 7:55:35<17:06:30] +[titan] 2025-09-10 07:32:35,804 - root - INFO - step: 30635 loss: 2.6623 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.7384 global_avg_top_loss: 1.9239 +[titan] 2025-09-10 07:32:35,804 - root - INFO - lr: 4.3716e-06 gnorm: 0.41 [2 days, 7:56:07<17:05:57] +[titan] 2025-09-10 07:33:07,986 - root - INFO - step: 30640 loss: 2.6868 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7497 global_avg_top_loss: 1.9371 +[titan] 2025-09-10 07:33:07,987 - root - INFO - lr: 4.3692e-06 gnorm: 0.42 [2 days, 7:56:39<17:05:24] +[titan] 2025-09-10 07:33:40,132 - root - INFO - step: 30645 loss: 2.6705 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7442 global_avg_top_loss: 1.9263 +[titan] 2025-09-10 07:33:40,132 - root - INFO - lr: 4.3668e-06 gnorm: 0.43 [2 days, 7:57:11<17:04:51] +[titan] 2025-09-10 07:34:05,795 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:34:12,229 - root - INFO - step: 30650 loss: 2.7030 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.7587 global_avg_top_loss: 1.9443 +[titan] 2025-09-10 07:34:12,229 - root - INFO - lr: 4.3644e-06 gnorm: 0.43 [2 days, 7:57:44<17:04:18] +[titan] 2025-09-10 07:34:44,353 - root - INFO - step: 30655 loss: 2.6632 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.7388 global_avg_top_loss: 1.9245 +[titan] 2025-09-10 07:34:44,353 - root - INFO - lr: 4.3620e-06 gnorm: 0.42 [2 days, 7:58:16<17:03:44] +[titan] 2025-09-10 07:35:16,546 - root - INFO - step: 30660 loss: 2.6200 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.10 mfu: 49.05% global_avg_ntp_loss: 0.7175 global_avg_top_loss: 1.9025 +[titan] 2025-09-10 07:35:16,547 - root - INFO - lr: 4.3596e-06 gnorm: 0.42 [2 days, 7:58:48<17:03:11] +[titan] 2025-09-10 07:35:48,614 - root - INFO - step: 30665 loss: 2.5918 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7062 global_avg_top_loss: 1.8855 +[titan] 2025-09-10 07:35:48,614 - root - INFO - lr: 4.3572e-06 gnorm: 0.39 [2 days, 7:59:20<17:02:38] +[titan] 2025-09-10 07:36:20,585 - root - INFO - step: 30670 loss: 2.7741 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.8137 global_avg_top_loss: 1.9604 +[titan] 2025-09-10 07:36:20,585 - root - INFO - lr: 4.3548e-06 gnorm: 0.45 [2 days, 7:59:52<17:02:05] +[titan] 2025-09-10 07:36:52,576 - root - INFO - step: 30675 loss: 2.5475 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.6845 global_avg_top_loss: 1.8630 +[titan] 2025-09-10 07:36:52,577 - root - INFO - lr: 4.3523e-06 gnorm: 0.40 [2 days, 8:00:24<17:01:32] +[titan] 2025-09-10 07:37:24,290 - root - INFO - step: 30680 loss: 3.1197 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.44 mfu: 49.79% global_avg_ntp_loss: 1.0040 global_avg_top_loss: 2.1156 +[titan] 
2025-09-10 07:37:24,291 - root - INFO - lr: 4.3499e-06 gnorm: 0.42 [2 days, 8:00:56<17:00:59] +[titan] 2025-09-10 07:37:56,649 - root - INFO - step: 30685 loss: 2.6413 memory: 122.03GiB(87.57%) tps: 10,127 tflops: 482.64 mfu: 48.80% global_avg_ntp_loss: 0.7316 global_avg_top_loss: 1.9097 +[titan] 2025-09-10 07:37:56,649 - root - INFO - lr: 4.3475e-06 gnorm: 0.42 [2 days, 8:01:28<17:00:26] +[titan] 2025-09-10 07:38:28,696 - root - INFO - step: 30690 loss: 2.6185 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.7186 global_avg_top_loss: 1.8999 +[titan] 2025-09-10 07:38:28,697 - root - INFO - lr: 4.3451e-06 gnorm: 0.42 [2 days, 8:02:00<16:59:53] +[titan] 2025-09-10 07:39:00,695 - root - INFO - step: 30695 loss: 2.5439 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.6844 global_avg_top_loss: 1.8595 +[titan] 2025-09-10 07:39:00,695 - root - INFO - lr: 4.3427e-06 gnorm: 0.40 [2 days, 8:02:32<16:59:20] +[titan] 2025-09-10 07:39:26,314 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:39:32,756 - root - INFO - step: 30700 loss: 2.6609 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9255 +[titan] 2025-09-10 07:39:32,756 - root - INFO - lr: 4.3403e-06 gnorm: 0.40 [2 days, 8:03:04<16:58:46] +[titan] 2025-09-10 07:40:04,649 - root - INFO - step: 30705 loss: 2.7127 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7637 global_avg_top_loss: 1.9490 +[titan] 2025-09-10 07:40:04,649 - root - INFO - lr: 4.3379e-06 gnorm: 0.43 [2 days, 8:03:36<16:58:13] +[titan] 2025-09-10 07:40:36,725 - root - INFO - step: 30710 loss: 2.7145 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.7646 global_avg_top_loss: 1.9499 +[titan] 2025-09-10 07:40:36,725 - root - INFO - lr: 4.3355e-06 gnorm: 0.43 [2 days, 8:04:08<16:57:40] +[titan] 2025-09-10 07:41:08,861 - root - INFO - step: 30715 loss: 2.7085 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.7587 global_avg_top_loss: 1.9498 +[titan] 2025-09-10 07:41:08,861 - root - INFO - lr: 4.3331e-06 gnorm: 0.42 [2 days, 8:04:40<16:57:07] +[titan] 2025-09-10 07:41:41,092 - root - INFO - step: 30720 loss: 2.8183 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.54 mfu: 48.99% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 1.9957 +[titan] 2025-09-10 07:41:41,092 - root - INFO - lr: 4.3307e-06 gnorm: 0.42 [2 days, 8:05:12<16:56:34] +[titan] 2025-09-10 07:41:41,327 - root - INFO - Dumping profiler traces at step 30720 +[titan] 2025-09-10 07:41:41,382 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-10 07:42:13,254 - root - INFO - step: 30725 loss: 2.6235 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.57 mfu: 49.10% global_avg_ntp_loss: 0.7227 global_avg_top_loss: 1.9009 +[titan] 2025-09-10 07:42:13,255 - root - INFO - lr: 4.3283e-06 gnorm: 0.43 [2 days, 8:05:45<16:56:01] +[titan] 2025-09-10 
07:42:45,532 - root - INFO - step: 30730 loss: 2.6734 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.84 mfu: 48.92% global_avg_ntp_loss: 0.7444 global_avg_top_loss: 1.9289 +[titan] 2025-09-10 07:42:45,532 - root - INFO - lr: 4.3259e-06 gnorm: 0.43 [2 days, 8:06:17<16:55:28] +[titan] 2025-09-10 07:43:17,675 - root - INFO - step: 30735 loss: 2.6368 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.87 mfu: 49.13% global_avg_ntp_loss: 0.7250 global_avg_top_loss: 1.9118 +[titan] 2025-09-10 07:43:17,675 - root - INFO - lr: 4.3235e-06 gnorm: 0.43 [2 days, 8:06:49<16:54:55] +[titan] 2025-09-10 07:43:49,976 - root - INFO - step: 30740 loss: 2.7286 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.49 mfu: 48.89% global_avg_ntp_loss: 0.7734 global_avg_top_loss: 1.9552 +[titan] 2025-09-10 07:43:49,976 - root - INFO - lr: 4.3211e-06 gnorm: 0.45 [2 days, 8:07:21<16:54:22] +[titan] 2025-09-10 07:44:21,908 - root - INFO - step: 30745 loss: 3.0737 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.9851 global_avg_top_loss: 2.0886 +[titan] 2025-09-10 07:44:21,908 - root - INFO - lr: 4.3188e-06 gnorm: 0.39 [2 days, 8:07:53<16:53:49] +[titan] 2025-09-10 07:44:47,327 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:44:53,722 - root - INFO - step: 30750 loss: 2.5566 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.64% global_avg_ntp_loss: 0.6883 global_avg_top_loss: 1.8683 +[titan] 2025-09-10 07:44:53,722 - root - INFO - lr: 4.3164e-06 gnorm: 0.42 [2 days, 8:08:25<16:53:15] +[titan] 2025-09-10 07:45:25,907 - root - INFO - step: 30755 loss: 2.5416 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.6816 global_avg_top_loss: 1.8600 +[titan] 2025-09-10 07:45:25,907 - root - INFO - lr: 4.3140e-06 gnorm: 0.43 [2 days, 8:08:57<16:52:42] +[titan] 2025-09-10 07:45:58,286 - root - INFO - step: 30760 loss: 2.6360 memory: 122.03GiB(87.57%) tps: 10,120 tflops: 482.33 mfu: 48.77% global_avg_ntp_loss: 0.7333 global_avg_top_loss: 1.9026 +[titan] 2025-09-10 07:45:58,286 - root - INFO - lr: 4.3116e-06 gnorm: 0.45 [2 days, 8:09:30<16:52:09] +[titan] 2025-09-10 07:46:30,193 - root - INFO - step: 30765 loss: 2.6138 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.8971 +[titan] 2025-09-10 07:46:30,193 - root - INFO - lr: 4.3092e-06 gnorm: 0.44 [2 days, 8:10:01<16:51:36] +[titan] 2025-09-10 07:47:02,204 - root - INFO - step: 30770 loss: 2.4471 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.6430 global_avg_top_loss: 1.8040 +[titan] 2025-09-10 07:47:02,204 - root - INFO - lr: 4.3068e-06 gnorm: 0.42 [2 days, 8:10:33<16:51:03] +[titan] 2025-09-10 07:47:34,455 - root - INFO - step: 30775 loss: 2.5877 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.23 mfu: 48.96% global_avg_ntp_loss: 0.7043 global_avg_top_loss: 1.8834 +[titan] 2025-09-10 07:47:34,456 - root - INFO - lr: 4.3044e-06 gnorm: 0.40 [2 days, 8:11:06<16:50:30] +[titan] 2025-09-10 07:48:06,376 - root - INFO - step: 30780 loss: 2.7247 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9562 +[titan] 
2025-09-10 07:48:06,377 - root - INFO - lr: 4.3020e-06 gnorm: 0.42 [2 days, 8:11:38<16:49:57] +[titan] 2025-09-10 07:48:38,596 - root - INFO - step: 30785 loss: 2.6817 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.7469 global_avg_top_loss: 1.9348 +[titan] 2025-09-10 07:48:38,596 - root - INFO - lr: 4.2997e-06 gnorm: 0.45 [2 days, 8:12:10<16:49:24] +[titan] 2025-09-10 07:49:10,479 - root - INFO - step: 30790 loss: 2.6750 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7494 global_avg_top_loss: 1.9256 +[titan] 2025-09-10 07:49:10,479 - root - INFO - lr: 4.2973e-06 gnorm: 0.44 [2 days, 8:12:42<16:48:51] +[titan] 2025-09-10 07:49:42,630 - root - INFO - step: 30795 loss: 2.6935 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.11% global_avg_ntp_loss: 0.7532 global_avg_top_loss: 1.9404 +[titan] 2025-09-10 07:49:42,630 - root - INFO - lr: 4.2949e-06 gnorm: 0.42 [2 days, 8:13:14<16:48:18] +[titan] 2025-09-10 07:50:08,258 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:50:14,791 - root - INFO - step: 30800 loss: 2.5233 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.59 mfu: 49.10% global_avg_ntp_loss: 0.6714 global_avg_top_loss: 1.8519 +[titan] 2025-09-10 07:50:14,792 - root - INFO - lr: 4.2925e-06 gnorm: 0.41 [2 days, 8:13:46<16:47:45] +[titan] 2025-09-10 07:50:46,813 - root - INFO - step: 30805 loss: 2.6577 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9226 +[titan] 2025-09-10 07:50:46,813 - root - INFO - lr: 4.2901e-06 gnorm: 0.47 [2 days, 8:14:18<16:47:11] +[titan] 2025-09-10 07:51:18,959 - root - INFO - step: 30810 loss: 2.7218 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.81 mfu: 49.12% global_avg_ntp_loss: 0.7663 global_avg_top_loss: 1.9556 +[titan] 2025-09-10 07:51:18,960 - root - INFO - lr: 4.2878e-06 gnorm: 0.44 [2 days, 8:14:50<16:46:38] +[titan] 2025-09-10 07:51:51,025 - root - INFO - step: 30815 loss: 2.7011 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7553 global_avg_top_loss: 1.9459 +[titan] 2025-09-10 07:51:51,025 - root - INFO - lr: 4.2854e-06 gnorm: 0.44 [2 days, 8:15:22<16:46:05] +[titan] 2025-09-10 07:52:23,117 - root - INFO - step: 30820 loss: 2.6715 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7424 global_avg_top_loss: 1.9291 +[titan] 2025-09-10 07:52:23,117 - root - INFO - lr: 4.2830e-06 gnorm: 0.43 [2 days, 8:15:54<16:45:32] +[titan] 2025-09-10 07:52:55,132 - root - INFO - step: 30825 loss: 3.0338 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.9574 global_avg_top_loss: 2.0764 +[titan] 2025-09-10 07:52:55,132 - root - INFO - lr: 4.2806e-06 gnorm: 0.39 [2 days, 8:16:26<16:44:59] +[titan] 2025-09-10 07:53:27,135 - root - INFO - step: 30830 loss: 2.5380 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.6802 global_avg_top_loss: 1.8577 +[titan] 
2025-09-10 07:53:27,135 - root - INFO - lr: 4.2782e-06 gnorm: 0.41 [2 days, 8:16:58<16:44:26] +[titan] 2025-09-10 07:53:59,423 - root - INFO - step: 30835 loss: 2.6286 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.68 mfu: 48.91% global_avg_ntp_loss: 0.7224 global_avg_top_loss: 1.9063 +[titan] 2025-09-10 07:53:59,423 - root - INFO - lr: 4.2759e-06 gnorm: 0.47 [2 days, 8:17:31<16:43:53] +[titan] 2025-09-10 07:54:31,328 - root - INFO - step: 30840 loss: 2.6749 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9327 +[titan] 2025-09-10 07:54:31,329 - root - INFO - lr: 4.2735e-06 gnorm: 0.42 [2 days, 8:18:03<16:43:20] +[titan] 2025-09-10 07:55:03,324 - root - INFO - step: 30845 loss: 2.6359 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.7272 global_avg_top_loss: 1.9087 +[titan] 2025-09-10 07:55:03,325 - root - INFO - lr: 4.2711e-06 gnorm: 0.41 [2 days, 8:18:35<16:42:47] +[titan] 2025-09-10 07:55:28,912 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 07:55:35,338 - root - INFO - step: 30850 loss: 2.6465 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7285 global_avg_top_loss: 1.9180 +[titan] 2025-09-10 07:55:35,338 - root - INFO - lr: 4.2688e-06 gnorm: 0.44 [2 days, 8:19:07<16:42:14] +[titan] 2025-09-10 07:56:07,479 - root - INFO - step: 30855 loss: 2.5512 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.6874 global_avg_top_loss: 1.8638 +[titan] 2025-09-10 07:56:07,479 - root - INFO - lr: 4.2664e-06 gnorm: 0.41 [2 days, 8:19:39<16:41:40] +[titan] 2025-09-10 07:56:39,397 - root - INFO - step: 30860 loss: 2.5755 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.6991 global_avg_top_loss: 1.8763 +[titan] 2025-09-10 07:56:39,397 - root - INFO - lr: 4.2640e-06 gnorm: 0.42 [2 days, 8:20:11<16:41:07] +[titan] 2025-09-10 07:57:11,423 - root - INFO - step: 30865 loss: 2.6722 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.7455 global_avg_top_loss: 1.9267 +[titan] 2025-09-10 07:57:11,423 - root - INFO - lr: 4.2617e-06 gnorm: 0.43 [2 days, 8:20:43<16:40:34] +[titan] 2025-09-10 07:57:43,482 - root - INFO - step: 30870 loss: 2.8226 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.8371 global_avg_top_loss: 1.9855 +[titan] 2025-09-10 07:57:43,483 - root - INFO - lr: 4.2593e-06 gnorm: 0.43 [2 days, 8:21:15<16:40:01] +[titan] 2025-09-10 07:58:15,587 - root - INFO - step: 30875 loss: 2.6185 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.19% global_avg_ntp_loss: 0.7195 global_avg_top_loss: 1.8990 +[titan] 2025-09-10 07:58:15,587 - root - INFO - lr: 4.2569e-06 gnorm: 0.42 [2 days, 8:21:47<16:39:28] +[titan] 2025-09-10 07:58:47,707 - root - INFO - step: 30880 loss: 2.6381 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.7272 global_avg_top_loss: 1.9109 +[titan] 
2025-09-10 07:58:47,707 - root - INFO - lr: 4.2546e-06 gnorm: 0.42 [2 days, 8:22:19<16:38:55] +[titan] 2025-09-10 07:59:19,890 - root - INFO - step: 30885 loss: 2.7614 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7808 global_avg_top_loss: 1.9806 +[titan] 2025-09-10 07:59:19,890 - root - INFO - lr: 4.2522e-06 gnorm: 0.42 [2 days, 8:22:51<16:38:22] +[titan] 2025-09-10 07:59:51,715 - root - INFO - step: 30890 loss: 3.1679 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.72 mfu: 49.62% global_avg_ntp_loss: 1.0223 global_avg_top_loss: 2.1456 +[titan] 2025-09-10 07:59:51,715 - root - INFO - lr: 4.2498e-06 gnorm: 0.44 [2 days, 8:23:23<16:37:49] +[titan] 2025-09-10 08:00:23,847 - root - INFO - step: 30895 loss: 2.6784 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.7428 global_avg_top_loss: 1.9355 +[titan] 2025-09-10 08:00:23,847 - root - INFO - lr: 4.2475e-06 gnorm: 0.44 [2 days, 8:23:55<16:37:16] +[titan] 2025-09-10 08:00:49,565 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:00:56,063 - root - INFO - step: 30900 loss: 2.6619 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.76 mfu: 49.02% global_avg_ntp_loss: 0.7382 global_avg_top_loss: 1.9237 +[titan] 2025-09-10 08:00:56,064 - root - INFO - lr: 4.2451e-06 gnorm: 0.43 [2 days, 8:24:27<16:36:43] +[titan] 2025-09-10 08:01:28,261 - root - INFO - step: 30905 loss: 3.0699 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.03 mfu: 49.04% global_avg_ntp_loss: 0.9763 global_avg_top_loss: 2.0936 +[titan] 2025-09-10 08:01:28,262 - root - INFO - lr: 4.2428e-06 gnorm: 0.41 [2 days, 8:25:00<16:36:10] +[titan] 2025-09-10 08:02:00,113 - root - INFO - step: 30910 loss: 2.6150 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.7151 global_avg_top_loss: 1.8999 +[titan] 2025-09-10 08:02:00,114 - root - INFO - lr: 4.2404e-06 gnorm: 0.42 [2 days, 8:25:31<16:35:36] +[titan] 2025-09-10 08:02:32,659 - root - INFO - step: 30915 loss: 2.7304 memory: 122.03GiB(87.57%) tps: 10,068 tflops: 479.85 mfu: 48.52% global_avg_ntp_loss: 0.7731 global_avg_top_loss: 1.9573 +[titan] 2025-09-10 08:02:32,659 - root - INFO - lr: 4.2380e-06 gnorm: 0.45 [2 days, 8:26:04<16:35:03] +[titan] 2025-09-10 08:03:04,662 - root - INFO - step: 30920 loss: 2.5366 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.6829 global_avg_top_loss: 1.8537 +[titan] 2025-09-10 08:03:04,662 - root - INFO - lr: 4.2357e-06 gnorm: 0.41 [2 days, 8:26:36<16:34:30] +[titan] 2025-09-10 08:03:36,769 - root - INFO - step: 30925 loss: 2.6658 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.7376 global_avg_top_loss: 1.9282 +[titan] 2025-09-10 08:03:36,769 - root - INFO - lr: 4.2333e-06 gnorm: 0.43 [2 days, 8:27:08<16:33:57] +[titan] 2025-09-10 08:04:08,659 - root - INFO - step: 30930 loss: 2.6122 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.7130 global_avg_top_loss: 1.8992 +[titan] 
2025-09-10 08:04:08,660 - root - INFO - lr: 4.2310e-06 gnorm: 0.44 [2 days, 8:27:40<16:33:24] +[titan] 2025-09-10 08:04:40,798 - root - INFO - step: 30935 loss: 2.6129 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.94 mfu: 49.13% global_avg_ntp_loss: 0.7145 global_avg_top_loss: 1.8985 +[titan] 2025-09-10 08:04:40,798 - root - INFO - lr: 4.2286e-06 gnorm: 0.43 [2 days, 8:28:12<16:32:51] +[titan] 2025-09-10 08:05:12,814 - root - INFO - step: 30940 loss: 2.6631 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7375 global_avg_top_loss: 1.9256 +[titan] 2025-09-10 08:05:12,815 - root - INFO - lr: 4.2263e-06 gnorm: 0.51 [2 days, 8:28:44<16:32:18] +[titan] 2025-09-10 08:05:44,652 - root - INFO - step: 30945 loss: 2.7549 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9656 +[titan] 2025-09-10 08:05:44,652 - root - INFO - lr: 4.2239e-06 gnorm: 0.45 [2 days, 8:29:16<16:31:45] +[titan] 2025-09-10 08:06:10,234 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:06:16,671 - root - INFO - step: 30950 loss: 2.6297 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7256 global_avg_top_loss: 1.9041 +[titan] 2025-09-10 08:06:16,671 - root - INFO - lr: 4.2216e-06 gnorm: 0.46 [2 days, 8:29:48<16:31:12] +[titan] 2025-09-10 08:06:48,741 - root - INFO - step: 30955 loss: 2.5965 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.7109 global_avg_top_loss: 1.8856 +[titan] 2025-09-10 08:06:48,741 - root - INFO - lr: 4.2192e-06 gnorm: 0.41 [2 days, 8:30:20<16:30:39] +[titan] 2025-09-10 08:07:20,613 - root - INFO - step: 30960 loss: 2.6292 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.7233 global_avg_top_loss: 1.9059 +[titan] 2025-09-10 08:07:20,613 - root - INFO - lr: 4.2169e-06 gnorm: 0.44 [2 days, 8:30:52<16:30:05] +[titan] 2025-09-10 08:07:52,526 - root - INFO - step: 30965 loss: 2.7218 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.7645 global_avg_top_loss: 1.9572 +[titan] 2025-09-10 08:07:52,526 - root - INFO - lr: 4.2145e-06 gnorm: 0.44 [2 days, 8:31:24<16:29:32] +[titan] 2025-09-10 08:08:24,658 - root - INFO - step: 30970 loss: 2.6466 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.7341 global_avg_top_loss: 1.9124 +[titan] 2025-09-10 08:08:24,658 - root - INFO - lr: 4.2122e-06 gnorm: 0.41 [2 days, 8:31:56<16:28:59] +[titan] 2025-09-10 08:08:56,587 - root - INFO - step: 30975 loss: 2.6131 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.7168 global_avg_top_loss: 1.8963 +[titan] 2025-09-10 08:08:56,588 - root - INFO - lr: 4.2098e-06 gnorm: 0.42 [2 days, 8:32:28<16:28:26] +[titan] 2025-09-10 08:09:28,568 - root - INFO - step: 30980 loss: 2.6862 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7466 global_avg_top_loss: 1.9396 +[titan] 
2025-09-10 08:09:28,568 - root - INFO - lr: 4.2075e-06 gnorm: 0.42 [2 days, 8:33:00<16:27:53] +[titan] 2025-09-10 08:10:00,473 - root - INFO - step: 30985 loss: 3.0004 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.9437 global_avg_top_loss: 2.0567 +[titan] 2025-09-10 08:10:00,474 - root - INFO - lr: 4.2052e-06 gnorm: 0.41 [2 days, 8:33:32<16:27:20] +[titan] 2025-09-10 08:10:32,506 - root - INFO - step: 30990 loss: 2.6064 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.7110 global_avg_top_loss: 1.8954 +[titan] 2025-09-10 08:10:32,506 - root - INFO - lr: 4.2028e-06 gnorm: 0.45 [2 days, 8:34:04<16:26:47] +[titan] 2025-09-10 08:11:04,608 - root - INFO - step: 30995 loss: 2.6566 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7352 global_avg_top_loss: 1.9214 +[titan] 2025-09-10 08:11:04,608 - root - INFO - lr: 4.2005e-06 gnorm: 0.48 [2 days, 8:34:36<16:26:14] +[titan] 2025-09-10 08:11:30,255 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:11:36,703 - root - INFO - step: 31000 loss: 2.6616 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7372 global_avg_top_loss: 1.9245 +[titan] 2025-09-10 08:11:36,703 - root - INFO - lr: 4.1981e-06 gnorm: 0.43 [2 days, 8:35:08<16:25:41] +[titan] 2025-09-10 08:12:08,774 - root - INFO - step: 31005 loss: 2.6882 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.7496 global_avg_top_loss: 1.9386 +[titan] 2025-09-10 08:12:08,774 - root - INFO - lr: 4.1958e-06 gnorm: 0.43 [2 days, 8:35:40<16:25:08] +[titan] 2025-09-10 08:12:40,692 - root - INFO - step: 31010 loss: 2.6733 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7429 global_avg_top_loss: 1.9304 +[titan] 2025-09-10 08:12:40,693 - root - INFO - lr: 4.1935e-06 gnorm: 0.44 [2 days, 8:36:12<16:24:34] +[titan] 2025-09-10 08:13:12,759 - root - INFO - step: 31015 loss: 2.5866 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7035 global_avg_top_loss: 1.8830 +[titan] 2025-09-10 08:13:12,760 - root - INFO - lr: 4.1911e-06 gnorm: 0.42 [2 days, 8:36:44<16:24:01] +[titan] 2025-09-10 08:13:44,878 - root - INFO - step: 31020 loss: 2.5460 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 0.6861 global_avg_top_loss: 1.8599 +[titan] 2025-09-10 08:13:44,879 - root - INFO - lr: 4.1888e-06 gnorm: 0.41 [2 days, 8:37:16<16:23:28] +[titan] 2025-09-10 08:14:16,914 - root - INFO - step: 31025 loss: 2.6685 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7420 global_avg_top_loss: 1.9265 +[titan] 2025-09-10 08:14:16,914 - root - INFO - lr: 4.1865e-06 gnorm: 0.43 [2 days, 8:37:48<16:22:55] +[titan] 2025-09-10 08:14:49,059 - root - INFO - step: 31030 loss: 2.7250 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9573 +[titan] 
2025-09-10 08:14:49,060 - root - INFO - lr: 4.1841e-06 gnorm: 0.46 [2 days, 8:38:20<16:22:22] +[titan] 2025-09-10 08:15:20,878 - root - INFO - step: 31035 loss: 2.6602 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.82 mfu: 49.63% global_avg_ntp_loss: 0.7367 global_avg_top_loss: 1.9235 +[titan] 2025-09-10 08:15:20,878 - root - INFO - lr: 4.1818e-06 gnorm: 0.42 [2 days, 8:38:52<16:21:49] +[titan] 2025-09-10 08:15:52,947 - root - INFO - step: 31040 loss: 2.7139 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.7671 global_avg_top_loss: 1.9468 +[titan] 2025-09-10 08:15:52,947 - root - INFO - lr: 4.1795e-06 gnorm: 0.41 [2 days, 8:39:24<16:21:16] +[titan] 2025-09-10 08:16:24,893 - root - INFO - step: 31045 loss: 2.6046 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.7134 global_avg_top_loss: 1.8911 +[titan] 2025-09-10 08:16:24,894 - root - INFO - lr: 4.1772e-06 gnorm: 0.41 [2 days, 8:39:56<16:20:43] +[titan] 2025-09-10 08:16:50,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:16:56,988 - root - INFO - step: 31050 loss: 2.6908 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9424 +[titan] 2025-09-10 08:16:56,988 - root - INFO - lr: 4.1748e-06 gnorm: 0.47 [2 days, 8:40:28<16:20:10] +[titan] 2025-09-10 08:17:29,057 - root - INFO - step: 31055 loss: 2.6662 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7414 global_avg_top_loss: 1.9248 +[titan] 2025-09-10 08:17:29,057 - root - INFO - lr: 4.1725e-06 gnorm: 0.43 [2 days, 8:41:00<16:19:37] +[titan] 2025-09-10 08:18:01,040 - root - INFO - step: 31060 loss: 2.6697 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9291 +[titan] 2025-09-10 08:18:01,040 - root - INFO - lr: 4.1702e-06 gnorm: 0.43 [2 days, 8:41:32<16:19:04] +[titan] 2025-09-10 08:18:33,201 - root - INFO - step: 31065 loss: 2.7910 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.58 mfu: 49.10% global_avg_ntp_loss: 0.8182 global_avg_top_loss: 1.9727 +[titan] 2025-09-10 08:18:33,202 - root - INFO - lr: 4.1678e-06 gnorm: 0.43 [2 days, 8:42:04<16:18:30] +[titan] 2025-09-10 08:19:05,285 - root - INFO - step: 31070 loss: 2.5052 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.6658 global_avg_top_loss: 1.8394 +[titan] 2025-09-10 08:19:05,285 - root - INFO - lr: 4.1655e-06 gnorm: 0.53 [2 days, 8:42:37<16:17:57] +[titan] 2025-09-10 08:19:37,233 - root - INFO - step: 31075 loss: 2.5312 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.6782 global_avg_top_loss: 1.8531 +[titan] 2025-09-10 08:19:37,234 - root - INFO - lr: 4.1632e-06 gnorm: 0.45 [2 days, 8:43:08<16:17:24] +[titan] 2025-09-10 08:20:09,148 - root - INFO - step: 31080 loss: 2.5726 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8792 +[titan] 
2025-09-10 08:20:09,148 - root - INFO - lr: 4.1609e-06 gnorm: 0.43 [2 days, 8:43:40<16:16:51] +[titan] 2025-09-10 08:20:41,404 - root - INFO - step: 31085 loss: 2.7438 memory: 122.03GiB(87.57%) tps: 10,159 tflops: 484.16 mfu: 48.95% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9562 +[titan] 2025-09-10 08:20:41,404 - root - INFO - lr: 4.1586e-06 gnorm: 0.43 [2 days, 8:44:13<16:16:18] +[titan] 2025-09-10 08:21:13,495 - root - INFO - step: 31090 loss: 2.7628 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7897 global_avg_top_loss: 1.9730 +[titan] 2025-09-10 08:21:13,496 - root - INFO - lr: 4.1562e-06 gnorm: 0.43 [2 days, 8:44:45<16:15:45] +[titan] 2025-09-10 08:21:45,363 - root - INFO - step: 31095 loss: 2.6156 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.7150 global_avg_top_loss: 1.9006 +[titan] 2025-09-10 08:21:45,364 - root - INFO - lr: 4.1539e-06 gnorm: 0.43 [2 days, 8:45:17<16:15:12] +[titan] 2025-09-10 08:22:11,020 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:22:17,471 - root - INFO - step: 31100 loss: 2.6941 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9360 +[titan] 2025-09-10 08:22:17,471 - root - INFO - lr: 4.1516e-06 gnorm: 0.43 [2 days, 8:45:49<16:14:39] +[titan] 2025-09-10 08:22:49,754 - root - INFO - step: 31105 loss: 2.6848 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.76 mfu: 48.91% global_avg_ntp_loss: 0.7494 global_avg_top_loss: 1.9353 +[titan] 2025-09-10 08:22:49,754 - root - INFO - lr: 4.1493e-06 gnorm: 0.41 [2 days, 8:46:21<16:14:06] +[titan] 2025-09-10 08:23:21,704 - root - INFO - step: 31110 loss: 2.6817 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.79 mfu: 49.42% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9341 +[titan] 2025-09-10 08:23:21,705 - root - INFO - lr: 4.1470e-06 gnorm: 0.47 [2 days, 8:46:53<16:13:33] +[titan] 2025-09-10 08:23:53,680 - root - INFO - step: 31115 loss: 2.6177 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.38% global_avg_ntp_loss: 0.7166 global_avg_top_loss: 1.9011 +[titan] 2025-09-10 08:23:53,680 - root - INFO - lr: 4.1447e-06 gnorm: 0.42 [2 days, 8:47:25<16:13:00] +[titan] 2025-09-10 08:24:25,874 - root - INFO - step: 31120 loss: 2.6036 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.7113 global_avg_top_loss: 1.8924 +[titan] 2025-09-10 08:24:25,875 - root - INFO - lr: 4.1424e-06 gnorm: 0.46 [2 days, 8:47:57<16:12:27] +[titan] 2025-09-10 08:24:57,971 - root - INFO - step: 31125 loss: 2.5644 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.6990 global_avg_top_loss: 1.8653 +[titan] 2025-09-10 08:24:57,971 - root - INFO - lr: 4.1400e-06 gnorm: 0.43 [2 days, 8:48:29<16:11:54] +[titan] 2025-09-10 08:25:29,949 - root - INFO - step: 31130 loss: 2.6961 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7532 global_avg_top_loss: 1.9429 +[titan] 
2025-09-10 08:25:29,950 - root - INFO - lr: 4.1377e-06 gnorm: 0.43 [2 days, 8:49:01<16:11:20] +[titan] 2025-09-10 08:26:01,916 - root - INFO - step: 31135 loss: 2.6264 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7267 global_avg_top_loss: 1.8997 +[titan] 2025-09-10 08:26:01,916 - root - INFO - lr: 4.1354e-06 gnorm: 0.41 [2 days, 8:49:33<16:10:47] +[titan] 2025-09-10 08:26:33,921 - root - INFO - step: 31140 loss: 2.6254 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.9077 +[titan] 2025-09-10 08:26:33,921 - root - INFO - lr: 4.1331e-06 gnorm: 0.42 [2 days, 8:50:05<16:10:14] +[titan] 2025-09-10 08:27:05,946 - root - INFO - step: 31145 loss: 2.5105 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.6713 global_avg_top_loss: 1.8392 +[titan] 2025-09-10 08:27:05,946 - root - INFO - lr: 4.1308e-06 gnorm: 0.41 [2 days, 8:50:37<16:09:41] +[titan] 2025-09-10 08:27:31,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:27:37,931 - root - INFO - step: 31150 loss: 2.5373 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.6780 global_avg_top_loss: 1.8593 +[titan] 2025-09-10 08:27:37,931 - root - INFO - lr: 4.1285e-06 gnorm: 0.42 [2 days, 8:51:09<16:09:08] +[titan] 2025-09-10 08:28:09,902 - root - INFO - step: 31155 loss: 2.5920 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.7010 global_avg_top_loss: 1.8909 +[titan] 2025-09-10 08:28:09,902 - root - INFO - lr: 4.1262e-06 gnorm: 0.55 [2 days, 8:51:41<16:08:35] +[titan] 2025-09-10 08:28:41,776 - root - INFO - step: 31160 loss: 2.7661 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.96 mfu: 49.54% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9649 +[titan] 2025-09-10 08:28:41,776 - root - INFO - lr: 4.1239e-06 gnorm: 0.42 [2 days, 8:52:13<16:08:02] +[titan] 2025-09-10 08:29:13,710 - root - INFO - step: 31165 loss: 2.6294 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.7245 global_avg_top_loss: 1.9049 +[titan] 2025-09-10 08:29:13,710 - root - INFO - lr: 4.1216e-06 gnorm: 0.44 [2 days, 8:52:45<16:07:29] +[titan] 2025-09-10 08:29:45,690 - root - INFO - step: 31170 loss: 2.6629 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7372 global_avg_top_loss: 1.9256 +[titan] 2025-09-10 08:29:45,690 - root - INFO - lr: 4.1193e-06 gnorm: 0.43 [2 days, 8:53:17<16:06:56] +[titan] 2025-09-10 08:30:17,759 - root - INFO - step: 31175 loss: 2.6263 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7215 global_avg_top_loss: 1.9048 +[titan] 2025-09-10 08:30:17,759 - root - INFO - lr: 4.1170e-06 gnorm: 0.42 [2 days, 8:53:49<16:06:23] +[titan] 2025-09-10 08:30:49,678 - root - INFO - step: 31180 loss: 2.6727 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7402 global_avg_top_loss: 1.9326 +[titan] 
2025-09-10 08:30:49,678 - root - INFO - lr: 4.1147e-06 gnorm: 0.43 [2 days, 8:54:21<16:05:49] +[titan] 2025-09-10 08:31:21,687 - root - INFO - step: 31185 loss: 2.7500 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7883 global_avg_top_loss: 1.9617 +[titan] 2025-09-10 08:31:21,687 - root - INFO - lr: 4.1124e-06 gnorm: 0.42 [2 days, 8:54:53<16:05:16] +[titan] 2025-09-10 08:31:53,596 - root - INFO - step: 31190 loss: 2.8203 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.8124 global_avg_top_loss: 2.0080 +[titan] 2025-09-10 08:31:53,596 - root - INFO - lr: 4.1101e-06 gnorm: 0.48 [2 days, 8:55:25<16:04:43] +[titan] 2025-09-10 08:32:25,408 - root - INFO - step: 31195 loss: 2.6866 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.92 mfu: 49.64% global_avg_ntp_loss: 0.7501 global_avg_top_loss: 1.9365 +[titan] 2025-09-10 08:32:25,409 - root - INFO - lr: 4.1078e-06 gnorm: 0.43 [2 days, 8:55:57<16:04:10] +[titan] 2025-09-10 08:32:51,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:32:57,553 - root - INFO - step: 31200 loss: 2.7203 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.85 mfu: 49.13% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9564 +[titan] 2025-09-10 08:32:57,553 - root - INFO - lr: 4.1055e-06 gnorm: 0.43 [2 days, 8:56:29<16:03:37] +[titan] 2025-09-10 08:33:29,476 - root - INFO - step: 31205 loss: 2.6888 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 0.7505 global_avg_top_loss: 1.9383 +[titan] 2025-09-10 08:33:29,476 - root - INFO - lr: 4.1032e-06 gnorm: 0.44 [2 days, 8:57:01<16:03:04] +[titan] 2025-09-10 08:34:01,473 - root - INFO - step: 31210 loss: 2.6517 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7363 global_avg_top_loss: 1.9154 +[titan] 2025-09-10 08:34:01,473 - root - INFO - lr: 4.1009e-06 gnorm: 0.46 [2 days, 8:57:33<16:02:31] +[titan] 2025-09-10 08:34:33,472 - root - INFO - step: 31215 loss: 2.6380 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.7273 global_avg_top_loss: 1.9108 +[titan] 2025-09-10 08:34:33,472 - root - INFO - lr: 4.0986e-06 gnorm: 0.44 [2 days, 8:58:05<16:01:58] +[titan] 2025-09-10 08:35:05,499 - root - INFO - step: 31220 loss: 3.1186 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.30% global_avg_ntp_loss: 0.9956 global_avg_top_loss: 2.1230 +[titan] 2025-09-10 08:35:05,499 - root - INFO - lr: 4.0963e-06 gnorm: 0.49 [2 days, 8:58:37<16:01:25] +[titan] 2025-09-10 08:35:37,487 - root - INFO - step: 31225 loss: 2.5660 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.36% global_avg_ntp_loss: 0.6933 global_avg_top_loss: 1.8727 +[titan] 2025-09-10 08:35:37,487 - root - INFO - lr: 4.0940e-06 gnorm: 0.42 [2 days, 8:59:09<16:00:52] +[titan] 2025-09-10 08:36:09,642 - root - INFO - step: 31230 loss: 2.6312 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.69 mfu: 49.11% global_avg_ntp_loss: 0.7224 global_avg_top_loss: 1.9088 +[titan] 
2025-09-10 08:36:09,642 - root - INFO - lr: 4.0917e-06 gnorm: 0.43 [2 days, 8:59:41<16:00:18] +[titan] 2025-09-10 08:36:22,579 - root - INFO - Dumping profiler traces at step 31232 +[titan] 2025-09-10 08:36:22,653 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 08:36:41,712 - root - INFO - step: 31235 loss: 2.4202 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.6267 global_avg_top_loss: 1.7936 +[titan] 2025-09-10 08:36:41,712 - root - INFO - lr: 4.0895e-06 gnorm: 0.54 [2 days, 9:00:13<15:59:45] +[titan] 2025-09-10 08:37:13,585 - root - INFO - step: 31240 loss: 2.6233 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7227 global_avg_top_loss: 1.9007 +[titan] 2025-09-10 08:37:13,585 - root - INFO - lr: 4.0872e-06 gnorm: 0.45 [2 days, 9:00:45<15:59:12] +[titan] 2025-09-10 08:37:45,715 - root - INFO - step: 31245 loss: 2.5818 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7002 global_avg_top_loss: 1.8816 +[titan] 2025-09-10 08:37:45,716 - root - INFO - lr: 4.0849e-06 gnorm: 0.45 [2 days, 9:01:17<15:58:39] +[titan] 2025-09-10 08:38:11,341 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:38:17,659 - root - INFO - step: 31250 loss: 2.6305 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.7218 global_avg_top_loss: 1.9086 +[titan] 2025-09-10 08:38:17,659 - root - INFO - lr: 4.0826e-06 gnorm: 0.44 [2 days, 9:01:49<15:58:06] +[titan] 2025-09-10 08:38:49,788 - root - INFO - step: 31255 loss: 2.5672 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.6957 global_avg_top_loss: 1.8715 +[titan] 2025-09-10 08:38:49,788 - root - INFO - lr: 4.0803e-06 gnorm: 0.41 [2 days, 9:02:21<15:57:33] +[titan] 2025-09-10 08:39:21,735 - root - INFO - step: 31260 loss: 2.6794 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.7460 global_avg_top_loss: 1.9334 +[titan] 2025-09-10 08:39:21,735 - root - INFO - lr: 4.0780e-06 gnorm: 0.44 [2 days, 9:02:53<15:57:00] +[titan] 2025-09-10 08:39:53,718 - root - INFO - step: 31265 loss: 2.7179 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7634 global_avg_top_loss: 1.9545 +[titan] 2025-09-10 08:39:53,718 - root - INFO - lr: 4.0758e-06 gnorm: 0.43 [2 days, 9:03:25<15:56:27] +[titan] 2025-09-10 08:40:25,681 - root - INFO - step: 31270 loss: 2.6403 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.7250 global_avg_top_loss: 1.9154 +[titan] 2025-09-10 08:40:25,681 - root - INFO - lr: 4.0735e-06 gnorm: 0.48 [2 days, 9:03:57<15:55:54] +[titan] 2025-09-10 08:40:57,800 - root - INFO - step: 31275 loss: 2.6514 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 0.7378 global_avg_top_loss: 1.9136 +[titan] 2025-09-10 08:40:57,800 - root - INFO - lr: 4.0712e-06 gnorm: 0.42 [2 days, 9:04:29<15:55:21] +[titan] 2025-09-10 08:41:29,860 - root - INFO - step: 31280 loss: 2.6574 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7379 global_avg_top_loss: 1.9195 +[titan] 
2025-09-10 08:41:29,860 - root - INFO - lr: 4.0689e-06 gnorm: 0.45 [2 days, 9:05:01<15:54:48] +[titan] 2025-09-10 08:42:02,180 - root - INFO - step: 31285 loss: 2.6212 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.21 mfu: 48.86% global_avg_ntp_loss: 0.7203 global_avg_top_loss: 1.9009 +[titan] 2025-09-10 08:42:02,180 - root - INFO - lr: 4.0666e-06 gnorm: 0.42 [2 days, 9:05:33<15:54:15] +[titan] 2025-09-10 08:42:34,202 - root - INFO - step: 31290 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9494 +[titan] 2025-09-10 08:42:34,202 - root - INFO - lr: 4.0644e-06 gnorm: 0.43 [2 days, 9:06:05<15:53:42] +[titan] 2025-09-10 08:43:06,305 - root - INFO - step: 31295 loss: 2.6478 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9171 +[titan] 2025-09-10 08:43:06,306 - root - INFO - lr: 4.0621e-06 gnorm: 0.43 [2 days, 9:06:38<15:53:09] +[titan] 2025-09-10 08:43:31,849 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:43:38,263 - root - INFO - step: 31300 loss: 3.0230 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.9428 global_avg_top_loss: 2.0802 +[titan] 2025-09-10 08:43:38,264 - root - INFO - lr: 4.0598e-06 gnorm: 0.48 [2 days, 9:07:09<15:52:35] +[titan] 2025-09-10 08:44:10,528 - root - INFO - step: 31305 loss: 2.5262 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.04 mfu: 48.94% global_avg_ntp_loss: 0.6746 global_avg_top_loss: 1.8516 +[titan] 2025-09-10 08:44:10,528 - root - INFO - lr: 4.0576e-06 gnorm: 0.44 [2 days, 9:07:42<15:52:02] +[titan] 2025-09-10 08:44:42,713 - root - INFO - step: 31310 loss: 2.6226 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.22 mfu: 49.06% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.9049 +[titan] 2025-09-10 08:44:42,714 - root - INFO - lr: 4.0553e-06 gnorm: 0.44 [2 days, 9:08:14<15:51:29] +[titan] 2025-09-10 08:45:14,542 - root - INFO - step: 31315 loss: 2.6207 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7149 global_avg_top_loss: 1.9059 +[titan] 2025-09-10 08:45:14,542 - root - INFO - lr: 4.0530e-06 gnorm: 0.53 [2 days, 9:08:46<15:50:56] +[titan] 2025-09-10 08:45:46,455 - root - INFO - step: 31320 loss: 3.1083 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.9954 global_avg_top_loss: 2.1130 +[titan] 2025-09-10 08:45:46,455 - root - INFO - lr: 4.0507e-06 gnorm: 0.43 [2 days, 9:09:18<15:50:23] +[titan] 2025-09-10 08:46:18,249 - root - INFO - step: 31325 loss: 2.9339 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.20 mfu: 49.67% global_avg_ntp_loss: 0.8972 global_avg_top_loss: 2.0367 +[titan] 2025-09-10 08:46:18,249 - root - INFO - lr: 4.0485e-06 gnorm: 0.41 [2 days, 9:09:49<15:49:50] +[titan] 2025-09-10 08:46:50,492 - root - INFO - step: 31330 loss: 2.6757 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.36 mfu: 48.98% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9309 +[titan] 
2025-09-10 08:46:50,492 - root - INFO - lr: 4.0462e-06 gnorm: 0.43 [2 days, 9:10:22<15:49:17] +[titan] 2025-09-10 08:47:22,400 - root - INFO - step: 31335 loss: 2.5792 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7020 global_avg_top_loss: 1.8772 +[titan] 2025-09-10 08:47:22,400 - root - INFO - lr: 4.0439e-06 gnorm: 0.43 [2 days, 9:10:54<15:48:44] +[titan] 2025-09-10 08:47:54,492 - root - INFO - step: 31340 loss: 2.7521 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7915 global_avg_top_loss: 1.9607 +[titan] 2025-09-10 08:47:54,493 - root - INFO - lr: 4.0417e-06 gnorm: 0.51 [2 days, 9:11:26<15:48:11] +[titan] 2025-09-10 08:48:26,873 - root - INFO - step: 31345 loss: 2.6636 memory: 122.03GiB(87.57%) tps: 10,120 tflops: 482.31 mfu: 48.77% global_avg_ntp_loss: 0.7435 global_avg_top_loss: 1.9201 +[titan] 2025-09-10 08:48:26,873 - root - INFO - lr: 4.0394e-06 gnorm: 0.43 [2 days, 9:11:58<15:47:38] +[titan] 2025-09-10 08:48:52,183 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:48:58,703 - root - INFO - step: 31350 loss: 2.6900 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.63 mfu: 49.61% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9389 +[titan] 2025-09-10 08:48:58,704 - root - INFO - lr: 4.0371e-06 gnorm: 0.48 [2 days, 9:12:30<15:47:05] +[titan] 2025-09-10 08:49:30,607 - root - INFO - step: 31355 loss: 2.6145 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7148 global_avg_top_loss: 1.8997 +[titan] 2025-09-10 08:49:30,608 - root - INFO - lr: 4.0349e-06 gnorm: 0.45 [2 days, 9:13:02<15:46:32] +[titan] 2025-09-10 08:50:02,421 - root - INFO - step: 31360 loss: 2.5785 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.64% global_avg_ntp_loss: 0.6990 global_avg_top_loss: 1.8795 +[titan] 2025-09-10 08:50:02,421 - root - INFO - lr: 4.0326e-06 gnorm: 0.42 [2 days, 9:13:34<15:45:58] +[titan] 2025-09-10 08:50:34,458 - root - INFO - step: 31365 loss: 2.6966 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.7542 global_avg_top_loss: 1.9424 +[titan] 2025-09-10 08:50:34,459 - root - INFO - lr: 4.0304e-06 gnorm: 0.44 [2 days, 9:14:06<15:45:25] +[titan] 2025-09-10 08:51:06,810 - root - INFO - step: 31370 loss: 2.6533 memory: 122.03GiB(87.57%) tps: 10,129 tflops: 482.74 mfu: 48.81% global_avg_ntp_loss: 0.7359 global_avg_top_loss: 1.9174 +[titan] 2025-09-10 08:51:06,810 - root - INFO - lr: 4.0281e-06 gnorm: 0.43 [2 days, 9:14:38<15:44:52] +[titan] 2025-09-10 08:51:39,050 - root - INFO - step: 31375 loss: 2.6357 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.40 mfu: 48.98% global_avg_ntp_loss: 0.7266 global_avg_top_loss: 1.9091 +[titan] 2025-09-10 08:51:39,051 - root - INFO - lr: 4.0259e-06 gnorm: 0.43 [2 days, 9:15:10<15:44:19] +[titan] 2025-09-10 08:52:10,889 - root - INFO - step: 31380 loss: 2.6251 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.51 mfu: 49.60% global_avg_ntp_loss: 0.7198 global_avg_top_loss: 1.9053 +[titan] 
2025-09-10 08:52:10,889 - root - INFO - lr: 4.0236e-06 gnorm: 0.45 [2 days, 9:15:42<15:43:46] +[titan] 2025-09-10 08:52:42,811 - root - INFO - step: 31385 loss: 2.5224 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.6738 global_avg_top_loss: 1.8487 +[titan] 2025-09-10 08:52:42,811 - root - INFO - lr: 4.0213e-06 gnorm: 0.43 [2 days, 9:16:14<15:43:13] +[titan] 2025-09-10 08:53:14,811 - root - INFO - step: 31390 loss: 2.5449 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.6816 global_avg_top_loss: 1.8632 +[titan] 2025-09-10 08:53:14,811 - root - INFO - lr: 4.0191e-06 gnorm: 0.44 [2 days, 9:16:46<15:42:40] +[titan] 2025-09-10 08:53:46,908 - root - INFO - step: 31395 loss: 2.6501 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7310 global_avg_top_loss: 1.9191 +[titan] 2025-09-10 08:53:46,908 - root - INFO - lr: 4.0168e-06 gnorm: 0.53 [2 days, 9:17:18<15:42:07] +[titan] 2025-09-10 08:54:12,292 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:54:18,862 - root - INFO - step: 31400 loss: 2.6413 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9131 +[titan] 2025-09-10 08:54:18,862 - root - INFO - lr: 4.0146e-06 gnorm: 0.45 [2 days, 9:17:50<15:41:34] +[titan] 2025-09-10 08:54:50,801 - root - INFO - step: 31405 loss: 2.6367 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7238 global_avg_top_loss: 1.9129 +[titan] 2025-09-10 08:54:50,802 - root - INFO - lr: 4.0123e-06 gnorm: 0.42 [2 days, 9:18:22<15:41:01] +[titan] 2025-09-10 08:55:22,741 - root - INFO - step: 31410 loss: 2.6362 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7273 global_avg_top_loss: 1.9089 +[titan] 2025-09-10 08:55:22,741 - root - INFO - lr: 4.0101e-06 gnorm: 0.42 [2 days, 9:18:54<15:40:28] +[titan] 2025-09-10 08:55:54,661 - root - INFO - step: 31415 loss: 2.5507 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.6849 global_avg_top_loss: 1.8658 +[titan] 2025-09-10 08:55:54,661 - root - INFO - lr: 4.0078e-06 gnorm: 0.43 [2 days, 9:19:26<15:39:55] +[titan] 2025-09-10 08:56:26,696 - root - INFO - step: 31420 loss: 2.7318 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9624 +[titan] 2025-09-10 08:56:26,696 - root - INFO - lr: 4.0056e-06 gnorm: 0.46 [2 days, 9:19:58<15:39:22] +[titan] 2025-09-10 08:56:58,757 - root - INFO - step: 31425 loss: 2.7738 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9806 +[titan] 2025-09-10 08:56:58,757 - root - INFO - lr: 4.0033e-06 gnorm: 0.46 [2 days, 9:20:30<15:38:49] +[titan] 2025-09-10 08:57:30,630 - root - INFO - step: 31430 loss: 3.1960 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 1.0399 global_avg_top_loss: 2.1562 +[titan] 
2025-09-10 08:57:30,630 - root - INFO - lr: 4.0011e-06 gnorm: 0.47 [2 days, 9:21:02<15:38:15] +[titan] 2025-09-10 08:58:02,626 - root - INFO - step: 31435 loss: 2.6186 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7166 global_avg_top_loss: 1.9020 +[titan] 2025-09-10 08:58:02,626 - root - INFO - lr: 3.9989e-06 gnorm: 0.43 [2 days, 9:21:34<15:37:42] +[titan] 2025-09-10 08:58:34,527 - root - INFO - step: 31440 loss: 2.7441 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9671 +[titan] 2025-09-10 08:58:34,528 - root - INFO - lr: 3.9966e-06 gnorm: 0.44 [2 days, 9:22:06<15:37:09] +[titan] 2025-09-10 08:59:06,311 - root - INFO - step: 31445 loss: 2.6558 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7305 global_avg_top_loss: 1.9253 +[titan] 2025-09-10 08:59:06,311 - root - INFO - lr: 3.9944e-06 gnorm: 0.43 [2 days, 9:22:38<15:36:36] +[titan] 2025-09-10 08:59:31,977 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 08:59:38,465 - root - INFO - step: 31450 loss: 2.6161 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.70 mfu: 49.11% global_avg_ntp_loss: 0.7161 global_avg_top_loss: 1.9000 +[titan] 2025-09-10 08:59:38,465 - root - INFO - lr: 3.9921e-06 gnorm: 0.45 [2 days, 9:23:10<15:36:03] +[titan] 2025-09-10 09:00:10,322 - root - INFO - step: 31455 loss: 2.6105 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7148 global_avg_top_loss: 1.8957 +[titan] 2025-09-10 09:00:10,323 - root - INFO - lr: 3.9899e-06 gnorm: 0.45 [2 days, 9:23:42<15:35:30] +[titan] 2025-09-10 09:00:42,287 - root - INFO - step: 31460 loss: 2.6358 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7245 global_avg_top_loss: 1.9113 +[titan] 2025-09-10 09:00:42,287 - root - INFO - lr: 3.9877e-06 gnorm: 0.44 [2 days, 9:24:13<15:34:57] +[titan] 2025-09-10 09:01:14,191 - root - INFO - step: 31465 loss: 2.5850 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8841 +[titan] 2025-09-10 09:01:14,192 - root - INFO - lr: 3.9854e-06 gnorm: 0.43 [2 days, 9:24:45<15:34:24] +[titan] 2025-09-10 09:01:46,324 - root - INFO - step: 31470 loss: 2.5063 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.6665 global_avg_top_loss: 1.8398 +[titan] 2025-09-10 09:01:46,324 - root - INFO - lr: 3.9832e-06 gnorm: 0.42 [2 days, 9:25:18<15:33:51] +[titan] 2025-09-10 09:02:18,246 - root - INFO - step: 31475 loss: 2.6256 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.9120 +[titan] 2025-09-10 09:02:18,246 - root - INFO - lr: 3.9809e-06 gnorm: 0.52 [2 days, 9:25:49<15:33:18] +[titan] 2025-09-10 09:02:50,306 - root - INFO - step: 31480 loss: 2.6317 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7250 global_avg_top_loss: 1.9067 +[titan] 
2025-09-10 09:02:50,306 - root - INFO - lr: 3.9787e-06 gnorm: 0.44 [2 days, 9:26:21<15:32:45] +[titan] 2025-09-10 09:03:22,456 - root - INFO - step: 31485 loss: 2.5851 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.77 mfu: 49.12% global_avg_ntp_loss: 0.7017 global_avg_top_loss: 1.8835 +[titan] 2025-09-10 09:03:22,456 - root - INFO - lr: 3.9765e-06 gnorm: 0.43 [2 days, 9:26:54<15:32:12] +[titan] 2025-09-10 09:03:54,607 - root - INFO - step: 31490 loss: 2.6743 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9338 +[titan] 2025-09-10 09:03:54,607 - root - INFO - lr: 3.9742e-06 gnorm: 0.44 [2 days, 9:27:26<15:31:39] +[titan] 2025-09-10 09:04:26,557 - root - INFO - step: 31495 loss: 2.5456 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.6832 global_avg_top_loss: 1.8624 +[titan] 2025-09-10 09:04:26,557 - root - INFO - lr: 3.9720e-06 gnorm: 0.45 [2 days, 9:27:58<15:31:05] +[titan] 2025-09-10 09:04:52,115 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:04:58,457 - root - INFO - step: 31500 loss: 2.6857 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9352 +[titan] 2025-09-10 09:04:58,458 - root - INFO - lr: 3.9698e-06 gnorm: 0.44 [2 days, 9:28:30<15:30:32] +[titan] 2025-09-10 09:05:30,566 - root - INFO - step: 31505 loss: 2.6793 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.7453 global_avg_top_loss: 1.9340 +[titan] 2025-09-10 09:05:30,566 - root - INFO - lr: 3.9676e-06 gnorm: 0.44 [2 days, 9:29:02<15:29:59] +[titan] 2025-09-10 09:06:02,543 - root - INFO - step: 31510 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7791 global_avg_top_loss: 1.9608 +[titan] 2025-09-10 09:06:02,543 - root - INFO - lr: 3.9653e-06 gnorm: 0.46 [2 days, 9:29:34<15:29:26] +[titan] 2025-09-10 09:06:34,384 - root - INFO - step: 31515 loss: 2.6389 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.7291 global_avg_top_loss: 1.9098 +[titan] 2025-09-10 09:06:34,384 - root - INFO - lr: 3.9631e-06 gnorm: 0.44 [2 days, 9:30:06<15:28:53] +[titan] 2025-09-10 09:07:06,450 - root - INFO - step: 31520 loss: 2.6967 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9429 +[titan] 2025-09-10 09:07:06,451 - root - INFO - lr: 3.9609e-06 gnorm: 0.43 [2 days, 9:30:38<15:28:20] +[titan] 2025-09-10 09:07:38,363 - root - INFO - step: 31525 loss: 2.6380 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7245 global_avg_top_loss: 1.9135 +[titan] 2025-09-10 09:07:38,363 - root - INFO - lr: 3.9587e-06 gnorm: 0.44 [2 days, 9:31:10<15:27:47] +[titan] 2025-09-10 09:08:10,269 - root - INFO - step: 31530 loss: 2.7675 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9702 +[titan] 
2025-09-10 09:08:10,270 - root - INFO - lr: 3.9564e-06 gnorm: 0.44 [2 days, 9:31:41<15:27:14] +[titan] 2025-09-10 09:08:42,214 - root - INFO - step: 31535 loss: 2.6173 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7199 global_avg_top_loss: 1.8973 +[titan] 2025-09-10 09:08:42,214 - root - INFO - lr: 3.9542e-06 gnorm: 0.44 [2 days, 9:32:13<15:26:41] +[titan] 2025-09-10 09:09:14,278 - root - INFO - step: 31540 loss: 2.5886 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7035 global_avg_top_loss: 1.8851 +[titan] 2025-09-10 09:09:14,279 - root - INFO - lr: 3.9520e-06 gnorm: 0.46 [2 days, 9:32:45<15:26:08] +[titan] 2025-09-10 09:09:46,121 - root - INFO - step: 31545 loss: 2.5655 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8733 +[titan] 2025-09-10 09:09:46,122 - root - INFO - lr: 3.9498e-06 gnorm: 0.44 [2 days, 9:33:17<15:25:35] +[titan] 2025-09-10 09:10:11,582 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:10:17,989 - root - INFO - step: 31550 loss: 2.5638 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.6939 global_avg_top_loss: 1.8698 +[titan] 2025-09-10 09:10:17,990 - root - INFO - lr: 3.9476e-06 gnorm: 0.45 [2 days, 9:33:49<15:25:02] +[titan] 2025-09-10 09:10:50,028 - root - INFO - step: 31555 loss: 2.5504 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.6836 global_avg_top_loss: 1.8668 +[titan] 2025-09-10 09:10:50,028 - root - INFO - lr: 3.9453e-06 gnorm: 0.50 [2 days, 9:34:21<15:24:29] +[titan] 2025-09-10 09:11:21,764 - root - INFO - step: 31560 loss: 2.7030 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.10 mfu: 49.76% global_avg_ntp_loss: 0.7582 global_avg_top_loss: 1.9449 +[titan] 2025-09-10 09:11:21,764 - root - INFO - lr: 3.9431e-06 gnorm: 0.44 [2 days, 9:34:53<15:23:55] +[titan] 2025-09-10 09:11:53,718 - root - INFO - step: 31565 loss: 2.9512 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.9048 global_avg_top_loss: 2.0464 +[titan] 2025-09-10 09:11:53,718 - root - INFO - lr: 3.9409e-06 gnorm: 0.45 [2 days, 9:35:25<15:23:22] +[titan] 2025-09-10 09:12:25,703 - root - INFO - step: 31570 loss: 2.5201 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.6724 global_avg_top_loss: 1.8477 +[titan] 2025-09-10 09:12:25,703 - root - INFO - lr: 3.9387e-06 gnorm: 0.43 [2 days, 9:35:57<15:22:49] +[titan] 2025-09-10 09:12:57,596 - root - INFO - step: 31575 loss: 2.4823 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.6538 global_avg_top_loss: 1.8285 +[titan] 2025-09-10 09:12:57,596 - root - INFO - lr: 3.9365e-06 gnorm: 0.46 [2 days, 9:36:29<15:22:16] +[titan] 2025-09-10 09:13:29,669 - root - INFO - step: 31580 loss: 2.6359 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.9104 +[titan] 
2025-09-10 09:13:29,670 - root - INFO - lr: 3.9343e-06 gnorm: 0.46 [2 days, 9:37:01<15:21:43] +[titan] 2025-09-10 09:14:01,634 - root - INFO - step: 31585 loss: 2.6981 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7542 global_avg_top_loss: 1.9439 +[titan] 2025-09-10 09:14:01,634 - root - INFO - lr: 3.9321e-06 gnorm: 0.47 [2 days, 9:37:33<15:21:10] +[titan] 2025-09-10 09:14:33,707 - root - INFO - step: 31590 loss: 2.6305 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7241 global_avg_top_loss: 1.9064 +[titan] 2025-09-10 09:14:33,707 - root - INFO - lr: 3.9298e-06 gnorm: 0.47 [2 days, 9:38:05<15:20:37] +[titan] 2025-09-10 09:15:05,721 - root - INFO - step: 31595 loss: 2.6560 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7323 global_avg_top_loss: 1.9237 +[titan] 2025-09-10 09:15:05,721 - root - INFO - lr: 3.9276e-06 gnorm: 0.45 [2 days, 9:38:37<15:20:04] +[titan] 2025-09-10 09:15:31,399 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:15:37,916 - root - INFO - step: 31600 loss: 2.6212 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.7158 global_avg_top_loss: 1.9053 +[titan] 2025-09-10 09:15:37,916 - root - INFO - lr: 3.9254e-06 gnorm: 0.46 [2 days, 9:39:09<15:19:31] +[titan] 2025-09-10 09:16:09,866 - root - INFO - step: 31605 loss: 2.5640 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.6936 global_avg_top_loss: 1.8703 +[titan] 2025-09-10 09:16:09,866 - root - INFO - lr: 3.9232e-06 gnorm: 0.42 [2 days, 9:39:41<15:18:58] +[titan] 2025-09-10 09:16:41,834 - root - INFO - step: 31610 loss: 2.6531 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7310 global_avg_top_loss: 1.9221 +[titan] 2025-09-10 09:16:41,835 - root - INFO - lr: 3.9210e-06 gnorm: 0.46 [2 days, 9:40:13<15:18:25] +[titan] 2025-09-10 09:17:13,670 - root - INFO - step: 31615 loss: 2.7286 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9601 +[titan] 2025-09-10 09:17:13,671 - root - INFO - lr: 3.9188e-06 gnorm: 0.45 [2 days, 9:40:45<15:17:52] +[titan] 2025-09-10 09:17:45,605 - root - INFO - step: 31620 loss: 2.6210 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7183 global_avg_top_loss: 1.9027 +[titan] 2025-09-10 09:17:45,605 - root - INFO - lr: 3.9166e-06 gnorm: 0.45 [2 days, 9:41:17<15:17:19] +[titan] 2025-09-10 09:18:17,542 - root - INFO - step: 31625 loss: 2.5510 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.6881 global_avg_top_loss: 1.8629 +[titan] 2025-09-10 09:18:17,542 - root - INFO - lr: 3.9144e-06 gnorm: 0.46 [2 days, 9:41:49<15:16:45] +[titan] 2025-09-10 09:18:49,415 - root - INFO - step: 31630 loss: 2.5749 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.6947 global_avg_top_loss: 1.8802 +[titan] 
2025-09-10 09:18:49,415 - root - INFO - lr: 3.9122e-06 gnorm: 0.43 [2 days, 9:42:21<15:16:12] +[titan] 2025-09-10 09:19:21,332 - root - INFO - step: 31635 loss: 2.5624 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.47% global_avg_ntp_loss: 0.6887 global_avg_top_loss: 1.8737 +[titan] 2025-09-10 09:19:21,332 - root - INFO - lr: 3.9100e-06 gnorm: 0.55 [2 days, 9:42:53<15:15:39] +[titan] 2025-09-10 09:19:53,354 - root - INFO - step: 31640 loss: 2.4947 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.6637 global_avg_top_loss: 1.8309 +[titan] 2025-09-10 09:19:53,355 - root - INFO - lr: 3.9078e-06 gnorm: 0.44 [2 days, 9:43:25<15:15:06] +[titan] 2025-09-10 09:20:25,420 - root - INFO - step: 31645 loss: 2.6240 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.25% global_avg_ntp_loss: 0.7160 global_avg_top_loss: 1.9080 +[titan] 2025-09-10 09:20:25,421 - root - INFO - lr: 3.9056e-06 gnorm: 0.46 [2 days, 9:43:57<15:14:33] +[titan] 2025-09-10 09:20:51,001 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:20:57,369 - root - INFO - step: 31650 loss: 2.6200 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7157 global_avg_top_loss: 1.9043 +[titan] 2025-09-10 09:20:57,370 - root - INFO - lr: 3.9034e-06 gnorm: 0.44 [2 days, 9:44:29<15:14:00] +[titan] 2025-09-10 09:21:29,269 - root - INFO - step: 31655 loss: 3.0354 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 0.9573 global_avg_top_loss: 2.0781 +[titan] 2025-09-10 09:21:29,270 - root - INFO - lr: 3.9012e-06 gnorm: 0.48 [2 days, 9:45:00<15:13:27] +[titan] 2025-09-10 09:22:01,402 - root - INFO - step: 31660 loss: 2.6330 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.7256 global_avg_top_loss: 1.9074 +[titan] 2025-09-10 09:22:01,402 - root - INFO - lr: 3.8990e-06 gnorm: 0.47 [2 days, 9:45:33<15:12:54] +[titan] 2025-09-10 09:22:33,432 - root - INFO - step: 31665 loss: 2.5918 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.7062 global_avg_top_loss: 1.8856 +[titan] 2025-09-10 09:22:33,433 - root - INFO - lr: 3.8968e-06 gnorm: 0.45 [2 days, 9:46:05<15:12:21] +[titan] 2025-09-10 09:23:05,360 - root - INFO - step: 31670 loss: 2.7203 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9549 +[titan] 2025-09-10 09:23:05,360 - root - INFO - lr: 3.8946e-06 gnorm: 0.46 [2 days, 9:46:37<15:11:48] +[titan] 2025-09-10 09:23:37,418 - root - INFO - step: 31675 loss: 2.6410 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.16 mfu: 49.26% global_avg_ntp_loss: 0.7272 global_avg_top_loss: 1.9137 +[titan] 2025-09-10 09:23:37,418 - root - INFO - lr: 3.8925e-06 gnorm: 0.42 [2 days, 9:47:09<15:11:15] +[titan] 2025-09-10 09:24:09,438 - root - INFO - step: 31680 loss: 2.5000 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.6630 global_avg_top_loss: 1.8371 +[titan] 
2025-09-10 09:24:09,438 - root - INFO - lr: 3.8903e-06 gnorm: 0.43 [2 days, 9:47:41<15:10:42] +[titan] 2025-09-10 09:24:41,230 - root - INFO - step: 31685 loss: 2.6322 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.24 mfu: 49.67% global_avg_ntp_loss: 0.7218 global_avg_top_loss: 1.9103 +[titan] 2025-09-10 09:24:41,230 - root - INFO - lr: 3.8881e-06 gnorm: 0.43 [2 days, 9:48:12<15:10:09] +[titan] 2025-09-10 09:25:13,243 - root - INFO - step: 31690 loss: 2.7618 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7935 global_avg_top_loss: 1.9683 +[titan] 2025-09-10 09:25:13,243 - root - INFO - lr: 3.8859e-06 gnorm: 0.57 [2 days, 9:48:44<15:09:36] +[titan] 2025-09-10 09:25:45,258 - root - INFO - step: 31695 loss: 2.7192 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9515 +[titan] 2025-09-10 09:25:45,258 - root - INFO - lr: 3.8837e-06 gnorm: 0.45 [2 days, 9:49:16<15:09:03] +[titan] 2025-09-10 09:26:10,703 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:26:17,136 - root - INFO - step: 31700 loss: 2.6645 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7360 global_avg_top_loss: 1.9285 +[titan] 2025-09-10 09:26:17,136 - root - INFO - lr: 3.8815e-06 gnorm: 0.46 [2 days, 9:49:48<15:08:30] +[titan] 2025-09-10 09:26:49,014 - root - INFO - step: 31705 loss: 2.4071 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.54% global_avg_ntp_loss: 0.6253 global_avg_top_loss: 1.7817 +[titan] 2025-09-10 09:26:49,014 - root - INFO - lr: 3.8793e-06 gnorm: 0.43 [2 days, 9:50:20<15:07:56] +[titan] 2025-09-10 09:27:20,901 - root - INFO - step: 31710 loss: 2.5525 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6849 global_avg_top_loss: 1.8676 +[titan] 2025-09-10 09:27:20,901 - root - INFO - lr: 3.8772e-06 gnorm: 0.42 [2 days, 9:50:52<15:07:23] +[titan] 2025-09-10 09:27:52,901 - root - INFO - step: 31715 loss: 2.5372 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.6785 global_avg_top_loss: 1.8587 +[titan] 2025-09-10 09:27:52,901 - root - INFO - lr: 3.8750e-06 gnorm: 0.60 [2 days, 9:51:24<15:06:50] +[titan] 2025-09-10 09:28:24,831 - root - INFO - step: 31720 loss: 2.6314 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7249 global_avg_top_loss: 1.9065 +[titan] 2025-09-10 09:28:24,832 - root - INFO - lr: 3.8728e-06 gnorm: 0.47 [2 days, 9:51:56<15:06:17] +[titan] 2025-09-10 09:28:56,750 - root - INFO - step: 31725 loss: 2.6535 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7396 global_avg_top_loss: 1.9139 +[titan] 2025-09-10 09:28:56,750 - root - INFO - lr: 3.8706e-06 gnorm: 0.47 [2 days, 9:52:28<15:05:44] +[titan] 2025-09-10 09:29:28,809 - root - INFO - step: 31730 loss: 2.6224 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.7199 global_avg_top_loss: 1.9025 +[titan] 
2025-09-10 09:29:28,810 - root - INFO - lr: 3.8684e-06 gnorm: 0.44 [2 days, 9:53:00<15:05:11] +[titan] 2025-09-10 09:30:00,889 - root - INFO - step: 31735 loss: 3.0359 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.9548 global_avg_top_loss: 2.0811 +[titan] 2025-09-10 09:30:00,889 - root - INFO - lr: 3.8663e-06 gnorm: 0.47 [2 days, 9:53:32<15:04:38] +[titan] 2025-09-10 09:30:32,794 - root - INFO - step: 31740 loss: 2.6665 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7403 global_avg_top_loss: 1.9263 +[titan] 2025-09-10 09:30:32,795 - root - INFO - lr: 3.8641e-06 gnorm: 0.47 [2 days, 9:54:04<15:04:05] +[titan] 2025-09-10 09:30:58,594 - root - INFO - Dumping profiler traces at step 31744 +[titan] 2025-09-10 09:30:58,650 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-10 09:31:04,987 - root - INFO - step: 31745 loss: 2.6970 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.11 mfu: 49.05% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9451 +[titan] 2025-09-10 09:31:04,987 - root - INFO - lr: 3.8619e-06 gnorm: 0.45 [2 days, 9:54:36<15:03:32] +[titan] 2025-09-10 09:31:30,665 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:31:37,053 - root - INFO - step: 31750 loss: 2.7459 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7824 global_avg_top_loss: 1.9635 +[titan] 2025-09-10 09:31:37,054 - root - INFO - lr: 3.8597e-06 gnorm: 0.46 [2 days, 9:55:08<15:02:59] +[titan] 2025-09-10 09:32:09,124 - root - INFO - step: 31755 loss: 2.5713 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.6957 global_avg_top_loss: 1.8756 +[titan] 2025-09-10 09:32:09,124 - root - INFO - lr: 3.8576e-06 gnorm: 0.42 [2 days, 9:55:40<15:02:26] +[titan] 2025-09-10 09:32:41,019 - root - INFO - step: 31760 loss: 2.5694 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.6936 global_avg_top_loss: 1.8759 +[titan] 2025-09-10 09:32:41,020 - root - INFO - lr: 3.8554e-06 gnorm: 0.44 [2 days, 9:56:12<15:01:53] +[titan] 2025-09-10 09:33:12,993 - root - INFO - step: 31765 loss: 2.5670 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.6985 global_avg_top_loss: 1.8685 +[titan] 2025-09-10 09:33:12,993 - root - INFO - lr: 3.8532e-06 gnorm: 0.50 [2 days, 9:56:44<15:01:20] +[titan] 2025-09-10 09:33:44,829 - root - INFO - step: 31770 loss: 2.5296 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.6760 global_avg_top_loss: 1.8536 +[titan] 2025-09-10 09:33:44,829 - root - INFO - lr: 3.8510e-06 gnorm: 0.58 [2 days, 9:57:16<15:00:47] +[titan] 2025-09-10 09:34:16,835 - root - INFO - step: 31775 loss: 2.6595 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.7362 global_avg_top_loss: 1.9233 +[titan] 2025-09-10 09:34:16,835 - root - INFO - lr: 3.8489e-06 gnorm: 0.51 [2 days, 9:57:48<15:00:14] +[titan] 2025-09-10 09:34:48,938 - root - INFO - step: 31780 loss: 2.6126 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.8990 +[titan] 
2025-09-10 09:34:48,939 - root - INFO - lr: 3.8467e-06 gnorm: 0.45 [2 days, 9:58:20<14:59:41] +[titan] 2025-09-10 09:35:20,915 - root - INFO - step: 31785 loss: 2.5441 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.6840 global_avg_top_loss: 1.8602 +[titan] 2025-09-10 09:35:20,915 - root - INFO - lr: 3.8445e-06 gnorm: 0.46 [2 days, 9:58:52<14:59:08] +[titan] 2025-09-10 09:35:52,762 - root - INFO - step: 31790 loss: 2.5595 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.6878 global_avg_top_loss: 1.8717 +[titan] 2025-09-10 09:35:52,763 - root - INFO - lr: 3.8424e-06 gnorm: 0.44 [2 days, 9:59:24<14:58:34] +[titan] 2025-09-10 09:36:24,578 - root - INFO - step: 31795 loss: 2.5777 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.87 mfu: 49.63% global_avg_ntp_loss: 0.6966 global_avg_top_loss: 1.8810 +[titan] 2025-09-10 09:36:24,578 - root - INFO - lr: 3.8402e-06 gnorm: 0.53 [2 days, 9:59:56<14:58:01] +[titan] 2025-09-10 09:36:50,204 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:36:56,756 - root - INFO - step: 31800 loss: 2.5961 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.34 mfu: 49.07% global_avg_ntp_loss: 0.7054 global_avg_top_loss: 1.8907 +[titan] 2025-09-10 09:36:56,756 - root - INFO - lr: 3.8381e-06 gnorm: 0.46 [2 days, 10:00:28<14:57:28] +[titan] 2025-09-10 09:37:28,582 - root - INFO - step: 31805 loss: 2.6804 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.70 mfu: 49.62% global_avg_ntp_loss: 0.7616 global_avg_top_loss: 1.9188 +[titan] 2025-09-10 09:37:28,582 - root - INFO - lr: 3.8359e-06 gnorm: 0.45 [2 days, 10:01:00<14:56:55] +[titan] 2025-09-10 09:38:00,666 - root - INFO - step: 31810 loss: 2.6485 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.7387 global_avg_top_loss: 1.9098 +[titan] 2025-09-10 09:38:00,666 - root - INFO - lr: 3.8337e-06 gnorm: 0.46 [2 days, 10:01:32<14:56:22] +[titan] 2025-09-10 09:38:32,543 - root - INFO - step: 31815 loss: 3.0119 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.9482 global_avg_top_loss: 2.0637 +[titan] 2025-09-10 09:38:32,544 - root - INFO - lr: 3.8316e-06 gnorm: 0.47 [2 days, 10:02:04<14:55:49] +[titan] 2025-09-10 09:39:04,509 - root - INFO - step: 31820 loss: 2.9061 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.8721 global_avg_top_loss: 2.0340 +[titan] 2025-09-10 09:39:04,509 - root - INFO - lr: 3.8294e-06 gnorm: 0.47 [2 days, 10:02:36<14:55:16] +[titan] 2025-09-10 09:39:36,573 - root - INFO - step: 31825 loss: 2.6910 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7566 global_avg_top_loss: 1.9345 +[titan] 2025-09-10 09:39:36,573 - root - INFO - lr: 3.8273e-06 gnorm: 0.47 [2 days, 10:03:08<14:54:43] +[titan] 2025-09-10 09:40:08,543 - root - INFO - step: 31830 loss: 2.6166 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7174 global_avg_top_loss: 1.8992 +[titan] 
2025-09-10 09:40:08,543 - root - INFO - lr: 3.8251e-06 gnorm: 0.47 [2 days, 10:03:40<14:54:10] +[titan] 2025-09-10 09:40:40,584 - root - INFO - step: 31835 loss: 2.7358 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7717 global_avg_top_loss: 1.9641 +[titan] 2025-09-10 09:40:40,584 - root - INFO - lr: 3.8229e-06 gnorm: 0.45 [2 days, 10:04:12<14:53:37] +[titan] 2025-09-10 09:41:12,645 - root - INFO - step: 31840 loss: 2.6032 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.7121 global_avg_top_loss: 1.8911 +[titan] 2025-09-10 09:41:12,646 - root - INFO - lr: 3.8208e-06 gnorm: 0.46 [2 days, 10:04:44<14:53:04] +[titan] 2025-09-10 09:41:44,457 - root - INFO - step: 31845 loss: 2.6810 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.92 mfu: 49.64% global_avg_ntp_loss: 0.7449 global_avg_top_loss: 1.9361 +[titan] 2025-09-10 09:41:44,458 - root - INFO - lr: 3.8186e-06 gnorm: 0.45 [2 days, 10:05:16<14:52:31] +[titan] 2025-09-10 09:42:10,062 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:42:16,475 - root - INFO - step: 31850 loss: 2.5933 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7062 global_avg_top_loss: 1.8871 +[titan] 2025-09-10 09:42:16,475 - root - INFO - lr: 3.8165e-06 gnorm: 0.44 [2 days, 10:05:48<14:51:58] +[titan] 2025-09-10 09:42:48,474 - root - INFO - step: 31855 loss: 2.6566 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.7360 global_avg_top_loss: 1.9206 +[titan] 2025-09-10 09:42:48,475 - root - INFO - lr: 3.8143e-06 gnorm: 0.46 [2 days, 10:06:20<14:51:25] +[titan] 2025-09-10 09:43:20,275 - root - INFO - step: 31860 loss: 2.6349 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7257 global_avg_top_loss: 1.9092 +[titan] 2025-09-10 09:43:20,275 - root - INFO - lr: 3.8122e-06 gnorm: 0.46 [2 days, 10:06:51<14:50:52] +[titan] 2025-09-10 09:43:52,203 - root - INFO - step: 31865 loss: 2.5751 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.6982 global_avg_top_loss: 1.8769 +[titan] 2025-09-10 09:43:52,203 - root - INFO - lr: 3.8100e-06 gnorm: 0.45 [2 days, 10:07:23<14:50:19] +[titan] 2025-09-10 09:44:24,279 - root - INFO - step: 31870 loss: 2.6189 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.7151 global_avg_top_loss: 1.9038 +[titan] 2025-09-10 09:44:24,280 - root - INFO - lr: 3.8079e-06 gnorm: 0.47 [2 days, 10:07:55<14:49:46] +[titan] 2025-09-10 09:44:56,251 - root - INFO - step: 31875 loss: 2.5439 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.6830 global_avg_top_loss: 1.8609 +[titan] 2025-09-10 09:44:56,251 - root - INFO - lr: 3.8058e-06 gnorm: 0.56 [2 days, 10:08:27<14:49:12] +[titan] 2025-09-10 09:45:28,169 - root - INFO - step: 31880 loss: 2.6308 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7238 global_avg_top_loss: 1.9070 +[titan] 
2025-09-10 09:45:28,169 - root - INFO - lr: 3.8036e-06 gnorm: 0.46 [2 days, 10:08:59<14:48:39] +[titan] 2025-09-10 09:46:00,169 - root - INFO - step: 31885 loss: 2.6644 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9221 +[titan] 2025-09-10 09:46:00,170 - root - INFO - lr: 3.8015e-06 gnorm: 0.47 [2 days, 10:09:31<14:48:06] +[titan] 2025-09-10 09:46:32,271 - root - INFO - step: 31890 loss: 2.6234 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.49 mfu: 49.19% global_avg_ntp_loss: 0.7220 global_avg_top_loss: 1.9013 +[titan] 2025-09-10 09:46:32,271 - root - INFO - lr: 3.7993e-06 gnorm: 0.46 [2 days, 10:10:03<14:47:33] +[titan] 2025-09-10 09:47:04,096 - root - INFO - step: 31895 loss: 3.0281 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.9539 global_avg_top_loss: 2.0742 +[titan] 2025-09-10 09:47:04,097 - root - INFO - lr: 3.7972e-06 gnorm: 0.46 [2 days, 10:10:35<14:47:00] +[titan] 2025-09-10 09:47:29,673 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:47:36,031 - root - INFO - step: 31900 loss: 2.6618 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7380 global_avg_top_loss: 1.9237 +[titan] 2025-09-10 09:47:36,031 - root - INFO - lr: 3.7950e-06 gnorm: 0.48 [2 days, 10:11:07<14:46:27] +[titan] 2025-09-10 09:48:07,954 - root - INFO - step: 31905 loss: 2.7126 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7615 global_avg_top_loss: 1.9511 +[titan] 2025-09-10 09:48:07,954 - root - INFO - lr: 3.7929e-06 gnorm: 0.47 [2 days, 10:11:39<14:45:54] +[titan] 2025-09-10 09:48:39,793 - root - INFO - step: 31910 loss: 3.0003 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.8931 global_avg_top_loss: 2.1073 +[titan] 2025-09-10 09:48:39,793 - root - INFO - lr: 3.7908e-06 gnorm: 0.48 [2 days, 10:12:11<14:45:21] +[titan] 2025-09-10 09:49:11,733 - root - INFO - step: 31915 loss: 2.5884 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7055 global_avg_top_loss: 1.8828 +[titan] 2025-09-10 09:49:11,733 - root - INFO - lr: 3.7886e-06 gnorm: 0.43 [2 days, 10:12:43<14:44:48] +[titan] 2025-09-10 09:49:43,581 - root - INFO - step: 31920 loss: 2.6161 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.7146 global_avg_top_loss: 1.9015 +[titan] 2025-09-10 09:49:43,581 - root - INFO - lr: 3.7865e-06 gnorm: 0.44 [2 days, 10:13:15<14:44:15] +[titan] 2025-09-10 09:50:15,710 - root - INFO - step: 31925 loss: 2.6865 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.7477 global_avg_top_loss: 1.9388 +[titan] 2025-09-10 09:50:15,710 - root - INFO - lr: 3.7844e-06 gnorm: 0.46 [2 days, 10:13:47<14:43:42] +[titan] 2025-09-10 09:50:47,723 - root - INFO - step: 31930 loss: 2.6313 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7262 global_avg_top_loss: 1.9051 +[titan] 
2025-09-10 09:50:47,723 - root - INFO - lr: 3.7822e-06 gnorm: 0.47 [2 days, 10:14:19<14:43:09] +[titan] 2025-09-10 09:51:19,792 - root - INFO - step: 31935 loss: 2.6076 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7146 global_avg_top_loss: 1.8931 +[titan] 2025-09-10 09:51:19,792 - root - INFO - lr: 3.7801e-06 gnorm: 0.48 [2 days, 10:14:51<14:42:36] +[titan] 2025-09-10 09:51:51,754 - root - INFO - step: 31940 loss: 2.6449 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7275 global_avg_top_loss: 1.9174 +[titan] 2025-09-10 09:51:51,754 - root - INFO - lr: 3.7780e-06 gnorm: 0.49 [2 days, 10:15:23<14:42:03] +[titan] 2025-09-10 09:52:23,949 - root - INFO - step: 31945 loss: 2.6527 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7305 global_avg_top_loss: 1.9222 +[titan] 2025-09-10 09:52:23,949 - root - INFO - lr: 3.7758e-06 gnorm: 0.50 [2 days, 10:15:55<14:41:30] +[titan] 2025-09-10 09:52:49,388 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:52:55,773 - root - INFO - step: 31950 loss: 2.4541 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.73 mfu: 49.62% global_avg_ntp_loss: 0.6419 global_avg_top_loss: 1.8122 +[titan] 2025-09-10 09:52:55,774 - root - INFO - lr: 3.7737e-06 gnorm: 0.45 [2 days, 10:16:27<14:40:57] +[titan] 2025-09-10 09:53:27,800 - root - INFO - step: 31955 loss: 2.5271 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.31% global_avg_ntp_loss: 0.6699 global_avg_top_loss: 1.8573 +[titan] 2025-09-10 09:53:27,800 - root - INFO - lr: 3.7716e-06 gnorm: 0.53 [2 days, 10:16:59<14:40:24] +[titan] 2025-09-10 09:53:59,825 - root - INFO - step: 31960 loss: 3.1523 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 1.0131 global_avg_top_loss: 2.1393 +[titan] 2025-09-10 09:53:59,825 - root - INFO - lr: 3.7695e-06 gnorm: 0.45 [2 days, 10:17:31<14:39:51] +[titan] 2025-09-10 09:54:32,148 - root - INFO - step: 31965 loss: 2.6476 memory: 122.03GiB(87.57%) tps: 10,138 tflops: 483.15 mfu: 48.85% global_avg_ntp_loss: 0.7325 global_avg_top_loss: 1.9151 +[titan] 2025-09-10 09:54:32,149 - root - INFO - lr: 3.7673e-06 gnorm: 0.51 [2 days, 10:18:03<14:39:18] +[titan] 2025-09-10 09:55:03,890 - root - INFO - step: 31970 loss: 2.6402 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 492.01 mfu: 49.75% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.9149 +[titan] 2025-09-10 09:55:03,890 - root - INFO - lr: 3.7652e-06 gnorm: 0.49 [2 days, 10:18:35<14:38:45] +[titan] 2025-09-10 09:55:36,294 - root - INFO - step: 31975 loss: 2.5637 memory: 122.03GiB(87.57%) tps: 10,113 tflops: 481.96 mfu: 48.73% global_avg_ntp_loss: 0.6892 global_avg_top_loss: 1.8746 +[titan] 2025-09-10 09:55:36,294 - root - INFO - lr: 3.7631e-06 gnorm: 0.45 [2 days, 10:19:07<14:38:12] +[titan] 2025-09-10 09:56:08,160 - root - INFO - step: 31980 loss: 2.5927 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7052 global_avg_top_loss: 1.8875 +[titan] 
2025-09-10 09:56:08,160 - root - INFO - lr: 3.7610e-06 gnorm: 0.47 [2 days, 10:19:39<14:37:39] +[titan] 2025-09-10 09:56:40,251 - root - INFO - step: 31985 loss: 2.7264 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9588 +[titan] 2025-09-10 09:56:40,251 - root - INFO - lr: 3.7588e-06 gnorm: 0.48 [2 days, 10:20:11<14:37:06] +[titan] 2025-09-10 09:57:12,060 - root - INFO - step: 31990 loss: 2.8388 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.8364 global_avg_top_loss: 2.0024 +[titan] 2025-09-10 09:57:12,061 - root - INFO - lr: 3.7567e-06 gnorm: 0.51 [2 days, 10:20:43<14:36:32] +[titan] 2025-09-10 09:57:44,085 - root - INFO - step: 31995 loss: 2.6385 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7299 global_avg_top_loss: 1.9085 +[titan] 2025-09-10 09:57:44,085 - root - INFO - lr: 3.7546e-06 gnorm: 0.43 [2 days, 10:21:15<14:35:59] +[titan] 2025-09-10 09:58:09,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 09:58:15,962 - root - INFO - step: 32000 loss: 2.6064 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7104 global_avg_top_loss: 1.8960 +[titan] 2025-09-10 09:58:15,963 - root - INFO - lr: 3.7525e-06 gnorm: 0.46 [2 days, 10:21:47<14:35:26] +[titan] 2025-09-10 09:58:47,800 - root - INFO - step: 32005 loss: 2.6175 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.7182 global_avg_top_loss: 1.8993 +[titan] 2025-09-10 09:58:47,800 - root - INFO - lr: 3.7504e-06 gnorm: 0.44 [2 days, 10:22:19<14:34:53] +[titan] 2025-09-10 09:59:20,124 - root - INFO - step: 32010 loss: 2.6239 memory: 122.03GiB(87.57%) tps: 10,138 tflops: 483.15 mfu: 48.85% global_avg_ntp_loss: 0.7204 global_avg_top_loss: 1.9035 +[titan] 2025-09-10 09:59:20,124 - root - INFO - lr: 3.7483e-06 gnorm: 0.46 [2 days, 10:22:51<14:34:20] +[titan] 2025-09-10 09:59:52,017 - root - INFO - step: 32015 loss: 2.5668 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7020 global_avg_top_loss: 1.8648 +[titan] 2025-09-10 09:59:52,017 - root - INFO - lr: 3.7461e-06 gnorm: 0.47 [2 days, 10:23:23<14:33:47] +[titan] 2025-09-10 10:00:24,054 - root - INFO - step: 32020 loss: 2.6366 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.9150 +[titan] 2025-09-10 10:00:24,054 - root - INFO - lr: 3.7440e-06 gnorm: 0.46 [2 days, 10:23:55<14:33:14] +[titan] 2025-09-10 10:00:56,211 - root - INFO - step: 32025 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.65 mfu: 49.11% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9532 +[titan] 2025-09-10 10:00:56,211 - root - INFO - lr: 3.7419e-06 gnorm: 0.44 [2 days, 10:24:27<14:32:41] +[titan] 2025-09-10 10:01:28,098 - root - INFO - step: 32030 loss: 2.4934 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6595 global_avg_top_loss: 1.8339 +[titan] 
2025-09-10 10:01:28,098 - root - INFO - lr: 3.7398e-06 gnorm: 0.45 [2 days, 10:24:59<14:32:08] +[titan] 2025-09-10 10:02:00,357 - root - INFO - step: 32035 loss: 2.5542 memory: 122.03GiB(87.57%) tps: 10,158 tflops: 484.12 mfu: 48.95% global_avg_ntp_loss: 0.6807 global_avg_top_loss: 1.8734 +[titan] 2025-09-10 10:02:00,357 - root - INFO - lr: 3.7377e-06 gnorm: 0.55 [2 days, 10:25:31<14:31:35] +[titan] 2025-09-10 10:02:32,251 - root - INFO - step: 32040 loss: 2.4861 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.6598 global_avg_top_loss: 1.8263 +[titan] 2025-09-10 10:02:32,252 - root - INFO - lr: 3.7356e-06 gnorm: 0.44 [2 days, 10:26:03<14:31:02] +[titan] 2025-09-10 10:03:04,348 - root - INFO - step: 32045 loss: 2.4548 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.6419 global_avg_top_loss: 1.8130 +[titan] 2025-09-10 10:03:04,348 - root - INFO - lr: 3.7335e-06 gnorm: 0.51 [2 days, 10:26:35<14:30:29] +[titan] 2025-09-10 10:03:29,963 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:03:36,311 - root - INFO - step: 32050 loss: 2.6483 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7321 global_avg_top_loss: 1.9163 +[titan] 2025-09-10 10:03:36,311 - root - INFO - lr: 3.7314e-06 gnorm: 0.47 [2 days, 10:27:07<14:29:56] +[titan] 2025-09-10 10:04:08,217 - root - INFO - step: 32055 loss: 2.6043 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7113 global_avg_top_loss: 1.8930 +[titan] 2025-09-10 10:04:08,217 - root - INFO - lr: 3.7293e-06 gnorm: 0.47 [2 days, 10:27:39<14:29:23] +[titan] 2025-09-10 10:04:40,295 - root - INFO - step: 32060 loss: 2.6456 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.7281 global_avg_top_loss: 1.9175 +[titan] 2025-09-10 10:04:40,295 - root - INFO - lr: 3.7272e-06 gnorm: 0.49 [2 days, 10:28:11<14:28:50] +[titan] 2025-09-10 10:05:12,400 - root - INFO - step: 32065 loss: 2.6251 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.9034 +[titan] 2025-09-10 10:05:12,401 - root - INFO - lr: 3.7251e-06 gnorm: 0.46 [2 days, 10:28:44<14:28:17] +[titan] 2025-09-10 10:05:44,289 - root - INFO - step: 32070 loss: 2.6658 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.7396 global_avg_top_loss: 1.9262 +[titan] 2025-09-10 10:05:44,290 - root - INFO - lr: 3.7230e-06 gnorm: 0.48 [2 days, 10:29:15<14:27:44] +[titan] 2025-09-10 10:06:16,123 - root - INFO - step: 32075 loss: 2.6340 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7244 global_avg_top_loss: 1.9095 +[titan] 2025-09-10 10:06:16,123 - root - INFO - lr: 3.7209e-06 gnorm: 0.46 [2 days, 10:29:47<14:27:11] +[titan] 2025-09-10 10:06:48,151 - root - INFO - step: 32080 loss: 2.6320 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7232 global_avg_top_loss: 1.9089 +[titan] 
2025-09-10 10:06:48,152 - root - INFO - lr: 3.7188e-06 gnorm: 0.46 [2 days, 10:30:19<14:26:38] +[titan] 2025-09-10 10:07:20,131 - root - INFO - step: 32085 loss: 2.6015 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7099 global_avg_top_loss: 1.8916 +[titan] 2025-09-10 10:07:20,131 - root - INFO - lr: 3.7167e-06 gnorm: 0.44 [2 days, 10:30:51<14:26:05] +[titan] 2025-09-10 10:07:51,910 - root - INFO - step: 32090 loss: 2.6806 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7446 global_avg_top_loss: 1.9360 +[titan] 2025-09-10 10:07:51,911 - root - INFO - lr: 3.7146e-06 gnorm: 0.46 [2 days, 10:31:23<14:25:32] +[titan] 2025-09-10 10:08:23,759 - root - INFO - step: 32095 loss: 2.6440 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.7257 global_avg_top_loss: 1.9182 +[titan] 2025-09-10 10:08:23,760 - root - INFO - lr: 3.7125e-06 gnorm: 0.48 [2 days, 10:31:55<14:24:59] +[titan] 2025-09-10 10:08:49,331 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:08:55,869 - root - INFO - step: 32100 loss: 2.6944 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9433 +[titan] 2025-09-10 10:08:55,869 - root - INFO - lr: 3.7104e-06 gnorm: 0.51 [2 days, 10:32:27<14:24:26] +[titan] 2025-09-10 10:09:27,729 - root - INFO - step: 32105 loss: 2.4859 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.6576 global_avg_top_loss: 1.8283 +[titan] 2025-09-10 10:09:27,729 - root - INFO - lr: 3.7083e-06 gnorm: 0.47 [2 days, 10:32:59<14:23:53] +[titan] 2025-09-10 10:09:59,722 - root - INFO - step: 32110 loss: 2.4638 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.6420 global_avg_top_loss: 1.8219 +[titan] 2025-09-10 10:09:59,723 - root - INFO - lr: 3.7062e-06 gnorm: 0.45 [2 days, 10:33:31<14:23:20] +[titan] 2025-09-10 10:10:31,633 - root - INFO - step: 32115 loss: 2.4832 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.6518 global_avg_top_loss: 1.8313 +[titan] 2025-09-10 10:10:31,633 - root - INFO - lr: 3.7041e-06 gnorm: 0.50 [2 days, 10:34:03<14:22:47] +[titan] 2025-09-10 10:11:03,714 - root - INFO - step: 32120 loss: 2.5959 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7059 global_avg_top_loss: 1.8900 +[titan] 2025-09-10 10:11:03,714 - root - INFO - lr: 3.7020e-06 gnorm: 0.47 [2 days, 10:34:35<14:22:14] +[titan] 2025-09-10 10:11:35,668 - root - INFO - step: 32125 loss: 2.6263 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7205 global_avg_top_loss: 1.9058 +[titan] 2025-09-10 10:11:35,669 - root - INFO - lr: 3.6999e-06 gnorm: 0.49 [2 days, 10:35:07<14:21:41] +[titan] 2025-09-10 10:12:07,867 - root - INFO - step: 32130 loss: 2.5800 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.6981 global_avg_top_loss: 1.8819 +[titan] 
2025-09-10 10:12:07,867 - root - INFO - lr: 3.6978e-06 gnorm: 0.46 [2 days, 10:35:39<14:21:08] +[titan] 2025-09-10 10:12:39,737 - root - INFO - step: 32135 loss: 2.9519 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.9104 global_avg_top_loss: 2.0415 +[titan] 2025-09-10 10:12:39,737 - root - INFO - lr: 3.6958e-06 gnorm: 0.45 [2 days, 10:36:11<14:20:34] +[titan] 2025-09-10 10:13:11,688 - root - INFO - step: 32140 loss: 2.7938 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.8026 global_avg_top_loss: 1.9912 +[titan] 2025-09-10 10:13:11,689 - root - INFO - lr: 3.6937e-06 gnorm: 0.47 [2 days, 10:36:43<14:20:01] +[titan] 2025-09-10 10:13:43,575 - root - INFO - step: 32145 loss: 2.6227 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.7157 global_avg_top_loss: 1.9070 +[titan] 2025-09-10 10:13:43,575 - root - INFO - lr: 3.6916e-06 gnorm: 0.45 [2 days, 10:37:15<14:19:28] +[titan] 2025-09-10 10:14:09,208 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:14:15,741 - root - INFO - step: 32150 loss: 2.6666 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.7379 global_avg_top_loss: 1.9287 +[titan] 2025-09-10 10:14:15,742 - root - INFO - lr: 3.6895e-06 gnorm: 0.49 [2 days, 10:37:47<14:18:55] +[titan] 2025-09-10 10:14:47,650 - root - INFO - step: 32155 loss: 2.5876 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7061 global_avg_top_loss: 1.8814 +[titan] 2025-09-10 10:14:47,651 - root - INFO - lr: 3.6874e-06 gnorm: 0.46 [2 days, 10:38:19<14:18:22] +[titan] 2025-09-10 10:15:19,600 - root - INFO - step: 32160 loss: 2.7056 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9494 +[titan] 2025-09-10 10:15:19,600 - root - INFO - lr: 3.6853e-06 gnorm: 0.46 [2 days, 10:38:51<14:17:49] +[titan] 2025-09-10 10:15:51,554 - root - INFO - step: 32165 loss: 2.6492 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7315 global_avg_top_loss: 1.9177 +[titan] 2025-09-10 10:15:51,555 - root - INFO - lr: 3.6833e-06 gnorm: 0.45 [2 days, 10:39:23<14:17:16] +[titan] 2025-09-10 10:16:23,550 - root - INFO - step: 32170 loss: 3.0529 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.9649 global_avg_top_loss: 2.0880 +[titan] 2025-09-10 10:16:23,550 - root - INFO - lr: 3.6812e-06 gnorm: 0.47 [2 days, 10:39:55<14:16:43] +[titan] 2025-09-10 10:16:55,593 - root - INFO - step: 32175 loss: 2.5625 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8691 +[titan] 2025-09-10 10:16:55,593 - root - INFO - lr: 3.6791e-06 gnorm: 0.50 [2 days, 10:40:27<14:16:10] +[titan] 2025-09-10 10:17:27,518 - root - INFO - step: 32180 loss: 2.6370 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7247 global_avg_top_loss: 1.9124 +[titan] 
2025-09-10 10:17:27,518 - root - INFO - lr: 3.6770e-06 gnorm: 0.47 [2 days, 10:40:59<14:15:37] +[titan] 2025-09-10 10:17:59,542 - root - INFO - step: 32185 loss: 2.5771 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.6979 global_avg_top_loss: 1.8792 +[titan] 2025-09-10 10:17:59,542 - root - INFO - lr: 3.6750e-06 gnorm: 0.46 [2 days, 10:41:31<14:15:04] +[titan] 2025-09-10 10:18:31,458 - root - INFO - step: 32190 loss: 2.5342 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.6779 global_avg_top_loss: 1.8562 +[titan] 2025-09-10 10:18:31,458 - root - INFO - lr: 3.6729e-06 gnorm: 0.46 [2 days, 10:42:03<14:14:31] +[titan] 2025-09-10 10:19:03,324 - root - INFO - step: 32195 loss: 2.5811 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.6940 global_avg_top_loss: 1.8870 +[titan] 2025-09-10 10:19:03,325 - root - INFO - lr: 3.6708e-06 gnorm: 0.57 [2 days, 10:42:34<14:13:58] +[titan] 2025-09-10 10:19:28,977 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:19:35,329 - root - INFO - step: 32200 loss: 2.5918 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.7037 global_avg_top_loss: 1.8881 +[titan] 2025-09-10 10:19:35,330 - root - INFO - lr: 3.6687e-06 gnorm: 0.47 [2 days, 10:43:06<14:13:25] +[titan] 2025-09-10 10:20:07,376 - root - INFO - step: 32205 loss: 2.6026 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7085 global_avg_top_loss: 1.8941 +[titan] 2025-09-10 10:20:07,376 - root - INFO - lr: 3.6667e-06 gnorm: 0.48 [2 days, 10:43:38<14:12:52] +[titan] 2025-09-10 10:20:39,422 - root - INFO - step: 32210 loss: 2.4993 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.6627 global_avg_top_loss: 1.8366 +[titan] 2025-09-10 10:20:39,422 - root - INFO - lr: 3.6646e-06 gnorm: 0.46 [2 days, 10:44:11<14:12:19] +[titan] 2025-09-10 10:21:11,616 - root - INFO - step: 32215 loss: 2.5446 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.10 mfu: 49.05% global_avg_ntp_loss: 0.6841 global_avg_top_loss: 1.8605 +[titan] 2025-09-10 10:21:11,616 - root - INFO - lr: 3.6625e-06 gnorm: 0.43 [2 days, 10:44:43<14:11:46] +[titan] 2025-09-10 10:21:43,818 - root - INFO - step: 32220 loss: 2.6647 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.98 mfu: 49.04% global_avg_ntp_loss: 0.7387 global_avg_top_loss: 1.9260 +[titan] 2025-09-10 10:21:43,818 - root - INFO - lr: 3.6605e-06 gnorm: 0.46 [2 days, 10:45:15<14:11:13] +[titan] 2025-09-10 10:22:15,854 - root - INFO - step: 32225 loss: 2.5842 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7070 global_avg_top_loss: 1.8772 +[titan] 2025-09-10 10:22:15,854 - root - INFO - lr: 3.6584e-06 gnorm: 0.46 [2 days, 10:45:47<14:10:40] +[titan] 2025-09-10 10:22:47,978 - root - INFO - step: 32230 loss: 2.6027 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.14 mfu: 49.16% global_avg_ntp_loss: 0.7092 global_avg_top_loss: 1.8935 +[titan] 
2025-09-10 10:22:47,979 - root - INFO - lr: 3.6563e-06 gnorm: 0.48 [2 days, 10:46:19<14:10:07] +[titan] 2025-09-10 10:23:19,755 - root - INFO - step: 32235 loss: 2.5832 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.47 mfu: 49.69% global_avg_ntp_loss: 0.7027 global_avg_top_loss: 1.8805 +[titan] 2025-09-10 10:23:19,755 - root - INFO - lr: 3.6543e-06 gnorm: 0.44 [2 days, 10:46:51<14:09:34] +[titan] 2025-09-10 10:23:51,810 - root - INFO - step: 32240 loss: 2.5744 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.6930 global_avg_top_loss: 1.8814 +[titan] 2025-09-10 10:23:51,810 - root - INFO - lr: 3.6522e-06 gnorm: 0.45 [2 days, 10:47:23<14:09:01] +[titan] 2025-09-10 10:24:23,968 - root - INFO - step: 32245 loss: 2.6403 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.65 mfu: 49.10% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9109 +[titan] 2025-09-10 10:24:23,968 - root - INFO - lr: 3.6501e-06 gnorm: 0.45 [2 days, 10:47:55<14:08:28] +[titan] 2025-09-10 10:24:49,390 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:24:55,886 - root - INFO - step: 32250 loss: 2.6225 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7191 global_avg_top_loss: 1.9033 +[titan] 2025-09-10 10:24:55,886 - root - INFO - lr: 3.6481e-06 gnorm: 0.45 [2 days, 10:48:27<14:07:55] +[titan] 2025-09-10 10:25:27,784 - root - INFO - step: 32255 loss: 2.5577 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.6910 global_avg_top_loss: 1.8667 +[titan] 2025-09-10 10:25:27,784 - root - INFO - lr: 3.6460e-06 gnorm: 0.47 [2 days, 10:48:59<14:07:22] +[titan] 2025-09-10 10:25:34,557 - root - INFO - Dumping profiler traces at step 32256 +[titan] 2025-09-10 10:25:34,629 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 10:26:00,070 - root - INFO - step: 32260 loss: 2.6672 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.71 mfu: 48.91% global_avg_ntp_loss: 0.7418 global_avg_top_loss: 1.9254 +[titan] 2025-09-10 10:26:00,071 - root - INFO - lr: 3.6440e-06 gnorm: 0.48 [2 days, 10:49:31<14:06:49] +[titan] 2025-09-10 10:26:31,839 - root - INFO - step: 32265 loss: 2.4402 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.60 mfu: 49.71% global_avg_ntp_loss: 0.6368 global_avg_top_loss: 1.8034 +[titan] 2025-09-10 10:26:31,839 - root - INFO - lr: 3.6419e-06 gnorm: 0.46 [2 days, 10:50:03<14:06:16] +[titan] 2025-09-10 10:27:03,850 - root - INFO - step: 32270 loss: 2.4830 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.6537 global_avg_top_loss: 1.8294 +[titan] 2025-09-10 10:27:03,850 - root - INFO - lr: 3.6399e-06 gnorm: 0.48 [2 days, 10:50:35<14:05:43] +[titan] 2025-09-10 10:27:35,780 - root - INFO - step: 32275 loss: 2.5800 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 0.6940 global_avg_top_loss: 1.8860 +[titan] 2025-09-10 10:27:35,780 - root - INFO - lr: 3.6378e-06 gnorm: 0.56 [2 days, 10:51:07<14:05:10] +[titan] 2025-09-10 
10:28:07,684 - root - INFO - step: 32280 loss: 2.5956 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7035 global_avg_top_loss: 1.8921 +[titan] 2025-09-10 10:28:07,684 - root - INFO - lr: 3.6358e-06 gnorm: 0.48 [2 days, 10:51:39<14:04:37] +[titan] 2025-09-10 10:28:39,544 - root - INFO - step: 32285 loss: 2.5992 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7051 global_avg_top_loss: 1.8942 +[titan] 2025-09-10 10:28:39,545 - root - INFO - lr: 3.6337e-06 gnorm: 0.49 [2 days, 10:52:11<14:04:04] +[titan] 2025-09-10 10:29:11,472 - root - INFO - step: 32290 loss: 2.5988 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.7050 global_avg_top_loss: 1.8938 +[titan] 2025-09-10 10:29:11,472 - root - INFO - lr: 3.6316e-06 gnorm: 0.49 [2 days, 10:52:43<14:03:31] +[titan] 2025-09-10 10:29:43,404 - root - INFO - step: 32295 loss: 2.4862 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.6582 global_avg_top_loss: 1.8280 +[titan] 2025-09-10 10:29:43,405 - root - INFO - lr: 3.6296e-06 gnorm: 0.45 [2 days, 10:53:15<14:02:58] +[titan] 2025-09-10 10:30:08,988 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:30:15,324 - root - INFO - step: 32300 loss: 2.6960 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7528 global_avg_top_loss: 1.9433 +[titan] 2025-09-10 10:30:15,324 - root - INFO - lr: 3.6276e-06 gnorm: 0.46 [2 days, 10:53:46<14:02:25] +[titan] 2025-09-10 10:30:47,220 - root - INFO - step: 32305 loss: 2.6448 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.7286 global_avg_top_loss: 1.9162 +[titan] 2025-09-10 10:30:47,220 - root - INFO - lr: 3.6255e-06 gnorm: 0.50 [2 days, 10:54:18<14:01:52] +[titan] 2025-09-10 10:31:19,359 - root - INFO - step: 32310 loss: 3.0303 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.92 mfu: 49.13% global_avg_ntp_loss: 0.9587 global_avg_top_loss: 2.0715 +[titan] 2025-09-10 10:31:19,360 - root - INFO - lr: 3.6235e-06 gnorm: 0.50 [2 days, 10:54:50<14:01:19] +[titan] 2025-09-10 10:31:51,347 - root - INFO - step: 32315 loss: 2.6080 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7117 global_avg_top_loss: 1.8963 +[titan] 2025-09-10 10:31:51,347 - root - INFO - lr: 3.6214e-06 gnorm: 0.45 [2 days, 10:55:22<14:00:46] +[titan] 2025-09-10 10:32:23,407 - root - INFO - step: 32320 loss: 2.6781 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9365 +[titan] 2025-09-10 10:32:23,408 - root - INFO - lr: 3.6194e-06 gnorm: 0.47 [2 days, 10:55:55<14:00:13] +[titan] 2025-09-10 10:32:55,385 - root - INFO - step: 32325 loss: 2.6301 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.9085 +[titan] 2025-09-10 10:32:55,386 - root - INFO - lr: 3.6173e-06 gnorm: 0.45 [2 days, 10:56:26<13:59:40] +[titan] 2025-09-10 10:33:27,389 - root - INFO - step: 32330 loss: 2.6844 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7463 global_avg_top_loss: 1.9380 +[titan] 
2025-09-10 10:33:27,389 - root - INFO - lr: 3.6153e-06 gnorm: 0.46 [2 days, 10:56:58<13:59:07] +[titan] 2025-09-10 10:33:59,292 - root - INFO - step: 32335 loss: 2.5819 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.6984 global_avg_top_loss: 1.8835 +[titan] 2025-09-10 10:33:59,293 - root - INFO - lr: 3.6132e-06 gnorm: 0.49 [2 days, 10:57:30<13:58:33] +[titan] 2025-09-10 10:34:31,158 - root - INFO - step: 32340 loss: 2.6401 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7256 global_avg_top_loss: 1.9145 +[titan] 2025-09-10 10:34:31,158 - root - INFO - lr: 3.6112e-06 gnorm: 0.50 [2 days, 10:58:02<13:58:00] +[titan] 2025-09-10 10:35:03,018 - root - INFO - step: 32345 loss: 2.5226 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.6749 global_avg_top_loss: 1.8477 +[titan] 2025-09-10 10:35:03,018 - root - INFO - lr: 3.6092e-06 gnorm: 0.45 [2 days, 10:58:34<13:57:27] +[titan] 2025-09-10 10:35:28,592 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:35:35,059 - root - INFO - step: 32350 loss: 2.5082 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.6634 global_avg_top_loss: 1.8449 +[titan] 2025-09-10 10:35:35,059 - root - INFO - lr: 3.6071e-06 gnorm: 0.47 [2 days, 10:59:06<13:56:54] +[titan] 2025-09-10 10:36:07,040 - root - INFO - step: 32355 loss: 2.5323 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.6761 global_avg_top_loss: 1.8562 +[titan] 2025-09-10 10:36:07,040 - root - INFO - lr: 3.6051e-06 gnorm: 0.56 [2 days, 10:59:38<13:56:21] +[titan] 2025-09-10 10:36:39,113 - root - INFO - step: 32360 loss: 2.7161 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7753 global_avg_top_loss: 1.9407 +[titan] 2025-09-10 10:36:39,113 - root - INFO - lr: 3.6031e-06 gnorm: 0.47 [2 days, 11:00:10<13:55:48] +[titan] 2025-09-10 10:37:11,136 - root - INFO - step: 32365 loss: 2.6720 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7342 global_avg_top_loss: 1.9378 +[titan] 2025-09-10 10:37:11,136 - root - INFO - lr: 3.6010e-06 gnorm: 0.49 [2 days, 11:00:42<13:55:15] +[titan] 2025-09-10 10:37:43,097 - root - INFO - step: 32370 loss: 2.6422 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7271 global_avg_top_loss: 1.9152 +[titan] 2025-09-10 10:37:43,097 - root - INFO - lr: 3.5990e-06 gnorm: 0.48 [2 days, 11:01:14<13:54:42] +[titan] 2025-09-10 10:38:15,009 - root - INFO - step: 32375 loss: 2.4889 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.6591 global_avg_top_loss: 1.8298 +[titan] 2025-09-10 10:38:15,009 - root - INFO - lr: 3.5970e-06 gnorm: 0.46 [2 days, 11:01:46<13:54:09] +[titan] 2025-09-10 10:38:46,944 - root - INFO - step: 32380 loss: 2.6203 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7187 global_avg_top_loss: 1.9016 +[titan] 
2025-09-10 10:38:46,944 - root - INFO - lr: 3.5949e-06 gnorm: 0.47 [2 days, 11:02:18<13:53:36] +[titan] 2025-09-10 10:39:18,844 - root - INFO - step: 32385 loss: 2.5842 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.7042 global_avg_top_loss: 1.8800 +[titan] 2025-09-10 10:39:18,845 - root - INFO - lr: 3.5929e-06 gnorm: 1.84 [2 days, 11:02:50<13:53:03] +[titan] 2025-09-10 10:39:50,877 - root - INFO - step: 32390 loss: 3.1069 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.9897 global_avg_top_loss: 2.1172 +[titan] 2025-09-10 10:39:50,877 - root - INFO - lr: 3.5909e-06 gnorm: 0.46 [2 days, 11:03:22<13:52:30] +[titan] 2025-09-10 10:40:23,094 - root - INFO - step: 32395 loss: 2.6002 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.74 mfu: 49.01% global_avg_ntp_loss: 0.7064 global_avg_top_loss: 1.8937 +[titan] 2025-09-10 10:40:23,095 - root - INFO - lr: 3.5889e-06 gnorm: 0.46 [2 days, 11:03:54<13:51:57] +[titan] 2025-09-10 10:40:48,765 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:40:55,123 - root - INFO - step: 32400 loss: 2.6850 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7401 global_avg_top_loss: 1.9449 +[titan] 2025-09-10 10:40:55,124 - root - INFO - lr: 3.5868e-06 gnorm: 0.52 [2 days, 11:04:26<13:51:24] +[titan] 2025-09-10 10:41:26,929 - root - INFO - step: 32405 loss: 2.5992 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7081 global_avg_top_loss: 1.8911 +[titan] 2025-09-10 10:41:26,929 - root - INFO - lr: 3.5848e-06 gnorm: 0.46 [2 days, 11:04:58<13:50:51] +[titan] 2025-09-10 10:41:58,963 - root - INFO - step: 32410 loss: 2.5319 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.6769 global_avg_top_loss: 1.8550 +[titan] 2025-09-10 10:41:58,963 - root - INFO - lr: 3.5828e-06 gnorm: 0.45 [2 days, 11:05:30<13:50:18] +[titan] 2025-09-10 10:42:30,919 - root - INFO - step: 32415 loss: 2.6065 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7103 global_avg_top_loss: 1.8962 +[titan] 2025-09-10 10:42:30,919 - root - INFO - lr: 3.5808e-06 gnorm: 0.48 [2 days, 11:06:02<13:49:45] +[titan] 2025-09-10 10:43:03,014 - root - INFO - step: 32420 loss: 2.5882 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7028 global_avg_top_loss: 1.8854 +[titan] 2025-09-10 10:43:03,014 - root - INFO - lr: 3.5787e-06 gnorm: 0.53 [2 days, 11:06:34<13:49:12] +[titan] 2025-09-10 10:43:34,930 - root - INFO - step: 32425 loss: 2.6165 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.7126 global_avg_top_loss: 1.9039 +[titan] 2025-09-10 10:43:34,930 - root - INFO - lr: 3.5767e-06 gnorm: 0.47 [2 days, 11:07:06<13:48:39] +[titan] 2025-09-10 10:44:06,875 - root - INFO - step: 32430 loss: 2.5410 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.6826 global_avg_top_loss: 1.8584 +[titan] 
2025-09-10 10:44:06,875 - root - INFO - lr: 3.5747e-06 gnorm: 0.50 [2 days, 11:07:38<13:48:06] +[titan] 2025-09-10 10:44:39,028 - root - INFO - step: 32435 loss: 2.5435 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.6784 global_avg_top_loss: 1.8651 +[titan] 2025-09-10 10:44:39,028 - root - INFO - lr: 3.5727e-06 gnorm: 0.58 [2 days, 11:08:10<13:47:33] +[titan] 2025-09-10 10:45:11,036 - root - INFO - step: 32440 loss: 2.5355 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.6776 global_avg_top_loss: 1.8579 +[titan] 2025-09-10 10:45:11,036 - root - INFO - lr: 3.5707e-06 gnorm: 0.47 [2 days, 11:08:42<13:47:00] +[titan] 2025-09-10 10:45:43,193 - root - INFO - step: 32445 loss: 2.4780 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.6518 global_avg_top_loss: 1.8262 +[titan] 2025-09-10 10:45:43,193 - root - INFO - lr: 3.5687e-06 gnorm: 0.49 [2 days, 11:09:14<13:46:27] +[titan] 2025-09-10 10:46:09,050 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:46:15,378 - root - INFO - step: 32450 loss: 2.6245 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.9079 +[titan] 2025-09-10 10:46:15,378 - root - INFO - lr: 3.5666e-06 gnorm: 0.49 [2 days, 11:09:46<13:45:54] +[titan] 2025-09-10 10:46:47,117 - root - INFO - step: 32455 loss: 2.9816 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.05 mfu: 49.75% global_avg_ntp_loss: 0.9307 global_avg_top_loss: 2.0509 +[titan] 2025-09-10 10:46:47,117 - root - INFO - lr: 3.5646e-06 gnorm: 0.47 [2 days, 11:10:18<13:45:21] +[titan] 2025-09-10 10:47:19,110 - root - INFO - step: 32460 loss: 2.6009 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7083 global_avg_top_loss: 1.8926 +[titan] 2025-09-10 10:47:19,110 - root - INFO - lr: 3.5626e-06 gnorm: 0.48 [2 days, 11:10:50<13:44:48] +[titan] 2025-09-10 10:47:51,295 - root - INFO - step: 32465 loss: 2.8401 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.8484 global_avg_top_loss: 1.9917 +[titan] 2025-09-10 10:47:51,295 - root - INFO - lr: 3.5606e-06 gnorm: 0.50 [2 days, 11:11:22<13:44:15] +[titan] 2025-09-10 10:48:23,156 - root - INFO - step: 32470 loss: 3.6020 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 1.2681 global_avg_top_loss: 2.3338 +[titan] 2025-09-10 10:48:23,156 - root - INFO - lr: 3.5586e-06 gnorm: 0.48 [2 days, 11:11:54<13:43:42] +[titan] 2025-09-10 10:48:55,113 - root - INFO - step: 32475 loss: 2.6024 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7099 global_avg_top_loss: 1.8925 +[titan] 2025-09-10 10:48:55,114 - root - INFO - lr: 3.5566e-06 gnorm: 0.46 [2 days, 11:12:26<13:43:09] +[titan] 2025-09-10 10:49:27,041 - root - INFO - step: 32480 loss: 2.6031 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7095 global_avg_top_loss: 1.8935 +[titan] 
2025-09-10 10:49:27,042 - root - INFO - lr: 3.5546e-06 gnorm: 0.49 [2 days, 11:12:58<13:42:36] +[titan] 2025-09-10 10:49:59,135 - root - INFO - step: 32485 loss: 2.6311 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.62 mfu: 49.20% global_avg_ntp_loss: 0.7210 global_avg_top_loss: 1.9102 +[titan] 2025-09-10 10:49:59,135 - root - INFO - lr: 3.5526e-06 gnorm: 0.47 [2 days, 11:13:30<13:42:03] +[titan] 2025-09-10 10:50:31,036 - root - INFO - step: 32490 loss: 2.6536 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7349 global_avg_top_loss: 1.9187 +[titan] 2025-09-10 10:50:31,037 - root - INFO - lr: 3.5506e-06 gnorm: 0.45 [2 days, 11:14:02<13:41:30] +[titan] 2025-09-10 10:51:03,054 - root - INFO - step: 32495 loss: 2.6837 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.7454 global_avg_top_loss: 1.9383 +[titan] 2025-09-10 10:51:03,054 - root - INFO - lr: 3.5486e-06 gnorm: 0.51 [2 days, 11:14:34<13:40:57] +[titan] 2025-09-10 10:51:28,460 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:51:34,893 - root - INFO - step: 32500 loss: 2.5891 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7039 global_avg_top_loss: 1.8852 +[titan] 2025-09-10 10:51:34,893 - root - INFO - lr: 3.5466e-06 gnorm: 0.49 [2 days, 11:15:06<13:40:24] +[titan] 2025-09-10 10:52:06,784 - root - INFO - step: 32505 loss: 2.4579 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.6441 global_avg_top_loss: 1.8138 +[titan] 2025-09-10 10:52:06,784 - root - INFO - lr: 3.5446e-06 gnorm: 0.46 [2 days, 11:15:38<13:39:51] +[titan] 2025-09-10 10:52:38,662 - root - INFO - step: 32510 loss: 2.5451 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.6792 global_avg_top_loss: 1.8660 +[titan] 2025-09-10 10:52:38,662 - root - INFO - lr: 3.5426e-06 gnorm: 0.46 [2 days, 11:16:10<13:39:18] +[titan] 2025-09-10 10:53:10,483 - root - INFO - step: 32515 loss: 2.5564 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.6838 global_avg_top_loss: 1.8726 +[titan] 2025-09-10 10:53:10,483 - root - INFO - lr: 3.5406e-06 gnorm: 0.64 [2 days, 11:16:42<13:38:45] +[titan] 2025-09-10 10:53:42,597 - root - INFO - step: 32520 loss: 2.5582 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 0.6846 global_avg_top_loss: 1.8737 +[titan] 2025-09-10 10:53:42,598 - root - INFO - lr: 3.5386e-06 gnorm: 0.50 [2 days, 11:17:14<13:38:12] +[titan] 2025-09-10 10:54:14,662 - root - INFO - step: 32525 loss: 2.5412 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.6757 global_avg_top_loss: 1.8655 +[titan] 2025-09-10 10:54:14,663 - root - INFO - lr: 3.5366e-06 gnorm: 0.51 [2 days, 11:17:46<13:37:39] +[titan] 2025-09-10 10:54:46,595 - root - INFO - step: 32530 loss: 2.7528 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.7895 global_avg_top_loss: 1.9632 +[titan] 
2025-09-10 10:54:46,595 - root - INFO - lr: 3.5346e-06 gnorm: 0.49 [2 days, 11:18:18<13:37:06] +[titan] 2025-09-10 10:55:18,735 - root - INFO - step: 32535 loss: 2.9839 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.9325 global_avg_top_loss: 2.0514 +[titan] 2025-09-10 10:55:18,735 - root - INFO - lr: 3.5326e-06 gnorm: 0.47 [2 days, 11:18:50<13:36:33] +[titan] 2025-09-10 10:55:50,689 - root - INFO - step: 32540 loss: 2.5895 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7023 global_avg_top_loss: 1.8872 +[titan] 2025-09-10 10:55:50,690 - root - INFO - lr: 3.5306e-06 gnorm: 0.48 [2 days, 11:19:22<13:36:00] +[titan] 2025-09-10 10:56:22,534 - root - INFO - step: 32545 loss: 2.6220 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.7196 global_avg_top_loss: 1.9024 +[titan] 2025-09-10 10:56:22,535 - root - INFO - lr: 3.5286e-06 gnorm: 0.54 [2 days, 11:19:54<13:35:27] +[titan] 2025-09-10 10:56:48,039 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 10:56:54,496 - root - INFO - step: 32550 loss: 2.6622 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7391 global_avg_top_loss: 1.9231 +[titan] 2025-09-10 10:56:54,496 - root - INFO - lr: 3.5266e-06 gnorm: 0.46 [2 days, 11:20:26<13:34:54] +[titan] 2025-09-10 10:57:26,372 - root - INFO - step: 32555 loss: 2.5960 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7086 global_avg_top_loss: 1.8874 +[titan] 2025-09-10 10:57:26,373 - root - INFO - lr: 3.5246e-06 gnorm: 0.46 [2 days, 11:20:57<13:34:21] +[titan] 2025-09-10 10:57:58,380 - root - INFO - step: 32560 loss: 2.6671 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.34% global_avg_ntp_loss: 0.7387 global_avg_top_loss: 1.9284 +[titan] 2025-09-10 10:57:58,380 - root - INFO - lr: 3.5227e-06 gnorm: 0.50 [2 days, 11:21:29<13:33:48] +[titan] 2025-09-10 10:58:30,140 - root - INFO - step: 32565 loss: 2.5631 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.73 mfu: 49.72% global_avg_ntp_loss: 0.6924 global_avg_top_loss: 1.8707 +[titan] 2025-09-10 10:58:30,140 - root - INFO - lr: 3.5207e-06 gnorm: 0.46 [2 days, 11:22:01<13:33:15] +[titan] 2025-09-10 10:59:02,130 - root - INFO - step: 32570 loss: 2.5008 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.6645 global_avg_top_loss: 1.8363 +[titan] 2025-09-10 10:59:02,130 - root - INFO - lr: 3.5187e-06 gnorm: 0.48 [2 days, 11:22:33<13:32:42] +[titan] 2025-09-10 10:59:34,366 - root - INFO - step: 32575 loss: 2.7175 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.46 mfu: 48.99% global_avg_ntp_loss: 0.7623 global_avg_top_loss: 1.9552 +[titan] 2025-09-10 10:59:34,366 - root - INFO - lr: 3.5167e-06 gnorm: 0.51 [2 days, 11:23:05<13:32:09] +[titan] 2025-09-10 11:00:06,020 - root - INFO - step: 32580 loss: 2.6220 memory: 122.03GiB(87.57%) tps: 10,352 tflops: 493.38 mfu: 49.89% global_avg_ntp_loss: 0.7163 global_avg_top_loss: 1.9057 +[titan] 
2025-09-10 11:00:06,020 - root - INFO - lr: 3.5147e-06 gnorm: 0.50 [2 days, 11:23:37<13:31:36] +[titan] 2025-09-10 11:00:38,153 - root - INFO - step: 32585 loss: 2.4719 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.6506 global_avg_top_loss: 1.8213 +[titan] 2025-09-10 11:00:38,153 - root - INFO - lr: 3.5127e-06 gnorm: 0.46 [2 days, 11:24:09<13:31:03] +[titan] 2025-09-10 11:01:09,841 - root - INFO - step: 32590 loss: 2.4728 memory: 122.03GiB(87.57%) tps: 10,341 tflops: 492.85 mfu: 49.83% global_avg_ntp_loss: 0.6498 global_avg_top_loss: 1.8230 +[titan] 2025-09-10 11:01:09,841 - root - INFO - lr: 3.5108e-06 gnorm: 0.47 [2 days, 11:24:41<13:30:30] +[titan] 2025-09-10 11:01:41,842 - root - INFO - step: 32595 loss: 2.4407 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.35% global_avg_ntp_loss: 0.6274 global_avg_top_loss: 1.8133 +[titan] 2025-09-10 11:01:41,842 - root - INFO - lr: 3.5088e-06 gnorm: 0.60 [2 days, 11:25:13<13:29:57] +[titan] 2025-09-10 11:02:07,569 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:02:14,037 - root - INFO - step: 32600 loss: 2.9215 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.9045 global_avg_top_loss: 2.0170 +[titan] 2025-09-10 11:02:14,037 - root - INFO - lr: 3.5068e-06 gnorm: 0.45 [2 days, 11:25:45<13:29:24] +[titan] 2025-09-10 11:02:46,096 - root - INFO - step: 32605 loss: 2.6601 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.26% global_avg_ntp_loss: 0.7431 global_avg_top_loss: 1.9170 +[titan] 2025-09-10 11:02:46,096 - root - INFO - lr: 3.5048e-06 gnorm: 0.52 [2 days, 11:26:17<13:28:51] +[titan] 2025-09-10 11:03:17,841 - root - INFO - step: 32610 loss: 2.6177 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.96 mfu: 49.74% global_avg_ntp_loss: 0.7150 global_avg_top_loss: 1.9027 +[titan] 2025-09-10 11:03:17,841 - root - INFO - lr: 3.5028e-06 gnorm: 0.48 [2 days, 11:26:49<13:28:18] +[titan] 2025-09-10 11:03:49,625 - root - INFO - step: 32615 loss: 2.9550 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.9230 global_avg_top_loss: 2.0320 +[titan] 2025-09-10 11:03:49,625 - root - INFO - lr: 3.5009e-06 gnorm: 0.46 [2 days, 11:27:21<13:27:45] +[titan] 2025-09-10 11:04:21,797 - root - INFO - step: 32620 loss: 2.5854 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.43 mfu: 49.08% global_avg_ntp_loss: 0.7019 global_avg_top_loss: 1.8835 +[titan] 2025-09-10 11:04:21,797 - root - INFO - lr: 3.4989e-06 gnorm: 0.51 [2 days, 11:27:53<13:27:12] +[titan] 2025-09-10 11:04:53,776 - root - INFO - step: 32625 loss: 2.6579 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7330 global_avg_top_loss: 1.9249 +[titan] 2025-09-10 11:04:53,777 - root - INFO - lr: 3.4969e-06 gnorm: 0.51 [2 days, 11:28:25<13:26:39] +[titan] 2025-09-10 11:05:25,803 - root - INFO - step: 32630 loss: 2.6595 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.7339 global_avg_top_loss: 1.9256 +[titan] 
2025-09-10 11:05:25,803 - root - INFO - lr: 3.4950e-06 gnorm: 0.47 [2 days, 11:28:57<13:26:06] +[titan] 2025-09-10 11:05:57,638 - root - INFO - step: 32635 loss: 2.5456 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.56 mfu: 49.60% global_avg_ntp_loss: 0.6820 global_avg_top_loss: 1.8636 +[titan] 2025-09-10 11:05:57,638 - root - INFO - lr: 3.4930e-06 gnorm: 0.48 [2 days, 11:29:29<13:25:33] +[titan] 2025-09-10 11:06:29,705 - root - INFO - step: 32640 loss: 2.6058 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.7098 global_avg_top_loss: 1.8959 +[titan] 2025-09-10 11:06:29,706 - root - INFO - lr: 3.4910e-06 gnorm: 0.46 [2 days, 11:30:01<13:25:00] +[titan] 2025-09-10 11:07:01,673 - root - INFO - step: 32645 loss: 2.6563 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7331 global_avg_top_loss: 1.9232 +[titan] 2025-09-10 11:07:01,673 - root - INFO - lr: 3.4890e-06 gnorm: 0.50 [2 days, 11:30:33<13:24:27] +[titan] 2025-09-10 11:07:27,008 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:07:33,495 - root - INFO - step: 32650 loss: 2.6000 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.76 mfu: 49.62% global_avg_ntp_loss: 0.7105 global_avg_top_loss: 1.8895 +[titan] 2025-09-10 11:07:33,496 - root - INFO - lr: 3.4871e-06 gnorm: 0.48 [2 days, 11:31:05<13:23:54] +[titan] 2025-09-10 11:08:05,678 - root - INFO - step: 32655 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9167 +[titan] 2025-09-10 11:08:05,678 - root - INFO - lr: 3.4851e-06 gnorm: 0.50 [2 days, 11:31:37<13:23:21] +[titan] 2025-09-10 11:08:37,689 - root - INFO - step: 32660 loss: 2.6249 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7162 global_avg_top_loss: 1.9087 +[titan] 2025-09-10 11:08:37,689 - root - INFO - lr: 3.4832e-06 gnorm: 0.54 [2 days, 11:32:09<13:22:48] +[titan] 2025-09-10 11:09:09,665 - root - INFO - step: 32665 loss: 2.5249 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.6726 global_avg_top_loss: 1.8523 +[titan] 2025-09-10 11:09:09,665 - root - INFO - lr: 3.4812e-06 gnorm: 0.46 [2 days, 11:32:41<13:22:15] +[titan] 2025-09-10 11:09:41,500 - root - INFO - step: 32670 loss: 2.5397 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.57 mfu: 49.60% global_avg_ntp_loss: 0.6779 global_avg_top_loss: 1.8619 +[titan] 2025-09-10 11:09:41,500 - root - INFO - lr: 3.4792e-06 gnorm: 0.47 [2 days, 11:33:13<13:21:42] +[titan] 2025-09-10 11:10:13,609 - root - INFO - step: 32675 loss: 2.4715 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.6456 global_avg_top_loss: 1.8260 +[titan] 2025-09-10 11:10:13,610 - root - INFO - lr: 3.4773e-06 gnorm: 0.56 [2 days, 11:33:45<13:21:09] +[titan] 2025-09-10 11:10:45,574 - root - INFO - step: 32680 loss: 2.6185 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7126 global_avg_top_loss: 1.9059 +[titan] 
2025-09-10 11:10:45,574 - root - INFO - lr: 3.4753e-06 gnorm: 0.48 [2 days, 11:34:17<13:20:36] +[titan] 2025-09-10 11:11:17,631 - root - INFO - step: 32685 loss: 2.5272 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6724 global_avg_top_loss: 1.8548 +[titan] 2025-09-10 11:11:17,631 - root - INFO - lr: 3.4734e-06 gnorm: 0.52 [2 days, 11:34:49<13:20:03] +[titan] 2025-09-10 11:11:49,705 - root - INFO - step: 32690 loss: 2.6776 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7444 global_avg_top_loss: 1.9332 +[titan] 2025-09-10 11:11:49,706 - root - INFO - lr: 3.4714e-06 gnorm: 0.54 [2 days, 11:35:21<13:19:30] +[titan] 2025-09-10 11:12:21,550 - root - INFO - step: 32695 loss: 2.9332 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.9147 global_avg_top_loss: 2.0186 +[titan] 2025-09-10 11:12:21,550 - root - INFO - lr: 3.4694e-06 gnorm: 0.44 [2 days, 11:35:53<13:18:57] +[titan] 2025-09-10 11:12:47,245 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:12:53,574 - root - INFO - step: 32700 loss: 2.6665 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.7383 global_avg_top_loss: 1.9283 +[titan] 2025-09-10 11:12:53,575 - root - INFO - lr: 3.4675e-06 gnorm: 0.50 [2 days, 11:36:25<13:18:24] +[titan] 2025-09-10 11:13:25,530 - root - INFO - step: 32705 loss: 2.6914 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7494 global_avg_top_loss: 1.9419 +[titan] 2025-09-10 11:13:25,530 - root - INFO - lr: 3.4655e-06 gnorm: 0.50 [2 days, 11:36:57<13:17:51] +[titan] 2025-09-10 11:13:57,564 - root - INFO - step: 32710 loss: 3.0670 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.9699 global_avg_top_loss: 2.0971 +[titan] 2025-09-10 11:13:57,564 - root - INFO - lr: 3.4636e-06 gnorm: 0.48 [2 days, 11:37:29<13:17:18] +[titan] 2025-09-10 11:14:29,510 - root - INFO - step: 32715 loss: 2.5746 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6995 global_avg_top_loss: 1.8751 +[titan] 2025-09-10 11:14:29,510 - root - INFO - lr: 3.4616e-06 gnorm: 0.45 [2 days, 11:38:01<13:16:45] +[titan] 2025-09-10 11:15:01,286 - root - INFO - step: 32720 loss: 2.5888 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.47 mfu: 49.69% global_avg_ntp_loss: 0.7045 global_avg_top_loss: 1.8843 +[titan] 2025-09-10 11:15:01,287 - root - INFO - lr: 3.4597e-06 gnorm: 0.48 [2 days, 11:38:32<13:16:12] +[titan] 2025-09-10 11:15:33,190 - root - INFO - step: 32725 loss: 2.7430 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7880 global_avg_top_loss: 1.9550 +[titan] 2025-09-10 11:15:33,190 - root - INFO - lr: 3.4577e-06 gnorm: 0.47 [2 days, 11:39:04<13:15:39] +[titan] 2025-09-10 11:16:05,169 - root - INFO - step: 32730 loss: 2.5855 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7005 global_avg_top_loss: 1.8850 +[titan] 
2025-09-10 11:16:05,169 - root - INFO - lr: 3.4558e-06 gnorm: 0.47 [2 days, 11:39:36<13:15:06] +[titan] 2025-09-10 11:16:37,180 - root - INFO - step: 32735 loss: 2.6497 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7302 global_avg_top_loss: 1.9196 +[titan] 2025-09-10 11:16:37,180 - root - INFO - lr: 3.4538e-06 gnorm: 0.53 [2 days, 11:40:08<13:14:33] +[titan] 2025-09-10 11:17:09,161 - root - INFO - step: 32740 loss: 2.5685 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6936 global_avg_top_loss: 1.8749 +[titan] 2025-09-10 11:17:09,161 - root - INFO - lr: 3.4519e-06 gnorm: 0.52 [2 days, 11:40:40<13:14:00] +[titan] 2025-09-10 11:17:41,169 - root - INFO - step: 32745 loss: 2.5219 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.6761 global_avg_top_loss: 1.8458 +[titan] 2025-09-10 11:17:41,169 - root - INFO - lr: 3.4499e-06 gnorm: 0.47 [2 days, 11:41:12<13:13:27] +[titan] 2025-09-10 11:18:06,570 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:18:13,061 - root - INFO - step: 32750 loss: 2.5331 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.6757 global_avg_top_loss: 1.8574 +[titan] 2025-09-10 11:18:13,062 - root - INFO - lr: 3.4480e-06 gnorm: 0.47 [2 days, 11:41:44<13:12:54] +[titan] 2025-09-10 11:18:45,039 - root - INFO - step: 32755 loss: 2.5004 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.6578 global_avg_top_loss: 1.8425 +[titan] 2025-09-10 11:18:45,040 - root - INFO - lr: 3.4461e-06 gnorm: 0.59 [2 days, 11:42:16<13:12:21] +[titan] 2025-09-10 11:19:17,126 - root - INFO - step: 32760 loss: 2.6087 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7109 global_avg_top_loss: 1.8978 +[titan] 2025-09-10 11:19:17,127 - root - INFO - lr: 3.4441e-06 gnorm: 0.49 [2 days, 11:42:48<13:11:48] +[titan] 2025-09-10 11:19:49,167 - root - INFO - step: 32765 loss: 2.5477 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.6793 global_avg_top_loss: 1.8684 +[titan] 2025-09-10 11:19:49,168 - root - INFO - lr: 3.4422e-06 gnorm: 0.52 [2 days, 11:43:20<13:11:15] +[titan] 2025-09-10 11:20:08,521 - root - INFO - Dumping profiler traces at step 32768 +[titan] 2025-09-10 11:20:08,578 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-10 11:20:21,292 - root - INFO - step: 32770 loss: 2.6041 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.14 mfu: 49.15% global_avg_ntp_loss: 0.7069 global_avg_top_loss: 1.8972 +[titan] 2025-09-10 11:20:21,292 - root - INFO - lr: 3.4402e-06 gnorm: 0.51 [2 days, 11:43:52<13:10:42] +[titan] 2025-09-10 11:20:53,144 - root - INFO - step: 32775 loss: 2.9580 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.9180 global_avg_top_loss: 2.0400 +[titan] 2025-09-10 11:20:53,145 - root - INFO - lr: 3.4383e-06 gnorm: 0.47 [2 days, 11:44:24<13:10:09] +[titan] 2025-09-10 
11:21:25,206 - root - INFO - step: 32780 loss: 2.6103 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7086 global_avg_top_loss: 1.9017 +[titan] 2025-09-10 11:21:25,206 - root - INFO - lr: 3.4364e-06 gnorm: 0.52 [2 days, 11:44:56<13:09:36] +[titan] 2025-09-10 11:21:57,179 - root - INFO - step: 32785 loss: 2.6789 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9367 +[titan] 2025-09-10 11:21:57,179 - root - INFO - lr: 3.4344e-06 gnorm: 0.50 [2 days, 11:45:28<13:09:03] +[titan] 2025-09-10 11:22:29,090 - root - INFO - step: 32790 loss: 3.0858 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.9833 global_avg_top_loss: 2.1025 +[titan] 2025-09-10 11:22:29,090 - root - INFO - lr: 3.4325e-06 gnorm: 0.52 [2 days, 11:46:00<13:08:30] +[titan] 2025-09-10 11:23:01,247 - root - INFO - step: 32795 loss: 2.6666 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.65 mfu: 49.11% global_avg_ntp_loss: 0.7367 global_avg_top_loss: 1.9299 +[titan] 2025-09-10 11:23:01,247 - root - INFO - lr: 3.4306e-06 gnorm: 0.48 [2 days, 11:46:32<13:07:57] +[titan] 2025-09-10 11:23:26,678 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:23:33,026 - root - INFO - step: 32800 loss: 2.5508 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.43 mfu: 49.69% global_avg_ntp_loss: 0.6849 global_avg_top_loss: 1.8659 +[titan] 2025-09-10 11:23:33,026 - root - INFO - lr: 3.4286e-06 gnorm: 0.47 [2 days, 11:47:04<13:07:24] +[titan] 2025-09-10 11:24:04,809 - root - INFO - step: 32805 loss: 2.6162 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.37 mfu: 49.68% global_avg_ntp_loss: 0.7137 global_avg_top_loss: 1.9025 +[titan] 2025-09-10 11:24:04,809 - root - INFO - lr: 3.4267e-06 gnorm: 0.46 [2 days, 11:47:36<13:06:51] +[titan] 2025-09-10 11:24:36,820 - root - INFO - step: 32810 loss: 2.6533 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9226 +[titan] 2025-09-10 11:24:36,820 - root - INFO - lr: 3.4248e-06 gnorm: 0.46 [2 days, 11:48:08<13:06:18] +[titan] 2025-09-10 11:25:08,678 - root - INFO - step: 32815 loss: 2.6721 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9299 +[titan] 2025-09-10 11:25:08,678 - root - INFO - lr: 3.4229e-06 gnorm: 0.55 [2 days, 11:48:40<13:05:45] +[titan] 2025-09-10 11:25:40,550 - root - INFO - step: 32820 loss: 2.6152 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 0.7124 global_avg_top_loss: 1.9028 +[titan] 2025-09-10 11:25:40,550 - root - INFO - lr: 3.4209e-06 gnorm: 0.53 [2 days, 11:49:12<13:05:12] +[titan] 2025-09-10 11:26:12,345 - root - INFO - step: 32825 loss: 2.6141 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7171 global_avg_top_loss: 1.8970 +[titan] 2025-09-10 11:26:12,345 - root - INFO - lr: 3.4190e-06 gnorm: 0.49 [2 days, 11:49:43<13:04:39] +[titan] 2025-09-10 11:26:44,280 - root - INFO - step: 32830 loss: 2.4435 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.6296 global_avg_top_loss: 1.8138 +[titan] 
2025-09-10 11:26:44,280 - root - INFO - lr: 3.4171e-06 gnorm: 0.52 [2 days, 11:50:15<13:04:06] +[titan] 2025-09-10 11:27:16,121 - root - INFO - step: 32835 loss: 2.4279 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.6227 global_avg_top_loss: 1.8052 +[titan] 2025-09-10 11:27:16,122 - root - INFO - lr: 3.4152e-06 gnorm: 0.59 [2 days, 11:50:47<13:03:33] +[titan] 2025-09-10 11:27:48,188 - root - INFO - step: 32840 loss: 2.6491 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7303 global_avg_top_loss: 1.9188 +[titan] 2025-09-10 11:27:48,189 - root - INFO - lr: 3.4132e-06 gnorm: 0.50 [2 days, 11:51:19<13:03:00] +[titan] 2025-09-10 11:28:20,076 - root - INFO - step: 32845 loss: 2.5356 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.6748 global_avg_top_loss: 1.8607 +[titan] 2025-09-10 11:28:20,076 - root - INFO - lr: 3.4113e-06 gnorm: 0.51 [2 days, 11:51:51<13:02:27] +[titan] 2025-09-10 11:28:45,683 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:28:52,069 - root - INFO - step: 32850 loss: 2.5757 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.6977 global_avg_top_loss: 1.8781 +[titan] 2025-09-10 11:28:52,069 - root - INFO - lr: 3.4094e-06 gnorm: 0.49 [2 days, 11:52:23<13:01:54] +[titan] 2025-09-10 11:29:23,944 - root - INFO - step: 32855 loss: 2.5029 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.6587 global_avg_top_loss: 1.8443 +[titan] 2025-09-10 11:29:23,944 - root - INFO - lr: 3.4075e-06 gnorm: 0.49 [2 days, 11:52:55<13:01:21] +[titan] 2025-09-10 11:29:55,762 - root - INFO - step: 32860 loss: 2.6274 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.83 mfu: 49.63% global_avg_ntp_loss: 0.7188 global_avg_top_loss: 1.9086 +[titan] 2025-09-10 11:29:55,763 - root - INFO - lr: 3.4056e-06 gnorm: 0.52 [2 days, 11:53:27<13:00:48] +[titan] 2025-09-10 11:30:27,821 - root - INFO - step: 32865 loss: 2.5987 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.7074 global_avg_top_loss: 1.8913 +[titan] 2025-09-10 11:30:27,822 - root - INFO - lr: 3.4037e-06 gnorm: 0.53 [2 days, 11:53:59<13:00:15] +[titan] 2025-09-10 11:30:59,672 - root - INFO - step: 32870 loss: 3.1722 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.33 mfu: 49.58% global_avg_ntp_loss: 1.0207 global_avg_top_loss: 2.1515 +[titan] 2025-09-10 11:30:59,672 - root - INFO - lr: 3.4017e-06 gnorm: 0.51 [2 days, 11:54:31<12:59:42] +[titan] 2025-09-10 11:31:31,808 - root - INFO - step: 32875 loss: 2.5527 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.6821 global_avg_top_loss: 1.8705 +[titan] 2025-09-10 11:31:31,808 - root - INFO - lr: 3.3998e-06 gnorm: 0.50 [2 days, 11:55:03<12:59:09] +[titan] 2025-09-10 11:32:03,743 - root - INFO - step: 32880 loss: 2.8023 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.8043 global_avg_top_loss: 1.9980 +[titan] 
2025-09-10 11:32:03,743 - root - INFO - lr: 3.3979e-06 gnorm: 0.50 [2 days, 11:55:35<12:58:36] +[titan] 2025-09-10 11:32:35,732 - root - INFO - step: 32885 loss: 2.5757 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.6973 global_avg_top_loss: 1.8784 +[titan] 2025-09-10 11:32:35,732 - root - INFO - lr: 3.3960e-06 gnorm: 0.48 [2 days, 11:56:07<12:58:03] +[titan] 2025-09-10 11:33:07,720 - root - INFO - step: 32890 loss: 2.7157 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9574 +[titan] 2025-09-10 11:33:07,721 - root - INFO - lr: 3.3941e-06 gnorm: 0.50 [2 days, 11:56:39<12:57:30] +[titan] 2025-09-10 11:33:39,559 - root - INFO - step: 32895 loss: 2.6279 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.51 mfu: 49.60% global_avg_ntp_loss: 0.7244 global_avg_top_loss: 1.9036 +[titan] 2025-09-10 11:33:39,559 - root - INFO - lr: 3.3922e-06 gnorm: 0.53 [2 days, 11:57:11<12:56:57] +[titan] 2025-09-10 11:34:05,100 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:34:11,606 - root - INFO - step: 32900 loss: 2.6005 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7058 global_avg_top_loss: 1.8947 +[titan] 2025-09-10 11:34:11,606 - root - INFO - lr: 3.3903e-06 gnorm: 0.55 [2 days, 11:57:43<12:56:24] +[titan] 2025-09-10 11:34:43,494 - root - INFO - step: 32905 loss: 2.2527 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.5475 global_avg_top_loss: 1.7052 +[titan] 2025-09-10 11:34:43,494 - root - INFO - lr: 3.3884e-06 gnorm: 0.46 [2 days, 11:58:15<12:55:51] +[titan] 2025-09-10 11:35:15,283 - root - INFO - step: 32910 loss: 2.4294 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.27 mfu: 49.67% global_avg_ntp_loss: 0.6225 global_avg_top_loss: 1.8070 +[titan] 2025-09-10 11:35:15,283 - root - INFO - lr: 3.3865e-06 gnorm: 0.48 [2 days, 11:58:46<12:55:18] +[titan] 2025-09-10 11:35:47,108 - root - INFO - step: 32915 loss: 2.4614 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.73 mfu: 49.62% global_avg_ntp_loss: 0.6385 global_avg_top_loss: 1.8229 +[titan] 2025-09-10 11:35:47,108 - root - INFO - lr: 3.3846e-06 gnorm: 0.61 [2 days, 11:59:18<12:54:45] +[titan] 2025-09-10 11:36:19,088 - root - INFO - step: 32920 loss: 2.4285 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6321 global_avg_top_loss: 1.7964 +[titan] 2025-09-10 11:36:19,088 - root - INFO - lr: 3.3827e-06 gnorm: 0.46 [2 days, 11:59:50<12:54:12] +[titan] 2025-09-10 11:36:51,003 - root - INFO - step: 32925 loss: 2.4379 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.6301 global_avg_top_loss: 1.8078 +[titan] 2025-09-10 11:36:51,003 - root - INFO - lr: 3.3808e-06 gnorm: 0.52 [2 days, 12:00:22<12:53:39] +[titan] 2025-09-10 11:37:22,991 - root - INFO - step: 32930 loss: 2.6020 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7055 global_avg_top_loss: 1.8966 +[titan] 
2025-09-10 11:37:22,992 - root - INFO - lr: 3.3789e-06 gnorm: 0.51 [2 days, 12:00:54<12:53:06] +[titan] 2025-09-10 11:37:54,953 - root - INFO - step: 32935 loss: 2.3956 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.6165 global_avg_top_loss: 1.7790 +[titan] 2025-09-10 11:37:54,953 - root - INFO - lr: 3.3770e-06 gnorm: 0.49 [2 days, 12:01:26<12:52:33] +[titan] 2025-09-10 11:38:26,918 - root - INFO - step: 32940 loss: 2.5996 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7066 global_avg_top_loss: 1.8930 +[titan] 2025-09-10 11:38:26,919 - root - INFO - lr: 3.3751e-06 gnorm: 0.50 [2 days, 12:01:58<12:52:00] +[titan] 2025-09-10 11:38:58,620 - root - INFO - step: 32945 loss: 2.6606 memory: 122.03GiB(87.57%) tps: 10,337 tflops: 492.64 mfu: 49.81% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9255 +[titan] 2025-09-10 11:38:58,620 - root - INFO - lr: 3.3732e-06 gnorm: 0.51 [2 days, 12:02:30<12:51:27] +[titan] 2025-09-10 11:39:24,148 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:39:30,523 - root - INFO - step: 32950 loss: 2.5952 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7045 global_avg_top_loss: 1.8907 +[titan] 2025-09-10 11:39:30,523 - root - INFO - lr: 3.3713e-06 gnorm: 0.51 [2 days, 12:03:02<12:50:54] +[titan] 2025-09-10 11:40:02,484 - root - INFO - step: 32955 loss: 2.5793 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.6974 global_avg_top_loss: 1.8819 +[titan] 2025-09-10 11:40:02,484 - root - INFO - lr: 3.3694e-06 gnorm: 0.48 [2 days, 12:03:34<12:50:21] +[titan] 2025-09-10 11:40:34,510 - root - INFO - step: 32960 loss: 2.5976 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7061 global_avg_top_loss: 1.8915 +[titan] 2025-09-10 11:40:34,510 - root - INFO - lr: 3.3675e-06 gnorm: 0.48 [2 days, 12:04:06<12:49:48] +[titan] 2025-09-10 11:41:06,350 - root - INFO - step: 32965 loss: 2.5580 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8658 +[titan] 2025-09-10 11:41:06,350 - root - INFO - lr: 3.3656e-06 gnorm: 0.48 [2 days, 12:04:37<12:49:15] +[titan] 2025-09-10 11:41:38,424 - root - INFO - step: 32970 loss: 2.6950 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7522 global_avg_top_loss: 1.9429 +[titan] 2025-09-10 11:41:38,425 - root - INFO - lr: 3.3637e-06 gnorm: 0.46 [2 days, 12:05:09<12:48:42] +[titan] 2025-09-10 11:42:10,235 - root - INFO - step: 32975 loss: 2.5997 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.94 mfu: 49.64% global_avg_ntp_loss: 0.7063 global_avg_top_loss: 1.8934 +[titan] 2025-09-10 11:42:10,235 - root - INFO - lr: 3.3618e-06 gnorm: 0.52 [2 days, 12:05:41<12:48:09] +[titan] 2025-09-10 11:42:41,947 - root - INFO - step: 32980 loss: 2.5755 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.47 mfu: 49.79% global_avg_ntp_loss: 0.6979 global_avg_top_loss: 1.8775 +[titan] 
2025-09-10 11:42:41,948 - root - INFO - lr: 3.3599e-06 gnorm: 0.56 [2 days, 12:06:13<12:47:36] +[titan] 2025-09-10 11:43:13,986 - root - INFO - step: 32985 loss: 2.5143 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.6680 global_avg_top_loss: 1.8464 +[titan] 2025-09-10 11:43:13,986 - root - INFO - lr: 3.3581e-06 gnorm: 0.47 [2 days, 12:06:45<12:47:03] +[titan] 2025-09-10 11:43:45,681 - root - INFO - step: 32990 loss: 2.5189 memory: 122.03GiB(87.57%) tps: 10,339 tflops: 492.73 mfu: 49.82% global_avg_ntp_loss: 0.6643 global_avg_top_loss: 1.8546 +[titan] 2025-09-10 11:43:45,682 - root - INFO - lr: 3.3562e-06 gnorm: 0.50 [2 days, 12:07:17<12:46:30] +[titan] 2025-09-10 11:44:17,663 - root - INFO - step: 32995 loss: 2.5261 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.37% global_avg_ntp_loss: 0.6668 global_avg_top_loss: 1.8593 +[titan] 2025-09-10 11:44:17,663 - root - INFO - lr: 3.3543e-06 gnorm: 0.57 [2 days, 12:07:49<12:45:57] +[titan] 2025-09-10 11:44:43,211 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:44:49,621 - root - INFO - step: 33000 loss: 2.6048 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7059 global_avg_top_loss: 1.8990 +[titan] 2025-09-10 11:44:49,622 - root - INFO - lr: 3.3524e-06 gnorm: 0.49 [2 days, 12:08:21<12:45:24] +[titan] 2025-09-10 11:45:21,481 - root - INFO - step: 33005 loss: 2.5521 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.6794 global_avg_top_loss: 1.8728 +[titan] 2025-09-10 11:45:21,481 - root - INFO - lr: 3.3505e-06 gnorm: 0.53 [2 days, 12:08:53<12:44:51] +[titan] 2025-09-10 11:45:53,146 - root - INFO - step: 33010 loss: 2.6058 memory: 122.03GiB(87.57%) tps: 10,348 tflops: 493.19 mfu: 49.87% global_avg_ntp_loss: 0.7081 global_avg_top_loss: 1.8977 +[titan] 2025-09-10 11:45:53,147 - root - INFO - lr: 3.3486e-06 gnorm: 0.53 [2 days, 12:09:24<12:44:18] +[titan] 2025-09-10 11:46:24,952 - root - INFO - step: 33015 loss: 2.4955 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.6590 global_avg_top_loss: 1.8365 +[titan] 2025-09-10 11:46:24,952 - root - INFO - lr: 3.3468e-06 gnorm: 0.51 [2 days, 12:09:56<12:43:45] +[titan] 2025-09-10 11:46:56,883 - root - INFO - step: 33020 loss: 2.6787 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9372 +[titan] 2025-09-10 11:46:56,884 - root - INFO - lr: 3.3449e-06 gnorm: 0.52 [2 days, 12:10:28<12:43:12] +[titan] 2025-09-10 11:47:28,910 - root - INFO - step: 33025 loss: 2.6239 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.7200 global_avg_top_loss: 1.9039 +[titan] 2025-09-10 11:47:28,910 - root - INFO - lr: 3.3430e-06 gnorm: 0.50 [2 days, 12:11:00<12:42:39] +[titan] 2025-09-10 11:48:00,860 - root - INFO - step: 33030 loss: 3.0801 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.9770 global_avg_top_loss: 2.1031 +[titan] 
2025-09-10 11:48:00,860 - root - INFO - lr: 3.3411e-06 gnorm: 0.56 [2 days, 12:11:32<12:42:06] +[titan] 2025-09-10 11:48:32,745 - root - INFO - step: 33035 loss: 2.5760 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.6958 global_avg_top_loss: 1.8802 +[titan] 2025-09-10 11:48:32,745 - root - INFO - lr: 3.3393e-06 gnorm: 0.47 [2 days, 12:12:04<12:41:33] +[titan] 2025-09-10 11:49:04,792 - root - INFO - step: 33040 loss: 2.5261 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.6742 global_avg_top_loss: 1.8519 +[titan] 2025-09-10 11:49:04,793 - root - INFO - lr: 3.3374e-06 gnorm: 0.47 [2 days, 12:12:36<12:41:00] +[titan] 2025-09-10 11:49:36,619 - root - INFO - step: 33045 loss: 2.6017 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.69 mfu: 49.61% global_avg_ntp_loss: 0.7075 global_avg_top_loss: 1.8942 +[titan] 2025-09-10 11:49:36,620 - root - INFO - lr: 3.3355e-06 gnorm: 0.47 [2 days, 12:13:08<12:40:27] +[titan] 2025-09-10 11:50:02,182 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:50:08,556 - root - INFO - step: 33050 loss: 2.7836 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.8165 global_avg_top_loss: 1.9671 +[titan] 2025-09-10 11:50:08,557 - root - INFO - lr: 3.3336e-06 gnorm: 0.48 [2 days, 12:13:40<12:39:54] +[titan] 2025-09-10 11:50:40,451 - root - INFO - step: 33055 loss: 2.6297 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.9074 +[titan] 2025-09-10 11:50:40,451 - root - INFO - lr: 3.3318e-06 gnorm: 0.52 [2 days, 12:14:11<12:39:21] +[titan] 2025-09-10 11:51:12,345 - root - INFO - step: 33060 loss: 2.6427 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7292 global_avg_top_loss: 1.9135 +[titan] 2025-09-10 11:51:12,345 - root - INFO - lr: 3.3299e-06 gnorm: 0.55 [2 days, 12:14:43<12:38:48] +[titan] 2025-09-10 11:51:44,095 - root - INFO - step: 33065 loss: 2.4193 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.89 mfu: 49.74% global_avg_ntp_loss: 0.6237 global_avg_top_loss: 1.7956 +[titan] 2025-09-10 11:51:44,095 - root - INFO - lr: 3.3280e-06 gnorm: 0.47 [2 days, 12:15:15<12:38:15] +[titan] 2025-09-10 11:52:15,874 - root - INFO - step: 33070 loss: 2.4301 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.6315 global_avg_top_loss: 1.7986 +[titan] 2025-09-10 11:52:15,875 - root - INFO - lr: 3.3262e-06 gnorm: 0.48 [2 days, 12:15:47<12:37:42] +[titan] 2025-09-10 11:52:48,034 - root - INFO - step: 33075 loss: 2.5194 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.6680 global_avg_top_loss: 1.8513 +[titan] 2025-09-10 11:52:48,034 - root - INFO - lr: 3.3243e-06 gnorm: 0.59 [2 days, 12:16:19<12:37:09] +[titan] 2025-09-10 11:53:20,052 - root - INFO - step: 33080 loss: 2.5813 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.6949 global_avg_top_loss: 1.8864 +[titan] 
2025-09-10 11:53:20,052 - root - INFO - lr: 3.3224e-06 gnorm: 0.51 [2 days, 12:16:51<12:36:36] +[titan] 2025-09-10 11:53:52,033 - root - INFO - step: 33085 loss: 2.5054 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8438 +[titan] 2025-09-10 11:53:52,033 - root - INFO - lr: 3.3206e-06 gnorm: 0.53 [2 days, 12:17:23<12:36:03] +[titan] 2025-09-10 11:54:23,866 - root - INFO - step: 33090 loss: 2.6463 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.7257 global_avg_top_loss: 1.9206 +[titan] 2025-09-10 11:54:23,866 - root - INFO - lr: 3.3187e-06 gnorm: 0.52 [2 days, 12:17:55<12:35:30] +[titan] 2025-09-10 11:54:55,801 - root - INFO - step: 33095 loss: 2.4542 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.6373 global_avg_top_loss: 1.8169 +[titan] 2025-09-10 11:54:55,801 - root - INFO - lr: 3.3169e-06 gnorm: 0.51 [2 days, 12:18:27<12:34:57] +[titan] 2025-09-10 11:55:21,335 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 11:55:27,762 - root - INFO - step: 33100 loss: 2.6025 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7068 global_avg_top_loss: 1.8956 +[titan] 2025-09-10 11:55:27,762 - root - INFO - lr: 3.3150e-06 gnorm: 0.48 [2 days, 12:18:59<12:34:24] +[titan] 2025-09-10 11:55:59,474 - root - INFO - step: 33105 loss: 2.6705 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.47 mfu: 49.79% global_avg_ntp_loss: 0.7386 global_avg_top_loss: 1.9318 +[titan] 2025-09-10 11:55:59,475 - root - INFO - lr: 3.3131e-06 gnorm: 0.52 [2 days, 12:19:30<12:33:51] +[titan] 2025-09-10 11:56:31,550 - root - INFO - step: 33110 loss: 2.7291 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7688 global_avg_top_loss: 1.9602 +[titan] 2025-09-10 11:56:31,550 - root - INFO - lr: 3.3113e-06 gnorm: 0.76 [2 days, 12:20:03<12:33:18] +[titan] 2025-09-10 11:57:03,512 - root - INFO - step: 33115 loss: 2.6486 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7306 global_avg_top_loss: 1.9180 +[titan] 2025-09-10 11:57:03,513 - root - INFO - lr: 3.3094e-06 gnorm: 0.51 [2 days, 12:20:35<12:32:45] +[titan] 2025-09-10 11:57:35,354 - root - INFO - step: 33120 loss: 2.5861 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.6990 global_avg_top_loss: 1.8871 +[titan] 2025-09-10 11:57:35,355 - root - INFO - lr: 3.3076e-06 gnorm: 0.51 [2 days, 12:21:06<12:32:12] +[titan] 2025-09-10 11:58:07,186 - root - INFO - step: 33125 loss: 2.5657 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.6913 global_avg_top_loss: 1.8744 +[titan] 2025-09-10 11:58:07,187 - root - INFO - lr: 3.3057e-06 gnorm: 0.49 [2 days, 12:21:38<12:31:39] +[titan] 2025-09-10 11:58:39,142 - root - INFO - step: 33130 loss: 2.7226 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7619 global_avg_top_loss: 1.9607 +[titan] 
2025-09-10 11:58:39,143 - root - INFO - lr: 3.3039e-06 gnorm: 0.49 [2 days, 12:22:10<12:31:06] +[titan] 2025-09-10 11:59:11,169 - root - INFO - step: 33135 loss: 2.6094 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.31% global_avg_ntp_loss: 0.7123 global_avg_top_loss: 1.8972 +[titan] 2025-09-10 11:59:11,169 - root - INFO - lr: 3.3020e-06 gnorm: 0.50 [2 days, 12:22:42<12:30:33] +[titan] 2025-09-10 11:59:43,068 - root - INFO - step: 33140 loss: 2.5889 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7052 global_avg_top_loss: 1.8837 +[titan] 2025-09-10 11:59:43,068 - root - INFO - lr: 3.3002e-06 gnorm: 0.52 [2 days, 12:23:14<12:30:00] +[titan] 2025-09-10 12:00:14,847 - root - INFO - step: 33145 loss: 2.4370 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.43 mfu: 49.69% global_avg_ntp_loss: 0.6336 global_avg_top_loss: 1.8033 +[titan] 2025-09-10 12:00:14,847 - root - INFO - lr: 3.2983e-06 gnorm: 0.48 [2 days, 12:23:46<12:29:27] +[titan] 2025-09-10 12:00:40,304 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:00:46,782 - root - INFO - step: 33150 loss: 2.5159 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.6681 global_avg_top_loss: 1.8478 +[titan] 2025-09-10 12:00:46,782 - root - INFO - lr: 3.2965e-06 gnorm: 0.49 [2 days, 12:24:18<12:28:54] +[titan] 2025-09-10 12:01:18,795 - root - INFO - step: 33155 loss: 2.5355 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.6698 global_avg_top_loss: 1.8657 +[titan] 2025-09-10 12:01:18,796 - root - INFO - lr: 3.2946e-06 gnorm: 0.61 [2 days, 12:24:50<12:28:21] +[titan] 2025-09-10 12:01:51,076 - root - INFO - step: 33160 loss: 2.5519 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.79 mfu: 48.92% global_avg_ntp_loss: 0.6818 global_avg_top_loss: 1.8701 +[titan] 2025-09-10 12:01:51,077 - root - INFO - lr: 3.2928e-06 gnorm: 0.49 [2 days, 12:25:22<12:27:48] +[titan] 2025-09-10 12:02:22,871 - root - INFO - step: 33165 loss: 2.5104 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.6620 global_avg_top_loss: 1.8484 +[titan] 2025-09-10 12:02:22,872 - root - INFO - lr: 3.2910e-06 gnorm: 0.51 [2 days, 12:25:54<12:27:15] +[titan] 2025-09-10 12:02:54,930 - root - INFO - step: 33170 loss: 2.5120 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.6659 global_avg_top_loss: 1.8461 +[titan] 2025-09-10 12:02:54,930 - root - INFO - lr: 3.2891e-06 gnorm: 0.50 [2 days, 12:26:26<12:26:42] +[titan] 2025-09-10 12:03:26,865 - root - INFO - step: 33175 loss: 2.5619 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.6855 global_avg_top_loss: 1.8764 +[titan] 2025-09-10 12:03:26,865 - root - INFO - lr: 3.2873e-06 gnorm: 0.51 [2 days, 12:26:58<12:26:10] +[titan] 2025-09-10 12:03:58,727 - root - INFO - step: 33180 loss: 2.4940 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.14 mfu: 49.56% global_avg_ntp_loss: 0.6580 global_avg_top_loss: 1.8360 +[titan] 
2025-09-10 12:03:58,727 - root - INFO - lr: 3.2854e-06 gnorm: 0.50 [2 days, 12:27:30<12:25:37] +[titan] 2025-09-10 12:04:30,705 - root - INFO - step: 33185 loss: 2.6139 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7170 global_avg_top_loss: 1.8969 +[titan] 2025-09-10 12:04:30,705 - root - INFO - lr: 3.2836e-06 gnorm: 0.52 [2 days, 12:28:02<12:25:04] +[titan] 2025-09-10 12:05:02,661 - root - INFO - step: 33190 loss: 2.5887 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7047 global_avg_top_loss: 1.8840 +[titan] 2025-09-10 12:05:02,661 - root - INFO - lr: 3.2818e-06 gnorm: 0.52 [2 days, 12:28:34<12:24:31] +[titan] 2025-09-10 12:05:34,385 - root - INFO - step: 33195 loss: 2.5864 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.29 mfu: 49.78% global_avg_ntp_loss: 0.6996 global_avg_top_loss: 1.8868 +[titan] 2025-09-10 12:05:34,385 - root - INFO - lr: 3.2799e-06 gnorm: 0.50 [2 days, 12:29:05<12:23:58] +[titan] 2025-09-10 12:05:59,951 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:06:06,333 - root - INFO - step: 33200 loss: 2.6992 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7502 global_avg_top_loss: 1.9490 +[titan] 2025-09-10 12:06:06,333 - root - INFO - lr: 3.2781e-06 gnorm: 0.52 [2 days, 12:29:37<12:23:25] +[titan] 2025-09-10 12:06:38,249 - root - INFO - step: 33205 loss: 2.5689 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.6924 global_avg_top_loss: 1.8765 +[titan] 2025-09-10 12:06:38,249 - root - INFO - lr: 3.2763e-06 gnorm: 0.49 [2 days, 12:30:09<12:22:52] +[titan] 2025-09-10 12:07:10,113 - root - INFO - step: 33210 loss: 2.6387 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.12 mfu: 49.56% global_avg_ntp_loss: 0.7285 global_avg_top_loss: 1.9103 +[titan] 2025-09-10 12:07:10,113 - root - INFO - lr: 3.2744e-06 gnorm: 0.47 [2 days, 12:30:41<12:22:19] +[titan] 2025-09-10 12:07:42,161 - root - INFO - step: 33215 loss: 2.6018 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.7078 global_avg_top_loss: 1.8940 +[titan] 2025-09-10 12:07:42,161 - root - INFO - lr: 3.2726e-06 gnorm: 0.55 [2 days, 12:31:13<12:21:46] +[titan] 2025-09-10 12:08:14,003 - root - INFO - step: 33220 loss: 2.5725 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8803 +[titan] 2025-09-10 12:08:14,004 - root - INFO - lr: 3.2708e-06 gnorm: 0.56 [2 days, 12:31:45<12:21:13] +[titan] 2025-09-10 12:08:45,960 - root - INFO - step: 33225 loss: 2.5194 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.6741 global_avg_top_loss: 1.8453 +[titan] 2025-09-10 12:08:45,960 - root - INFO - lr: 3.2689e-06 gnorm: 0.48 [2 days, 12:32:17<12:20:40] +[titan] 2025-09-10 12:09:18,008 - root - INFO - step: 33230 loss: 2.4602 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.6412 global_avg_top_loss: 1.8190 +[titan] 
2025-09-10 12:09:18,008 - root - INFO - lr: 3.2671e-06 gnorm: 0.48 [2 days, 12:32:49<12:20:07] +[titan] 2025-09-10 12:09:50,014 - root - INFO - step: 33235 loss: 2.4804 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.6474 global_avg_top_loss: 1.8330 +[titan] 2025-09-10 12:09:50,014 - root - INFO - lr: 3.2653e-06 gnorm: 0.63 [2 days, 12:33:21<12:19:34] +[titan] 2025-09-10 12:10:21,863 - root - INFO - step: 33240 loss: 3.0725 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.34 mfu: 49.58% global_avg_ntp_loss: 0.9756 global_avg_top_loss: 2.0968 +[titan] 2025-09-10 12:10:21,864 - root - INFO - lr: 3.2635e-06 gnorm: 0.52 [2 days, 12:33:53<12:19:01] +[titan] 2025-09-10 12:10:53,735 - root - INFO - step: 33245 loss: 2.5279 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.6715 global_avg_top_loss: 1.8564 +[titan] 2025-09-10 12:10:53,735 - root - INFO - lr: 3.2616e-06 gnorm: 0.53 [2 days, 12:34:25<12:18:28] +[titan] 2025-09-10 12:11:19,277 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:11:25,679 - root - INFO - step: 33250 loss: 2.4331 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.6310 global_avg_top_loss: 1.8021 +[titan] 2025-09-10 12:11:25,679 - root - INFO - lr: 3.2598e-06 gnorm: 0.47 [2 days, 12:34:57<12:17:55] +[titan] 2025-09-10 12:11:57,658 - root - INFO - step: 33255 loss: 2.5937 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7006 global_avg_top_loss: 1.8932 +[titan] 2025-09-10 12:11:57,659 - root - INFO - lr: 3.2580e-06 gnorm: 0.51 [2 days, 12:35:29<12:17:22] +[titan] 2025-09-10 12:12:29,730 - root - INFO - step: 33260 loss: 2.6077 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7094 global_avg_top_loss: 1.8983 +[titan] 2025-09-10 12:12:29,730 - root - INFO - lr: 3.2562e-06 gnorm: 0.50 [2 days, 12:36:01<12:16:49] +[titan] 2025-09-10 12:13:01,764 - root - INFO - step: 33265 loss: 2.5867 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7025 global_avg_top_loss: 1.8842 +[titan] 2025-09-10 12:13:01,764 - root - INFO - lr: 3.2544e-06 gnorm: 0.51 [2 days, 12:36:33<12:16:16] +[titan] 2025-09-10 12:13:33,634 - root - INFO - step: 33270 loss: 2.6354 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.7291 global_avg_top_loss: 1.9063 +[titan] 2025-09-10 12:13:33,634 - root - INFO - lr: 3.2525e-06 gnorm: 0.53 [2 days, 12:37:05<12:15:43] +[titan] 2025-09-10 12:14:05,725 - root - INFO - step: 33275 loss: 2.5804 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.6978 global_avg_top_loss: 1.8826 +[titan] 2025-09-10 12:14:05,725 - root - INFO - lr: 3.2507e-06 gnorm: 0.47 [2 days, 12:37:37<12:15:10] +[titan] 2025-09-10 12:14:37,793 - root - INFO - step: 33280 loss: 2.5654 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.6880 global_avg_top_loss: 1.8773 +[titan] 
2025-09-10 12:14:37,793 - root - INFO - lr: 3.2489e-06 gnorm: 0.48 [2 days, 12:38:09<12:14:37] +[titan] 2025-09-10 12:14:38,086 - root - INFO - Dumping profiler traces at step 33280 +[titan] 2025-09-10 12:14:38,155 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 12:15:09,953 - root - INFO - step: 33285 loss: 2.6180 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.7120 global_avg_top_loss: 1.9060 +[titan] 2025-09-10 12:15:09,953 - root - INFO - lr: 3.2471e-06 gnorm: 0.49 [2 days, 12:38:41<12:14:04] +[titan] 2025-09-10 12:15:41,916 - root - INFO - step: 33290 loss: 2.6144 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7122 global_avg_top_loss: 1.9022 +[titan] 2025-09-10 12:15:41,917 - root - INFO - lr: 3.2453e-06 gnorm: 0.48 [2 days, 12:39:13<12:13:31] +[titan] 2025-09-10 12:16:14,086 - root - INFO - step: 33295 loss: 2.5586 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.46 mfu: 49.09% global_avg_ntp_loss: 0.6921 global_avg_top_loss: 1.8665 +[titan] 2025-09-10 12:16:14,087 - root - INFO - lr: 3.2435e-06 gnorm: 0.57 [2 days, 12:39:45<12:12:58] +[titan] 2025-09-10 12:16:39,698 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:16:46,081 - root - INFO - step: 33300 loss: 2.5678 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.6972 global_avg_top_loss: 1.8706 +[titan] 2025-09-10 12:16:46,081 - root - INFO - lr: 3.2417e-06 gnorm: 0.58 [2 days, 12:40:17<12:12:25] +[titan] 2025-09-10 12:17:18,022 - root - INFO - step: 33305 loss: 2.5740 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.6965 global_avg_top_loss: 1.8774 +[titan] 2025-09-10 12:17:18,023 - root - INFO - lr: 3.2398e-06 gnorm: 0.49 [2 days, 12:40:49<12:11:52] +[titan] 2025-09-10 12:17:50,179 - root - INFO - step: 33310 loss: 2.4597 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.67 mfu: 49.11% global_avg_ntp_loss: 0.6424 global_avg_top_loss: 1.8173 +[titan] 2025-09-10 12:17:50,179 - root - INFO - lr: 3.2380e-06 gnorm: 0.50 [2 days, 12:41:21<12:11:19] +[titan] 2025-09-10 12:18:22,024 - root - INFO - step: 33315 loss: 2.3603 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.5903 global_avg_top_loss: 1.7700 +[titan] 2025-09-10 12:18:22,024 - root - INFO - lr: 3.2362e-06 gnorm: 0.60 [2 days, 12:41:53<12:10:47] +[titan] 2025-09-10 12:18:53,886 - root - INFO - step: 33320 loss: 2.4561 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 0.6415 global_avg_top_loss: 1.8146 +[titan] 2025-09-10 12:18:53,886 - root - INFO - lr: 3.2344e-06 gnorm: 0.49 [2 days, 12:42:25<12:10:14] +[titan] 2025-09-10 12:19:25,762 - root - INFO - step: 33325 loss: 2.5630 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.6912 global_avg_top_loss: 1.8718 +[titan] 2025-09-10 12:19:25,762 - root - INFO - lr: 3.2326e-06 gnorm: 0.51 [2 days, 12:42:57<12:09:41] +[titan] 2025-09-10 12:19:57,699 - root - INFO - step: 33330 loss: 2.4655 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.6475 global_avg_top_loss: 1.8180 +[titan] 
2025-09-10 12:19:57,699 - root - INFO - lr: 3.2308e-06 gnorm: 0.49 [2 days, 12:43:29<12:09:08] +[titan] 2025-09-10 12:20:29,571 - root - INFO - step: 33335 loss: 2.4752 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.6497 global_avg_top_loss: 1.8255 +[titan] 2025-09-10 12:20:29,572 - root - INFO - lr: 3.2290e-06 gnorm: 0.50 [2 days, 12:44:01<12:08:35] +[titan] 2025-09-10 12:21:01,467 - root - INFO - step: 33340 loss: 2.5738 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.6976 global_avg_top_loss: 1.8761 +[titan] 2025-09-10 12:21:01,467 - root - INFO - lr: 3.2272e-06 gnorm: 0.52 [2 days, 12:44:32<12:08:02] +[titan] 2025-09-10 12:21:33,457 - root - INFO - step: 33345 loss: 2.5965 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7074 global_avg_top_loss: 1.8890 +[titan] 2025-09-10 12:21:33,457 - root - INFO - lr: 3.2254e-06 gnorm: 0.54 [2 days, 12:45:04<12:07:29] +[titan] 2025-09-10 12:21:59,085 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:22:05,508 - root - INFO - step: 33350 loss: 2.5592 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.6901 global_avg_top_loss: 1.8691 +[titan] 2025-09-10 12:22:05,508 - root - INFO - lr: 3.2236e-06 gnorm: 0.58 [2 days, 12:45:36<12:06:56] +[titan] 2025-09-10 12:22:37,421 - root - INFO - step: 33355 loss: 2.5544 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.6857 global_avg_top_loss: 1.8686 +[titan] 2025-09-10 12:22:37,421 - root - INFO - lr: 3.2218e-06 gnorm: 0.49 [2 days, 12:46:08<12:06:23] +[titan] 2025-09-10 12:23:09,719 - root - INFO - step: 33360 loss: 2.6192 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.53 mfu: 48.89% global_avg_ntp_loss: 0.7115 global_avg_top_loss: 1.9077 +[titan] 2025-09-10 12:23:09,719 - root - INFO - lr: 3.2200e-06 gnorm: 0.50 [2 days, 12:46:41<12:05:50] +[titan] 2025-09-10 12:23:41,548 - root - INFO - step: 33365 loss: 2.5401 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.6796 global_avg_top_loss: 1.8605 +[titan] 2025-09-10 12:23:41,548 - root - INFO - lr: 3.2182e-06 gnorm: 0.49 [2 days, 12:47:13<12:05:17] +[titan] 2025-09-10 12:24:13,687 - root - INFO - step: 33370 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.93 mfu: 49.13% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9547 +[titan] 2025-09-10 12:24:13,687 - root - INFO - lr: 3.2164e-06 gnorm: 0.48 [2 days, 12:47:45<12:04:44] +[titan] 2025-09-10 12:24:45,675 - root - INFO - step: 33375 loss: 2.6369 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.37% global_avg_ntp_loss: 0.7207 global_avg_top_loss: 1.9162 +[titan] 2025-09-10 12:24:45,675 - root - INFO - lr: 3.2147e-06 gnorm: 0.60 [2 days, 12:48:17<12:04:11] +[titan] 2025-09-10 12:25:17,867 - root - INFO - step: 33380 loss: 2.5110 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.13 mfu: 49.05% global_avg_ntp_loss: 0.6673 global_avg_top_loss: 1.8437 +[titan] 
2025-09-10 12:25:17,867 - root - INFO - lr: 3.2129e-06 gnorm: 0.56 [2 days, 12:48:49<12:03:38] +[titan] 2025-09-10 12:25:49,828 - root - INFO - step: 33385 loss: 2.6781 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9137 +[titan] 2025-09-10 12:25:49,828 - root - INFO - lr: 3.2111e-06 gnorm: 0.49 [2 days, 12:49:21<12:03:05] +[titan] 2025-09-10 12:26:21,755 - root - INFO - step: 33390 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.8085 global_avg_top_loss: 1.9639 +[titan] 2025-09-10 12:26:21,755 - root - INFO - lr: 3.2093e-06 gnorm: 0.51 [2 days, 12:49:53<12:02:32] +[titan] 2025-09-10 12:26:53,856 - root - INFO - step: 33395 loss: 2.3956 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.6073 global_avg_top_loss: 1.7883 +[titan] 2025-09-10 12:26:53,856 - root - INFO - lr: 3.2075e-06 gnorm: 0.63 [2 days, 12:50:25<12:01:59] +[titan] 2025-09-10 12:27:19,311 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:27:25,755 - root - INFO - step: 33400 loss: 2.5812 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.6949 global_avg_top_loss: 1.8863 +[titan] 2025-09-10 12:27:25,755 - root - INFO - lr: 3.2057e-06 gnorm: 0.52 [2 days, 12:50:57<12:01:26] +[titan] 2025-09-10 12:27:57,650 - root - INFO - step: 33405 loss: 2.5125 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.6657 global_avg_top_loss: 1.8468 +[titan] 2025-09-10 12:27:57,650 - root - INFO - lr: 3.2039e-06 gnorm: 0.56 [2 days, 12:51:29<12:00:53] +[titan] 2025-09-10 12:28:29,557 - root - INFO - step: 33410 loss: 2.5205 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.6670 global_avg_top_loss: 1.8535 +[titan] 2025-09-10 12:28:29,557 - root - INFO - lr: 3.2021e-06 gnorm: 0.51 [2 days, 12:52:01<12:00:20] +[titan] 2025-09-10 12:29:01,716 - root - INFO - step: 33415 loss: 2.4858 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.6552 global_avg_top_loss: 1.8306 +[titan] 2025-09-10 12:29:01,716 - root - INFO - lr: 3.2004e-06 gnorm: 0.48 [2 days, 12:52:33<11:59:47] +[titan] 2025-09-10 12:29:33,586 - root - INFO - step: 33420 loss: 2.5397 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.6767 global_avg_top_loss: 1.8630 +[titan] 2025-09-10 12:29:33,586 - root - INFO - lr: 3.1986e-06 gnorm: 0.52 [2 days, 12:53:05<11:59:14] +[titan] 2025-09-10 12:30:05,439 - root - INFO - step: 33425 loss: 2.6249 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.7165 global_avg_top_loss: 1.9084 +[titan] 2025-09-10 12:30:05,439 - root - INFO - lr: 3.1968e-06 gnorm: 0.54 [2 days, 12:53:36<11:58:41] +[titan] 2025-09-10 12:30:37,447 - root - INFO - step: 33430 loss: 2.6478 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7314 global_avg_top_loss: 1.9164 +[titan] 
2025-09-10 12:30:37,448 - root - INFO - lr: 3.1950e-06 gnorm: 0.58 [2 days, 12:54:08<11:58:09] +[titan] 2025-09-10 12:31:09,240 - root - INFO - step: 33435 loss: 2.6474 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.22 mfu: 49.67% global_avg_ntp_loss: 0.7274 global_avg_top_loss: 1.9200 +[titan] 2025-09-10 12:31:09,241 - root - INFO - lr: 3.1932e-06 gnorm: 0.50 [2 days, 12:54:40<11:57:36] +[titan] 2025-09-10 12:31:41,206 - root - INFO - step: 33440 loss: 2.5405 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.6757 global_avg_top_loss: 1.8649 +[titan] 2025-09-10 12:31:41,207 - root - INFO - lr: 3.1915e-06 gnorm: 0.50 [2 days, 12:55:12<11:57:03] +[titan] 2025-09-10 12:32:13,216 - root - INFO - step: 33445 loss: 2.5588 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.6851 global_avg_top_loss: 1.8737 +[titan] 2025-09-10 12:32:13,216 - root - INFO - lr: 3.1897e-06 gnorm: 0.50 [2 days, 12:55:44<11:56:30] +[titan] 2025-09-10 12:32:38,579 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:32:44,967 - root - INFO - step: 33450 loss: 2.8069 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 0.8245 global_avg_top_loss: 1.9824 +[titan] 2025-09-10 12:32:44,967 - root - INFO - lr: 3.1879e-06 gnorm: 0.48 [2 days, 12:56:16<11:55:57] +[titan] 2025-09-10 12:33:16,876 - root - INFO - step: 33455 loss: 2.5763 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.6942 global_avg_top_loss: 1.8820 +[titan] 2025-09-10 12:33:16,876 - root - INFO - lr: 3.1862e-06 gnorm: 0.54 [2 days, 12:56:48<11:55:24] +[titan] 2025-09-10 12:33:48,901 - root - INFO - step: 33460 loss: 2.6461 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7255 global_avg_top_loss: 1.9206 +[titan] 2025-09-10 12:33:48,902 - root - INFO - lr: 3.1844e-06 gnorm: 0.57 [2 days, 12:57:20<11:54:51] +[titan] 2025-09-10 12:34:20,878 - root - INFO - step: 33465 loss: 2.4301 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.6299 global_avg_top_loss: 1.8002 +[titan] 2025-09-10 12:34:20,878 - root - INFO - lr: 3.1826e-06 gnorm: 0.47 [2 days, 12:57:52<11:54:18] +[titan] 2025-09-10 12:34:52,801 - root - INFO - step: 33470 loss: 2.4799 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.6537 global_avg_top_loss: 1.8262 +[titan] 2025-09-10 12:34:52,801 - root - INFO - lr: 3.1808e-06 gnorm: 0.52 [2 days, 12:58:24<11:53:45] +[titan] 2025-09-10 12:35:24,785 - root - INFO - step: 33475 loss: 2.4320 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.6247 global_avg_top_loss: 1.8073 +[titan] 2025-09-10 12:35:24,785 - root - INFO - lr: 3.1791e-06 gnorm: 0.68 [2 days, 12:58:56<11:53:12] +[titan] 2025-09-10 12:35:56,967 - root - INFO - step: 33480 loss: 2.5629 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 0.6866 global_avg_top_loss: 1.8764 +[titan] 
2025-09-10 12:35:56,967 - root - INFO - lr: 3.1773e-06 gnorm: 0.51 [2 days, 12:59:28<11:52:39] +[titan] 2025-09-10 12:36:29,084 - root - INFO - step: 33485 loss: 2.4849 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.6494 global_avg_top_loss: 1.8355 +[titan] 2025-09-10 12:36:29,085 - root - INFO - lr: 3.1755e-06 gnorm: 0.54 [2 days, 13:00:00<11:52:06] +[titan] 2025-09-10 12:37:01,028 - root - INFO - step: 33490 loss: 2.5725 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.6896 global_avg_top_loss: 1.8828 +[titan] 2025-09-10 12:37:01,028 - root - INFO - lr: 3.1738e-06 gnorm: 0.53 [2 days, 13:00:32<11:51:33] +[titan] 2025-09-10 12:37:33,038 - root - INFO - step: 33495 loss: 2.5170 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.6676 global_avg_top_loss: 1.8494 +[titan] 2025-09-10 12:37:33,038 - root - INFO - lr: 3.1720e-06 gnorm: 0.51 [2 days, 13:01:04<11:51:00] +[titan] 2025-09-10 12:37:58,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:38:04,973 - root - INFO - step: 33500 loss: 2.6286 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7228 global_avg_top_loss: 1.9058 +[titan] 2025-09-10 12:38:04,974 - root - INFO - lr: 3.1703e-06 gnorm: 0.53 [2 days, 13:01:36<11:50:27] +[titan] 2025-09-10 12:38:36,902 - root - INFO - step: 33505 loss: 2.5920 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7001 global_avg_top_loss: 1.8919 +[titan] 2025-09-10 12:38:36,903 - root - INFO - lr: 3.1685e-06 gnorm: 0.52 [2 days, 13:02:08<11:49:54] +[titan] 2025-09-10 12:39:09,036 - root - INFO - step: 33510 loss: 2.9199 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.8799 global_avg_top_loss: 2.0400 +[titan] 2025-09-10 12:39:09,036 - root - INFO - lr: 3.1667e-06 gnorm: 0.58 [2 days, 13:02:40<11:49:21] +[titan] 2025-09-10 12:39:40,957 - root - INFO - step: 33515 loss: 2.6080 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.7092 global_avg_top_loss: 1.8987 +[titan] 2025-09-10 12:39:40,957 - root - INFO - lr: 3.1650e-06 gnorm: 0.50 [2 days, 13:03:12<11:48:48] +[titan] 2025-09-10 12:40:12,952 - root - INFO - step: 33520 loss: 2.6178 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.7166 global_avg_top_loss: 1.9011 +[titan] 2025-09-10 12:40:12,952 - root - INFO - lr: 3.1632e-06 gnorm: 0.52 [2 days, 13:03:44<11:48:15] +[titan] 2025-09-10 12:40:44,940 - root - INFO - step: 33525 loss: 2.6660 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.36% global_avg_ntp_loss: 0.7528 global_avg_top_loss: 1.9132 +[titan] 2025-09-10 12:40:44,941 - root - INFO - lr: 3.1615e-06 gnorm: 0.49 [2 days, 13:04:16<11:47:42] +[titan] 2025-09-10 12:41:16,840 - root - INFO - step: 33530 loss: 2.6128 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.7109 global_avg_top_loss: 1.9019 +[titan] 
2025-09-10 12:41:16,840 - root - INFO - lr: 3.1597e-06 gnorm: 0.48 [2 days, 13:04:48<11:47:09] +[titan] 2025-09-10 12:41:48,855 - root - INFO - step: 33535 loss: 2.6326 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.7215 global_avg_top_loss: 1.9112 +[titan] 2025-09-10 12:41:48,856 - root - INFO - lr: 3.1580e-06 gnorm: 0.57 [2 days, 13:05:20<11:46:37] +[titan] 2025-09-10 12:42:20,736 - root - INFO - step: 33540 loss: 2.5658 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.6890 global_avg_top_loss: 1.8769 +[titan] 2025-09-10 12:42:20,736 - root - INFO - lr: 3.1562e-06 gnorm: 0.60 [2 days, 13:05:52<11:46:04] +[titan] 2025-09-10 12:42:52,489 - root - INFO - step: 33545 loss: 2.8857 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.83 mfu: 49.73% global_avg_ntp_loss: 0.8882 global_avg_top_loss: 1.9975 +[titan] 2025-09-10 12:42:52,489 - root - INFO - lr: 3.1545e-06 gnorm: 0.49 [2 days, 13:06:23<11:45:31] +[titan] 2025-09-10 12:43:18,262 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:43:24,590 - root - INFO - step: 33550 loss: 2.3970 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.6083 global_avg_top_loss: 1.7887 +[titan] 2025-09-10 12:43:24,590 - root - INFO - lr: 3.1527e-06 gnorm: 0.55 [2 days, 13:06:56<11:44:58] +[titan] 2025-09-10 12:43:56,533 - root - INFO - step: 33555 loss: 2.4709 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.6409 global_avg_top_loss: 1.8301 +[titan] 2025-09-10 12:43:56,534 - root - INFO - lr: 3.1510e-06 gnorm: 0.65 [2 days, 13:07:28<11:44:25] +[titan] 2025-09-10 12:44:28,420 - root - INFO - step: 33560 loss: 2.6621 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9199 +[titan] 2025-09-10 12:44:28,420 - root - INFO - lr: 3.1492e-06 gnorm: 0.50 [2 days, 13:07:59<11:43:52] +[titan] 2025-09-10 12:45:00,331 - root - INFO - step: 33565 loss: 2.5104 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.6624 global_avg_top_loss: 1.8480 +[titan] 2025-09-10 12:45:00,331 - root - INFO - lr: 3.1475e-06 gnorm: 0.55 [2 days, 13:08:31<11:43:19] +[titan] 2025-09-10 12:45:32,343 - root - INFO - step: 33570 loss: 2.6461 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7212 global_avg_top_loss: 1.9249 +[titan] 2025-09-10 12:45:32,343 - root - INFO - lr: 3.1457e-06 gnorm: 0.53 [2 days, 13:09:03<11:42:46] +[titan] 2025-09-10 12:46:04,193 - root - INFO - step: 33575 loss: 2.4515 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.34 mfu: 49.58% global_avg_ntp_loss: 0.6395 global_avg_top_loss: 1.8120 +[titan] 2025-09-10 12:46:04,193 - root - INFO - lr: 3.1440e-06 gnorm: 0.51 [2 days, 13:09:35<11:42:13] +[titan] 2025-09-10 12:46:36,276 - root - INFO - step: 33580 loss: 2.6147 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7158 global_avg_top_loss: 1.8989 +[titan] 
2025-09-10 12:46:36,276 - root - INFO - lr: 3.1423e-06 gnorm: 0.57 [2 days, 13:10:07<11:41:40] +[titan] 2025-09-10 12:47:08,250 - root - INFO - step: 33585 loss: 2.6107 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7143 global_avg_top_loss: 1.8964 +[titan] 2025-09-10 12:47:08,250 - root - INFO - lr: 3.1405e-06 gnorm: 0.51 [2 days, 13:10:39<11:41:07] +[titan] 2025-09-10 12:47:40,242 - root - INFO - step: 33590 loss: 2.6036 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7117 global_avg_top_loss: 1.8919 +[titan] 2025-09-10 12:47:40,242 - root - INFO - lr: 3.1388e-06 gnorm: 0.56 [2 days, 13:11:11<11:40:34] +[titan] 2025-09-10 12:48:12,362 - root - INFO - step: 33595 loss: 2.6124 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.21 mfu: 49.16% global_avg_ntp_loss: 0.7120 global_avg_top_loss: 1.9003 +[titan] 2025-09-10 12:48:12,362 - root - INFO - lr: 3.1370e-06 gnorm: 0.63 [2 days, 13:11:43<11:40:01] +[titan] 2025-09-10 12:48:38,010 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:48:44,463 - root - INFO - step: 33600 loss: 2.5599 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.6837 global_avg_top_loss: 1.8761 +[titan] 2025-09-10 12:48:44,464 - root - INFO - lr: 3.1353e-06 gnorm: 0.50 [2 days, 13:12:15<11:39:28] +[titan] 2025-09-10 12:49:16,353 - root - INFO - step: 33605 loss: 2.5987 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.7058 global_avg_top_loss: 1.8929 +[titan] 2025-09-10 12:49:16,354 - root - INFO - lr: 3.1336e-06 gnorm: 0.50 [2 days, 13:12:47<11:38:55] +[titan] 2025-09-10 12:49:48,232 - root - INFO - step: 33610 loss: 2.5841 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.6989 global_avg_top_loss: 1.8852 +[titan] 2025-09-10 12:49:48,232 - root - INFO - lr: 3.1318e-06 gnorm: 0.48 [2 days, 13:13:19<11:38:22] +[titan] 2025-09-10 12:50:20,308 - root - INFO - step: 33615 loss: 2.6119 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.7123 global_avg_top_loss: 1.8996 +[titan] 2025-09-10 12:50:20,308 - root - INFO - lr: 3.1301e-06 gnorm: 0.62 [2 days, 13:13:51<11:37:49] +[titan] 2025-09-10 12:50:52,218 - root - INFO - step: 33620 loss: 2.6066 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.7075 global_avg_top_loss: 1.8991 +[titan] 2025-09-10 12:50:52,218 - root - INFO - lr: 3.1284e-06 gnorm: 0.58 [2 days, 13:14:23<11:37:16] +[titan] 2025-09-10 12:51:24,155 - root - INFO - step: 33625 loss: 2.8952 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.8898 global_avg_top_loss: 2.0054 +[titan] 2025-09-10 12:51:24,155 - root - INFO - lr: 3.1266e-06 gnorm: 0.50 [2 days, 13:14:55<11:36:44] +[titan] 2025-09-10 12:51:56,085 - root - INFO - step: 33630 loss: 2.4891 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.6539 global_avg_top_loss: 1.8352 +[titan] 
2025-09-10 12:51:56,085 - root - INFO - lr: 3.1249e-06 gnorm: 0.56 [2 days, 13:15:27<11:36:11] +[titan] 2025-09-10 12:52:28,273 - root - INFO - step: 33635 loss: 2.3894 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.19 mfu: 49.06% global_avg_ntp_loss: 0.6044 global_avg_top_loss: 1.7850 +[titan] 2025-09-10 12:52:28,273 - root - INFO - lr: 3.1232e-06 gnorm: 0.62 [2 days, 13:15:59<11:35:38] +[titan] 2025-09-10 12:53:00,468 - root - INFO - step: 33640 loss: 2.5065 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.6625 global_avg_top_loss: 1.8440 +[titan] 2025-09-10 12:53:00,468 - root - INFO - lr: 3.1215e-06 gnorm: 0.50 [2 days, 13:16:31<11:35:05] +[titan] 2025-09-10 12:53:32,435 - root - INFO - step: 33645 loss: 2.5108 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.6608 global_avg_top_loss: 1.8501 +[titan] 2025-09-10 12:53:32,435 - root - INFO - lr: 3.1197e-06 gnorm: 0.55 [2 days, 13:17:03<11:34:32] +[titan] 2025-09-10 12:53:57,892 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:54:04,307 - root - INFO - step: 33650 loss: 2.5783 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 0.6930 global_avg_top_loss: 1.8853 +[titan] 2025-09-10 12:54:04,307 - root - INFO - lr: 3.1180e-06 gnorm: 0.51 [2 days, 13:17:35<11:33:59] +[titan] 2025-09-10 12:54:36,275 - root - INFO - step: 33655 loss: 2.5320 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.6751 global_avg_top_loss: 1.8569 +[titan] 2025-09-10 12:54:36,276 - root - INFO - lr: 3.1163e-06 gnorm: 0.49 [2 days, 13:18:07<11:33:26] +[titan] 2025-09-10 12:55:08,143 - root - INFO - step: 33660 loss: 2.5819 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.6963 global_avg_top_loss: 1.8856 +[titan] 2025-09-10 12:55:08,144 - root - INFO - lr: 3.1146e-06 gnorm: 0.52 [2 days, 13:18:39<11:32:53] +[titan] 2025-09-10 12:55:40,109 - root - INFO - step: 33665 loss: 2.5668 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8746 +[titan] 2025-09-10 12:55:40,110 - root - INFO - lr: 3.1128e-06 gnorm: 0.54 [2 days, 13:19:11<11:32:20] +[titan] 2025-09-10 12:56:12,269 - root - INFO - step: 33670 loss: 2.7106 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.61 mfu: 49.10% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9515 +[titan] 2025-09-10 12:56:12,270 - root - INFO - lr: 3.1111e-06 gnorm: 0.57 [2 days, 13:19:43<11:31:47] +[titan] 2025-09-10 12:56:44,262 - root - INFO - step: 33675 loss: 2.5250 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.6748 global_avg_top_loss: 1.8502 +[titan] 2025-09-10 12:56:44,263 - root - INFO - lr: 3.1094e-06 gnorm: 0.50 [2 days, 13:20:15<11:31:14] +[titan] 2025-09-10 12:57:16,118 - root - INFO - step: 33680 loss: 2.5755 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.24 mfu: 49.57% global_avg_ntp_loss: 0.6959 global_avg_top_loss: 1.8796 +[titan] 
2025-09-10 12:57:16,119 - root - INFO - lr: 3.1077e-06 gnorm: 0.53 [2 days, 13:20:47<11:30:41] +[titan] 2025-09-10 12:57:47,901 - root - INFO - step: 33685 loss: 2.5604 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.38 mfu: 49.68% global_avg_ntp_loss: 0.6926 global_avg_top_loss: 1.8679 +[titan] 2025-09-10 12:57:47,901 - root - INFO - lr: 3.1060e-06 gnorm: 0.49 [2 days, 13:21:19<11:30:08] +[titan] 2025-09-10 12:58:19,764 - root - INFO - step: 33690 loss: 2.7423 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.14 mfu: 49.56% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9675 +[titan] 2025-09-10 12:58:19,764 - root - INFO - lr: 3.1043e-06 gnorm: 0.49 [2 days, 13:21:51<11:29:35] +[titan] 2025-09-10 12:58:51,626 - root - INFO - step: 33695 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 0.7046 global_avg_top_loss: 1.8859 +[titan] 2025-09-10 12:58:51,626 - root - INFO - lr: 3.1025e-06 gnorm: 0.59 [2 days, 13:22:23<11:29:02] +[titan] 2025-09-10 12:59:16,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 12:59:23,324 - root - INFO - step: 33700 loss: 2.6063 memory: 122.03GiB(87.57%) tps: 10,338 tflops: 492.68 mfu: 49.82% global_avg_ntp_loss: 0.7084 global_avg_top_loss: 1.8979 +[titan] 2025-09-10 12:59:23,325 - root - INFO - lr: 3.1008e-06 gnorm: 0.62 [2 days, 13:22:54<11:28:29] +[titan] 2025-09-10 12:59:55,478 - root - INFO - step: 33705 loss: 2.8513 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.8674 global_avg_top_loss: 1.9839 +[titan] 2025-09-10 12:59:55,478 - root - INFO - lr: 3.0991e-06 gnorm: 0.49 [2 days, 13:23:26<11:27:56] +[titan] 2025-09-10 13:00:27,396 - root - INFO - step: 33710 loss: 2.4496 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.6351 global_avg_top_loss: 1.8146 +[titan] 2025-09-10 13:00:27,396 - root - INFO - lr: 3.0974e-06 gnorm: 0.52 [2 days, 13:23:58<11:27:23] +[titan] 2025-09-10 13:00:59,350 - root - INFO - step: 33715 loss: 2.4967 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.6548 global_avg_top_loss: 1.8419 +[titan] 2025-09-10 13:00:59,351 - root - INFO - lr: 3.0957e-06 gnorm: 0.68 [2 days, 13:24:30<11:26:51] +[titan] 2025-09-10 13:01:31,372 - root - INFO - step: 33720 loss: 2.5727 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.6931 global_avg_top_loss: 1.8796 +[titan] 2025-09-10 13:01:31,372 - root - INFO - lr: 3.0940e-06 gnorm: 0.50 [2 days, 13:25:02<11:26:18] +[titan] 2025-09-10 13:02:03,053 - root - INFO - step: 33725 loss: 2.9129 memory: 122.03GiB(87.57%) tps: 10,343 tflops: 492.95 mfu: 49.84% global_avg_ntp_loss: 0.8910 global_avg_top_loss: 2.0219 +[titan] 2025-09-10 13:02:03,053 - root - INFO - lr: 3.0923e-06 gnorm: 0.54 [2 days, 13:25:34<11:25:45] +[titan] 2025-09-10 13:02:35,130 - root - INFO - step: 33730 loss: 2.5524 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.6829 global_avg_top_loss: 1.8695 +[titan] 
2025-09-10 13:02:35,130 - root - INFO - lr: 3.0906e-06 gnorm: 0.53 [2 days, 13:26:06<11:25:12] +[titan] 2025-09-10 13:03:07,405 - root - INFO - step: 33735 loss: 2.5313 memory: 122.03GiB(87.57%) tps: 10,153 tflops: 483.88 mfu: 48.93% global_avg_ntp_loss: 0.6745 global_avg_top_loss: 1.8569 +[titan] 2025-09-10 13:03:07,405 - root - INFO - lr: 3.0889e-06 gnorm: 0.49 [2 days, 13:26:38<11:24:39] +[titan] 2025-09-10 13:03:39,225 - root - INFO - step: 33740 loss: 2.6106 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.79 mfu: 49.63% global_avg_ntp_loss: 0.7097 global_avg_top_loss: 1.9009 +[titan] 2025-09-10 13:03:39,225 - root - INFO - lr: 3.0872e-06 gnorm: 0.54 [2 days, 13:27:10<11:24:06] +[titan] 2025-09-10 13:04:11,168 - root - INFO - step: 33745 loss: 2.6255 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7157 global_avg_top_loss: 1.9098 +[titan] 2025-09-10 13:04:11,168 - root - INFO - lr: 3.0855e-06 gnorm: 0.54 [2 days, 13:27:42<11:23:33] +[titan] 2025-09-10 13:04:36,866 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:04:43,183 - root - INFO - step: 33750 loss: 2.6539 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7336 global_avg_top_loss: 1.9203 +[titan] 2025-09-10 13:04:43,183 - root - INFO - lr: 3.0838e-06 gnorm: 0.60 [2 days, 13:28:14<11:23:00] +[titan] 2025-09-10 13:05:15,179 - root - INFO - step: 33755 loss: 2.6252 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.7100 global_avg_top_loss: 1.9152 +[titan] 2025-09-10 13:05:15,179 - root - INFO - lr: 3.0821e-06 gnorm: 0.53 [2 days, 13:28:46<11:22:27] +[titan] 2025-09-10 13:05:46,991 - root - INFO - step: 33760 loss: 2.5734 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.91 mfu: 49.64% global_avg_ntp_loss: 0.6913 global_avg_top_loss: 1.8821 +[titan] 2025-09-10 13:05:46,992 - root - INFO - lr: 3.0804e-06 gnorm: 0.49 [2 days, 13:29:18<11:21:54] +[titan] 2025-09-10 13:06:19,078 - root - INFO - step: 33765 loss: 2.5173 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.6689 global_avg_top_loss: 1.8484 +[titan] 2025-09-10 13:06:19,078 - root - INFO - lr: 3.0787e-06 gnorm: 0.51 [2 days, 13:29:50<11:21:21] +[titan] 2025-09-10 13:06:51,017 - root - INFO - step: 33770 loss: 2.4826 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.6577 global_avg_top_loss: 1.8249 +[titan] 2025-09-10 13:06:51,017 - root - INFO - lr: 3.0770e-06 gnorm: 0.46 [2 days, 13:30:22<11:20:48] +[titan] 2025-09-10 13:07:22,974 - root - INFO - step: 33775 loss: 2.6078 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7096 global_avg_top_loss: 1.8982 +[titan] 2025-09-10 13:07:22,975 - root - INFO - lr: 3.0753e-06 gnorm: 0.56 [2 days, 13:30:54<11:20:15] +[titan] 2025-09-10 13:07:55,148 - root - INFO - step: 33780 loss: 2.6265 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.41 mfu: 49.08% global_avg_ntp_loss: 0.7154 global_avg_top_loss: 1.9111 +[titan] 
2025-09-10 13:07:55,148 - root - INFO - lr: 3.0736e-06 gnorm: 0.63 [2 days, 13:31:26<11:19:42] +[titan] 2025-09-10 13:08:26,942 - root - INFO - step: 33785 loss: 2.4772 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.20 mfu: 49.67% global_avg_ntp_loss: 0.6499 global_avg_top_loss: 1.8273 +[titan] 2025-09-10 13:08:26,942 - root - INFO - lr: 3.0719e-06 gnorm: 0.49 [2 days, 13:31:58<11:19:09] +[titan] 2025-09-10 13:08:59,028 - root - INFO - step: 33790 loss: 2.4807 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.6451 global_avg_top_loss: 1.8356 +[titan] 2025-09-10 13:08:59,028 - root - INFO - lr: 3.0702e-06 gnorm: 0.56 [2 days, 13:32:30<11:18:37] +[titan] 2025-09-10 13:09:12,145 - root - INFO - Dumping profiler traces at step 33792 +[titan] 2025-09-10 13:09:12,215 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 13:09:31,231 - root - INFO - step: 33795 loss: 2.4378 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.96 mfu: 49.04% global_avg_ntp_loss: 0.6258 global_avg_top_loss: 1.8120 +[titan] 2025-09-10 13:09:31,231 - root - INFO - lr: 3.0686e-06 gnorm: 0.69 [2 days, 13:33:02<11:18:04] +[titan] 2025-09-10 13:09:56,712 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:10:03,064 - root - INFO - step: 33800 loss: 2.5206 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.6687 global_avg_top_loss: 1.8519 +[titan] 2025-09-10 13:10:03,064 - root - INFO - lr: 3.0669e-06 gnorm: 0.50 [2 days, 13:33:34<11:17:31] +[titan] 2025-09-10 13:10:34,963 - root - INFO - step: 33805 loss: 2.5469 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.6779 global_avg_top_loss: 1.8690 +[titan] 2025-09-10 13:10:34,964 - root - INFO - lr: 3.0652e-06 gnorm: 0.54 [2 days, 13:34:06<11:16:58] +[titan] 2025-09-10 13:11:07,041 - root - INFO - step: 33810 loss: 2.5629 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.6886 global_avg_top_loss: 1.8743 +[titan] 2025-09-10 13:11:07,042 - root - INFO - lr: 3.0635e-06 gnorm: 0.54 [2 days, 13:34:38<11:16:25] +[titan] 2025-09-10 13:11:38,896 - root - INFO - step: 33815 loss: 2.5085 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.26 mfu: 49.57% global_avg_ntp_loss: 0.6671 global_avg_top_loss: 1.8414 +[titan] 2025-09-10 13:11:38,896 - root - INFO - lr: 3.0618e-06 gnorm: 0.52 [2 days, 13:35:10<11:15:52] +[titan] 2025-09-10 13:12:10,849 - root - INFO - step: 33820 loss: 2.5586 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.6867 global_avg_top_loss: 1.8719 +[titan] 2025-09-10 13:12:10,849 - root - INFO - lr: 3.0601e-06 gnorm: 0.53 [2 days, 13:35:42<11:15:19] +[titan] 2025-09-10 13:12:42,888 - root - INFO - step: 33825 loss: 2.6565 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7336 global_avg_top_loss: 1.9229 +[titan] 2025-09-10 13:12:42,888 - root - INFO - lr: 3.0585e-06 gnorm: 0.55 [2 days, 13:36:14<11:14:46] +[titan] 2025-09-10 13:13:14,935 - root - INFO - step: 33830 loss: 2.6576 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7322 global_avg_top_loss: 1.9254 +[titan] 
2025-09-10 13:13:14,935 - root - INFO - lr: 3.0568e-06 gnorm: 0.57 [2 days, 13:36:46<11:14:13] +[titan] 2025-09-10 13:13:46,945 - root - INFO - step: 33835 loss: 2.5496 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.6775 global_avg_top_loss: 1.8721 +[titan] 2025-09-10 13:13:46,945 - root - INFO - lr: 3.0551e-06 gnorm: 0.55 [2 days, 13:37:18<11:13:40] +[titan] 2025-09-10 13:14:18,949 - root - INFO - step: 33840 loss: 2.6266 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.7274 global_avg_top_loss: 1.8992 +[titan] 2025-09-10 13:14:18,950 - root - INFO - lr: 3.0534e-06 gnorm: 0.50 [2 days, 13:37:50<11:13:07] +[titan] 2025-09-10 13:14:50,790 - root - INFO - step: 33845 loss: 2.6028 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.48 mfu: 49.59% global_avg_ntp_loss: 0.7079 global_avg_top_loss: 1.8949 +[titan] 2025-09-10 13:14:50,790 - root - INFO - lr: 3.0517e-06 gnorm: 0.52 [2 days, 13:38:22<11:12:34] +[titan] 2025-09-10 13:15:16,314 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:15:22,665 - root - INFO - step: 33850 loss: 2.6410 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7274 global_avg_top_loss: 1.9136 +[titan] 2025-09-10 13:15:22,666 - root - INFO - lr: 3.0501e-06 gnorm: 0.50 [2 days, 13:38:54<11:12:01] +[titan] 2025-09-10 13:15:54,794 - root - INFO - step: 33855 loss: 2.6415 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.7269 global_avg_top_loss: 1.9146 +[titan] 2025-09-10 13:15:54,795 - root - INFO - lr: 3.0484e-06 gnorm: 0.53 [2 days, 13:39:26<11:11:28] +[titan] 2025-09-10 13:16:26,802 - root - INFO - step: 33860 loss: 2.5850 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.6986 global_avg_top_loss: 1.8865 +[titan] 2025-09-10 13:16:26,802 - root - INFO - lr: 3.0467e-06 gnorm: 0.61 [2 days, 13:39:58<11:10:56] +[titan] 2025-09-10 13:16:58,728 - root - INFO - step: 33865 loss: 2.4914 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.6540 global_avg_top_loss: 1.8373 +[titan] 2025-09-10 13:16:58,729 - root - INFO - lr: 3.0451e-06 gnorm: 0.52 [2 days, 13:40:30<11:10:23] +[titan] 2025-09-10 13:17:31,008 - root - INFO - step: 33870 loss: 2.4718 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.81 mfu: 48.92% global_avg_ntp_loss: 0.6441 global_avg_top_loss: 1.8278 +[titan] 2025-09-10 13:17:31,009 - root - INFO - lr: 3.0434e-06 gnorm: 0.56 [2 days, 13:41:02<11:09:50] +[titan] 2025-09-10 13:18:03,080 - root - INFO - step: 33875 loss: 2.4742 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.6421 global_avg_top_loss: 1.8321 +[titan] 2025-09-10 13:18:03,081 - root - INFO - lr: 3.0417e-06 gnorm: 0.64 [2 days, 13:41:34<11:09:17] +[titan] 2025-09-10 13:18:35,051 - root - INFO - step: 33880 loss: 2.9889 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.9367 global_avg_top_loss: 2.0521 +[titan] 
2025-09-10 13:18:35,051 - root - INFO - lr: 3.0401e-06 gnorm: 0.49 [2 days, 13:42:06<11:08:44] +[titan] 2025-09-10 13:19:07,001 - root - INFO - step: 33885 loss: 2.8136 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.8297 global_avg_top_loss: 1.9839 +[titan] 2025-09-10 13:19:07,001 - root - INFO - lr: 3.0384e-06 gnorm: 0.52 [2 days, 13:42:38<11:08:11] +[titan] 2025-09-10 13:19:39,189 - root - INFO - step: 33890 loss: 2.4566 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.20 mfu: 49.06% global_avg_ntp_loss: 0.6374 global_avg_top_loss: 1.8192 +[titan] 2025-09-10 13:19:39,189 - root - INFO - lr: 3.0367e-06 gnorm: 0.52 [2 days, 13:43:10<11:07:38] +[titan] 2025-09-10 13:20:11,239 - root - INFO - step: 33895 loss: 2.4471 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.6383 global_avg_top_loss: 1.8088 +[titan] 2025-09-10 13:20:11,240 - root - INFO - lr: 3.0351e-06 gnorm: 0.48 [2 days, 13:43:42<11:07:05] +[titan] 2025-09-10 13:20:36,667 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:20:43,029 - root - INFO - step: 33900 loss: 2.6321 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.27 mfu: 49.67% global_avg_ntp_loss: 0.7197 global_avg_top_loss: 1.9123 +[titan] 2025-09-10 13:20:43,029 - root - INFO - lr: 3.0334e-06 gnorm: 0.51 [2 days, 13:44:14<11:06:32] +[titan] 2025-09-10 13:21:15,049 - root - INFO - step: 33905 loss: 2.5613 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.6875 global_avg_top_loss: 1.8737 +[titan] 2025-09-10 13:21:15,050 - root - INFO - lr: 3.0317e-06 gnorm: 0.53 [2 days, 13:44:46<11:05:59] +[titan] 2025-09-10 13:21:46,796 - root - INFO - step: 33910 loss: 2.7102 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.93 mfu: 49.74% global_avg_ntp_loss: 0.7569 global_avg_top_loss: 1.9533 +[titan] 2025-09-10 13:21:46,796 - root - INFO - lr: 3.0301e-06 gnorm: 0.62 [2 days, 13:45:18<11:05:26] +[titan] 2025-09-10 13:22:18,907 - root - INFO - step: 33915 loss: 2.5797 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.6952 global_avg_top_loss: 1.8845 +[titan] 2025-09-10 13:22:18,907 - root - INFO - lr: 3.0284e-06 gnorm: 0.51 [2 days, 13:45:50<11:04:53] +[titan] 2025-09-10 13:22:50,753 - root - INFO - step: 33920 loss: 2.5602 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.6842 global_avg_top_loss: 1.8760 +[titan] 2025-09-10 13:22:50,753 - root - INFO - lr: 3.0268e-06 gnorm: 0.51 [2 days, 13:46:22<11:04:20] +[titan] 2025-09-10 13:23:22,796 - root - INFO - step: 33925 loss: 2.5665 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.6896 global_avg_top_loss: 1.8769 +[titan] 2025-09-10 13:23:22,797 - root - INFO - lr: 3.0251e-06 gnorm: 0.51 [2 days, 13:46:54<11:03:48] +[titan] 2025-09-10 13:23:54,686 - root - INFO - step: 33930 loss: 2.5958 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.7025 global_avg_top_loss: 1.8933 +[titan] 
2025-09-10 13:23:54,686 - root - INFO - lr: 3.0235e-06 gnorm: 0.51 [2 days, 13:47:26<11:03:15] +[titan] 2025-09-10 13:24:26,816 - root - INFO - step: 33935 loss: 2.6296 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7205 global_avg_top_loss: 1.9090 +[titan] 2025-09-10 13:24:26,816 - root - INFO - lr: 3.0218e-06 gnorm: 0.52 [2 days, 13:47:58<11:02:42] +[titan] 2025-09-10 13:24:58,776 - root - INFO - step: 33940 loss: 2.6650 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7469 global_avg_top_loss: 1.9181 +[titan] 2025-09-10 13:24:58,776 - root - INFO - lr: 3.0202e-06 gnorm: 0.63 [2 days, 13:48:30<11:02:09] +[titan] 2025-09-10 13:25:30,701 - root - INFO - step: 33945 loss: 2.4455 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.6354 global_avg_top_loss: 1.8101 +[titan] 2025-09-10 13:25:30,702 - root - INFO - lr: 3.0185e-06 gnorm: 0.51 [2 days, 13:49:02<11:01:36] +[titan] 2025-09-10 13:25:56,248 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:26:02,748 - root - INFO - step: 33950 loss: 2.5139 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.6637 global_avg_top_loss: 1.8502 +[titan] 2025-09-10 13:26:02,748 - root - INFO - lr: 3.0169e-06 gnorm: 0.57 [2 days, 13:49:34<11:01:03] +[titan] 2025-09-10 13:26:34,540 - root - INFO - step: 33955 loss: 2.3453 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.23 mfu: 49.67% global_avg_ntp_loss: 0.5893 global_avg_top_loss: 1.7560 +[titan] 2025-09-10 13:26:34,540 - root - INFO - lr: 3.0152e-06 gnorm: 0.61 [2 days, 13:50:05<11:00:30] +[titan] 2025-09-10 13:27:06,523 - root - INFO - step: 33960 loss: 2.6782 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7509 global_avg_top_loss: 1.9273 +[titan] 2025-09-10 13:27:06,523 - root - INFO - lr: 3.0136e-06 gnorm: 0.53 [2 days, 13:50:37<10:59:57] +[titan] 2025-09-10 13:27:38,524 - root - INFO - step: 33965 loss: 2.5575 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.34% global_avg_ntp_loss: 0.6823 global_avg_top_loss: 1.8752 +[titan] 2025-09-10 13:27:38,524 - root - INFO - lr: 3.0119e-06 gnorm: 0.56 [2 days, 13:51:09<10:59:24] +[titan] 2025-09-10 13:28:10,516 - root - INFO - step: 33970 loss: 2.6249 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7237 global_avg_top_loss: 1.9012 +[titan] 2025-09-10 13:28:10,516 - root - INFO - lr: 3.0103e-06 gnorm: 0.55 [2 days, 13:51:41<10:58:51] +[titan] 2025-09-10 13:28:42,361 - root - INFO - step: 33975 loss: 2.4893 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.6570 global_avg_top_loss: 1.8322 +[titan] 2025-09-10 13:28:42,361 - root - INFO - lr: 3.0086e-06 gnorm: 0.53 [2 days, 13:52:13<10:58:18] +[titan] 2025-09-10 13:29:14,443 - root - INFO - step: 33980 loss: 2.6427 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9168 +[titan] 
2025-09-10 13:29:14,443 - root - INFO - lr: 3.0070e-06 gnorm: 0.59 [2 days, 13:52:45<10:57:45] +[titan] 2025-09-10 13:29:46,499 - root - INFO - step: 33985 loss: 2.6054 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.7079 global_avg_top_loss: 1.8975 +[titan] 2025-09-10 13:29:46,500 - root - INFO - lr: 3.0053e-06 gnorm: 0.54 [2 days, 13:53:17<10:57:12] +[titan] 2025-09-10 13:30:18,431 - root - INFO - step: 33990 loss: 2.5811 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.7022 global_avg_top_loss: 1.8789 +[titan] 2025-09-10 13:30:18,432 - root - INFO - lr: 3.0037e-06 gnorm: 0.61 [2 days, 13:53:49<10:56:40] +[titan] 2025-09-10 13:30:50,352 - root - INFO - step: 33995 loss: 2.5703 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.6919 global_avg_top_loss: 1.8785 +[titan] 2025-09-10 13:30:50,352 - root - INFO - lr: 3.0021e-06 gnorm: 0.54 [2 days, 13:54:21<10:56:07] +[titan] 2025-09-10 13:31:15,677 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:31:22,087 - root - INFO - step: 34000 loss: 2.5756 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.11 mfu: 49.76% global_avg_ntp_loss: 0.6938 global_avg_top_loss: 1.8819 +[titan] 2025-09-10 13:31:22,088 - root - INFO - lr: 3.0004e-06 gnorm: 0.53 [2 days, 13:54:53<10:55:34] +[titan] 2025-09-10 13:31:54,364 - root - INFO - step: 34005 loss: 2.6265 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.85 mfu: 48.92% global_avg_ntp_loss: 0.7155 global_avg_top_loss: 1.9110 +[titan] 2025-09-10 13:31:54,365 - root - INFO - lr: 2.9988e-06 gnorm: 0.53 [2 days, 13:55:25<10:55:01] +[titan] 2025-09-10 13:32:26,195 - root - INFO - step: 34010 loss: 2.6958 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7527 global_avg_top_loss: 1.9430 +[titan] 2025-09-10 13:32:26,195 - root - INFO - lr: 2.9972e-06 gnorm: 0.52 [2 days, 13:55:57<10:54:28] +[titan] 2025-09-10 13:32:58,307 - root - INFO - step: 34015 loss: 2.6126 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 0.7091 global_avg_top_loss: 1.9035 +[titan] 2025-09-10 13:32:58,307 - root - INFO - lr: 2.9955e-06 gnorm: 0.55 [2 days, 13:56:29<10:53:55] +[titan] 2025-09-10 13:33:30,318 - root - INFO - step: 34020 loss: 3.0293 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.9468 global_avg_top_loss: 2.0825 +[titan] 2025-09-10 13:33:30,318 - root - INFO - lr: 2.9939e-06 gnorm: 0.60 [2 days, 13:57:01<10:53:22] +[titan] 2025-09-10 13:34:02,367 - root - INFO - step: 34025 loss: 2.4961 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.6718 global_avg_top_loss: 1.8243 +[titan] 2025-09-10 13:34:02,367 - root - INFO - lr: 2.9923e-06 gnorm: 0.49 [2 days, 13:57:33<10:52:49] +[titan] 2025-09-10 13:34:34,289 - root - INFO - step: 34030 loss: 2.4974 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.6579 global_avg_top_loss: 1.8395 +[titan] 
2025-09-10 13:34:34,289 - root - INFO - lr: 2.9906e-06 gnorm: 0.59 [2 days, 13:58:05<10:52:16] +[titan] 2025-09-10 13:35:06,450 - root - INFO - step: 34035 loss: 2.4567 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.60 mfu: 49.10% global_avg_ntp_loss: 0.6361 global_avg_top_loss: 1.8206 +[titan] 2025-09-10 13:35:06,450 - root - INFO - lr: 2.9890e-06 gnorm: 0.61 [2 days, 13:58:37<10:51:43] +[titan] 2025-09-10 13:35:38,413 - root - INFO - step: 34040 loss: 2.4792 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.6503 global_avg_top_loss: 1.8288 +[titan] 2025-09-10 13:35:38,413 - root - INFO - lr: 2.9874e-06 gnorm: 0.52 [2 days, 13:59:09<10:51:10] +[titan] 2025-09-10 13:36:10,450 - root - INFO - step: 34045 loss: 2.5039 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.6596 global_avg_top_loss: 1.8443 +[titan] 2025-09-10 13:36:10,451 - root - INFO - lr: 2.9858e-06 gnorm: 0.52 [2 days, 13:59:41<10:50:37] +[titan] 2025-09-10 13:36:35,880 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:36:42,274 - root - INFO - step: 34050 loss: 2.5573 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.6875 global_avg_top_loss: 1.8698 +[titan] 2025-09-10 13:36:42,274 - root - INFO - lr: 2.9841e-06 gnorm: 0.52 [2 days, 14:00:13<10:50:05] +[titan] 2025-09-10 13:37:14,526 - root - INFO - step: 34055 loss: 3.0317 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.23 mfu: 48.96% global_avg_ntp_loss: 0.9531 global_avg_top_loss: 2.0787 +[titan] 2025-09-10 13:37:14,526 - root - INFO - lr: 2.9825e-06 gnorm: 0.52 [2 days, 14:00:45<10:49:32] +[titan] 2025-09-10 13:37:46,566 - root - INFO - step: 34060 loss: 2.5960 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7142 global_avg_top_loss: 1.8817 +[titan] 2025-09-10 13:37:46,566 - root - INFO - lr: 2.9809e-06 gnorm: 0.54 [2 days, 14:01:17<10:48:59] +[titan] 2025-09-10 13:38:18,641 - root - INFO - step: 34065 loss: 2.6247 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7226 global_avg_top_loss: 1.9021 +[titan] 2025-09-10 13:38:18,641 - root - INFO - lr: 2.9793e-06 gnorm: 0.57 [2 days, 14:01:50<10:48:26] +[titan] 2025-09-10 13:38:50,479 - root - INFO - step: 34070 loss: 3.0716 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.52 mfu: 49.60% global_avg_ntp_loss: 0.9732 global_avg_top_loss: 2.0984 +[titan] 2025-09-10 13:38:50,479 - root - INFO - lr: 2.9777e-06 gnorm: 0.65 [2 days, 14:02:21<10:47:53] +[titan] 2025-09-10 13:39:22,490 - root - INFO - step: 34075 loss: 2.7850 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9785 +[titan] 2025-09-10 13:39:22,490 - root - INFO - lr: 2.9760e-06 gnorm: 0.53 [2 days, 14:02:53<10:47:20] +[titan] 2025-09-10 13:39:54,417 - root - INFO - step: 34080 loss: 2.5994 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.6994 global_avg_top_loss: 1.8999 +[titan] 
2025-09-10 13:39:54,418 - root - INFO - lr: 2.9744e-06 gnorm: 0.55 [2 days, 14:03:25<10:46:47] +[titan] 2025-09-10 13:40:26,520 - root - INFO - step: 34085 loss: 2.5968 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7030 global_avg_top_loss: 1.8938 +[titan] 2025-09-10 13:40:26,520 - root - INFO - lr: 2.9728e-06 gnorm: 0.51 [2 days, 14:03:57<10:46:14] +[titan] 2025-09-10 13:40:58,522 - root - INFO - step: 34090 loss: 2.6725 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.7411 global_avg_top_loss: 1.9314 +[titan] 2025-09-10 13:40:58,523 - root - INFO - lr: 2.9712e-06 gnorm: 0.54 [2 days, 14:04:29<10:45:41] +[titan] 2025-09-10 13:41:30,407 - root - INFO - step: 34095 loss: 2.5861 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.6998 global_avg_top_loss: 1.8862 +[titan] 2025-09-10 13:41:30,408 - root - INFO - lr: 2.9696e-06 gnorm: 0.52 [2 days, 14:05:01<10:45:08] +[titan] 2025-09-10 13:41:56,053 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:42:02,453 - root - INFO - step: 34100 loss: 2.5201 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.6759 global_avg_top_loss: 1.8442 +[titan] 2025-09-10 13:42:02,453 - root - INFO - lr: 2.9680e-06 gnorm: 0.60 [2 days, 14:05:33<10:44:35] +[titan] 2025-09-10 13:42:34,454 - root - INFO - step: 34105 loss: 2.4702 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.6413 global_avg_top_loss: 1.8289 +[titan] 2025-09-10 13:42:34,455 - root - INFO - lr: 2.9664e-06 gnorm: 0.51 [2 days, 14:06:05<10:44:03] +[titan] 2025-09-10 13:43:06,534 - root - INFO - step: 34110 loss: 2.4668 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.6424 global_avg_top_loss: 1.8244 +[titan] 2025-09-10 13:43:06,534 - root - INFO - lr: 2.9647e-06 gnorm: 0.53 [2 days, 14:06:37<10:43:30] +[titan] 2025-09-10 13:43:38,790 - root - INFO - step: 34115 loss: 2.4254 memory: 122.03GiB(87.57%) tps: 10,159 tflops: 484.17 mfu: 48.96% global_avg_ntp_loss: 0.6211 global_avg_top_loss: 1.8043 +[titan] 2025-09-10 13:43:38,790 - root - INFO - lr: 2.9631e-06 gnorm: 0.61 [2 days, 14:07:10<10:42:57] +[titan] 2025-09-10 13:44:10,839 - root - INFO - step: 34120 loss: 2.5322 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.6738 global_avg_top_loss: 1.8584 +[titan] 2025-09-10 13:44:10,839 - root - INFO - lr: 2.9615e-06 gnorm: 0.52 [2 days, 14:07:42<10:42:24] +[titan] 2025-09-10 13:44:43,012 - root - INFO - step: 34125 loss: 2.9574 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.42 mfu: 49.08% global_avg_ntp_loss: 0.9049 global_avg_top_loss: 2.0525 +[titan] 2025-09-10 13:44:43,012 - root - INFO - lr: 2.9599e-06 gnorm: 0.54 [2 days, 14:08:14<10:41:51] +[titan] 2025-09-10 13:45:14,894 - root - INFO - step: 34130 loss: 2.6310 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7195 global_avg_top_loss: 1.9115 +[titan] 
2025-09-10 13:45:14,895 - root - INFO - lr: 2.9583e-06 gnorm: 0.59 [2 days, 14:08:46<10:41:18] +[titan] 2025-09-10 13:45:46,743 - root - INFO - step: 34135 loss: 2.9552 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.9152 global_avg_top_loss: 2.0401 +[titan] 2025-09-10 13:45:46,743 - root - INFO - lr: 2.9567e-06 gnorm: 0.52 [2 days, 14:09:18<10:40:45] +[titan] 2025-09-10 13:46:18,764 - root - INFO - step: 34140 loss: 2.6091 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7093 global_avg_top_loss: 1.8998 +[titan] 2025-09-10 13:46:18,764 - root - INFO - lr: 2.9551e-06 gnorm: 0.53 [2 days, 14:09:50<10:40:12] +[titan] 2025-09-10 13:46:50,848 - root - INFO - step: 34145 loss: 2.5566 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.6888 global_avg_top_loss: 1.8679 +[titan] 2025-09-10 13:46:50,848 - root - INFO - lr: 2.9535e-06 gnorm: 0.58 [2 days, 14:10:22<10:39:39] +[titan] 2025-09-10 13:47:16,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:47:22,736 - root - INFO - step: 34150 loss: 2.7069 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 0.7571 global_avg_top_loss: 1.9498 +[titan] 2025-09-10 13:47:22,736 - root - INFO - lr: 2.9519e-06 gnorm: 0.60 [2 days, 14:10:54<10:39:06] +[titan] 2025-09-10 13:47:54,810 - root - INFO - step: 34155 loss: 2.5422 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.6788 global_avg_top_loss: 1.8634 +[titan] 2025-09-10 13:47:54,810 - root - INFO - lr: 2.9503e-06 gnorm: 0.53 [2 days, 14:11:26<10:38:34] +[titan] 2025-09-10 13:48:26,767 - root - INFO - step: 34160 loss: 2.5158 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.6652 global_avg_top_loss: 1.8506 +[titan] 2025-09-10 13:48:26,768 - root - INFO - lr: 2.9487e-06 gnorm: 0.52 [2 days, 14:11:58<10:38:01] +[titan] 2025-09-10 13:48:58,848 - root - INFO - step: 34165 loss: 2.5502 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.6836 global_avg_top_loss: 1.8666 +[titan] 2025-09-10 13:48:58,848 - root - INFO - lr: 2.9471e-06 gnorm: 0.53 [2 days, 14:12:30<10:37:28] +[titan] 2025-09-10 13:49:30,777 - root - INFO - step: 34170 loss: 2.6044 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.7085 global_avg_top_loss: 1.8959 +[titan] 2025-09-10 13:49:30,778 - root - INFO - lr: 2.9455e-06 gnorm: 0.53 [2 days, 14:13:02<10:36:55] +[titan] 2025-09-10 13:50:02,887 - root - INFO - step: 34175 loss: 2.4417 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.6341 global_avg_top_loss: 1.8076 +[titan] 2025-09-10 13:50:02,887 - root - INFO - lr: 2.9440e-06 gnorm: 0.53 [2 days, 14:13:34<10:36:22] +[titan] 2025-09-10 13:50:34,778 - root - INFO - step: 34180 loss: 2.5657 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.6925 global_avg_top_loss: 1.8732 +[titan] 
2025-09-10 13:50:34,779 - root - INFO - lr: 2.9424e-06 gnorm: 0.64 [2 days, 14:14:06<10:35:49] +[titan] 2025-09-10 13:51:06,791 - root - INFO - step: 34185 loss: 2.4536 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.6388 global_avg_top_loss: 1.8148 +[titan] 2025-09-10 13:51:06,791 - root - INFO - lr: 2.9408e-06 gnorm: 0.50 [2 days, 14:14:38<10:35:16] +[titan] 2025-09-10 13:51:38,829 - root - INFO - step: 34190 loss: 2.5944 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.7017 global_avg_top_loss: 1.8927 +[titan] 2025-09-10 13:51:38,830 - root - INFO - lr: 2.9392e-06 gnorm: 0.61 [2 days, 14:15:10<10:34:43] +[titan] 2025-09-10 13:52:10,793 - root - INFO - step: 34195 loss: 2.4844 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.6502 global_avg_top_loss: 1.8342 +[titan] 2025-09-10 13:52:10,793 - root - INFO - lr: 2.9376e-06 gnorm: 0.65 [2 days, 14:15:42<10:34:10] +[titan] 2025-09-10 13:52:36,367 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:52:42,745 - root - INFO - step: 34200 loss: 2.4685 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.6469 global_avg_top_loss: 1.8216 +[titan] 2025-09-10 13:52:42,745 - root - INFO - lr: 2.9360e-06 gnorm: 0.50 [2 days, 14:16:14<10:33:37] +[titan] 2025-09-10 13:53:14,697 - root - INFO - step: 34205 loss: 2.5054 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.6608 global_avg_top_loss: 1.8445 +[titan] 2025-09-10 13:53:14,697 - root - INFO - lr: 2.9344e-06 gnorm: 0.53 [2 days, 14:16:46<10:33:04] +[titan] 2025-09-10 13:53:46,643 - root - INFO - step: 34210 loss: 2.5497 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6820 global_avg_top_loss: 1.8676 +[titan] 2025-09-10 13:53:46,643 - root - INFO - lr: 2.9328e-06 gnorm: 0.57 [2 days, 14:17:18<10:32:32] +[titan] 2025-09-10 13:54:18,539 - root - INFO - step: 34215 loss: 2.9634 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.9189 global_avg_top_loss: 2.0445 +[titan] 2025-09-10 13:54:18,539 - root - INFO - lr: 2.9313e-06 gnorm: 0.53 [2 days, 14:17:49<10:31:59] +[titan] 2025-09-10 13:54:50,701 - root - INFO - step: 34220 loss: 2.5857 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.57 mfu: 49.10% global_avg_ntp_loss: 0.6986 global_avg_top_loss: 1.8872 +[titan] 2025-09-10 13:54:50,702 - root - INFO - lr: 2.9297e-06 gnorm: 0.52 [2 days, 14:18:22<10:31:26] +[titan] 2025-09-10 13:55:22,663 - root - INFO - step: 34225 loss: 2.6404 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9123 +[titan] 2025-09-10 13:55:22,664 - root - INFO - lr: 2.9281e-06 gnorm: 0.53 [2 days, 14:18:54<10:30:53] +[titan] 2025-09-10 13:55:54,662 - root - INFO - step: 34230 loss: 2.6297 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7258 global_avg_top_loss: 1.9039 +[titan] 
2025-09-10 13:55:54,662 - root - INFO - lr: 2.9265e-06 gnorm: 0.63 [2 days, 14:19:26<10:30:20] +[titan] 2025-09-10 13:56:26,937 - root - INFO - step: 34235 loss: 2.5434 memory: 122.03GiB(87.57%) tps: 10,153 tflops: 483.88 mfu: 48.93% global_avg_ntp_loss: 0.6799 global_avg_top_loss: 1.8635 +[titan] 2025-09-10 13:56:26,937 - root - INFO - lr: 2.9249e-06 gnorm: 0.58 [2 days, 14:19:58<10:29:47] +[titan] 2025-09-10 13:56:58,953 - root - INFO - step: 34240 loss: 2.5766 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.6953 global_avg_top_loss: 1.8813 +[titan] 2025-09-10 13:56:58,954 - root - INFO - lr: 2.9234e-06 gnorm: 0.53 [2 days, 14:20:30<10:29:14] +[titan] 2025-09-10 13:57:30,827 - root - INFO - step: 34245 loss: 2.5865 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.7027 global_avg_top_loss: 1.8838 +[titan] 2025-09-10 13:57:30,827 - root - INFO - lr: 2.9218e-06 gnorm: 0.54 [2 days, 14:21:02<10:28:41] +[titan] 2025-09-10 13:57:56,751 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 13:58:03,189 - root - INFO - step: 34250 loss: 2.6618 memory: 122.03GiB(87.57%) tps: 10,126 tflops: 482.57 mfu: 48.79% global_avg_ntp_loss: 0.7413 global_avg_top_loss: 1.9205 +[titan] 2025-09-10 13:58:03,190 - root - INFO - lr: 2.9202e-06 gnorm: 0.54 [2 days, 14:21:34<10:28:08] +[titan] 2025-09-10 13:58:35,116 - root - INFO - step: 34255 loss: 2.5679 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.6925 global_avg_top_loss: 1.8754 +[titan] 2025-09-10 13:58:35,116 - root - INFO - lr: 2.9186e-06 gnorm: 0.56 [2 days, 14:22:06<10:27:35] +[titan] 2025-09-10 13:59:06,960 - root - INFO - step: 34260 loss: 2.5655 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.6943 global_avg_top_loss: 1.8712 +[titan] 2025-09-10 13:59:06,960 - root - INFO - lr: 2.9171e-06 gnorm: 0.62 [2 days, 14:22:38<10:27:03] +[titan] 2025-09-10 13:59:38,858 - root - INFO - step: 34265 loss: 2.4433 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.6420 global_avg_top_loss: 1.8013 +[titan] 2025-09-10 13:59:38,858 - root - INFO - lr: 2.9155e-06 gnorm: 0.49 [2 days, 14:23:10<10:26:30] +[titan] 2025-09-10 14:00:10,843 - root - INFO - step: 34270 loss: 2.2998 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.5663 global_avg_top_loss: 1.7335 +[titan] 2025-09-10 14:00:10,843 - root - INFO - lr: 2.9139e-06 gnorm: 0.55 [2 days, 14:23:42<10:25:57] +[titan] 2025-09-10 14:00:42,828 - root - INFO - step: 34275 loss: 2.4180 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.6158 global_avg_top_loss: 1.8022 +[titan] 2025-09-10 14:00:42,828 - root - INFO - lr: 2.9124e-06 gnorm: 0.71 [2 days, 14:24:14<10:25:24] +[titan] 2025-09-10 14:01:14,742 - root - INFO - step: 34280 loss: 2.5321 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.6710 global_avg_top_loss: 1.8611 +[titan] 
2025-09-10 14:01:14,743 - root - INFO - lr: 2.9108e-06 gnorm: 0.51 [2 days, 14:24:46<10:24:51] +[titan] 2025-09-10 14:01:46,524 - root - INFO - step: 34285 loss: 2.4776 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.39 mfu: 49.69% global_avg_ntp_loss: 0.6467 global_avg_top_loss: 1.8309 +[titan] 2025-09-10 14:01:46,524 - root - INFO - lr: 2.9092e-06 gnorm: 0.52 [2 days, 14:25:17<10:24:18] +[titan] 2025-09-10 14:02:18,383 - root - INFO - step: 34290 loss: 2.5800 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.20 mfu: 49.57% global_avg_ntp_loss: 0.7023 global_avg_top_loss: 1.8776 +[titan] 2025-09-10 14:02:18,383 - root - INFO - lr: 2.9077e-06 gnorm: 0.55 [2 days, 14:25:49<10:23:45] +[titan] 2025-09-10 14:02:50,466 - root - INFO - step: 34295 loss: 2.7312 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7985 global_avg_top_loss: 1.9327 +[titan] 2025-09-10 14:02:50,466 - root - INFO - lr: 2.9061e-06 gnorm: 0.54 [2 days, 14:26:21<10:23:12] +[titan] 2025-09-10 14:03:16,112 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:03:22,513 - root - INFO - step: 34300 loss: 2.7862 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.8117 global_avg_top_loss: 1.9745 +[titan] 2025-09-10 14:03:22,513 - root - INFO - lr: 2.9046e-06 gnorm: 0.54 [2 days, 14:26:53<10:22:39] +[titan] 2025-09-10 14:03:48,490 - root - INFO - Dumping profiler traces at step 34304 +[titan] 2025-09-10 14:03:48,559 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 14:03:54,940 - root - INFO - step: 34305 loss: 2.5701 memory: 122.03GiB(87.57%) tps: 10,105 tflops: 481.60 mfu: 48.70% global_avg_ntp_loss: 0.6942 global_avg_top_loss: 1.8759 +[titan] 2025-09-10 14:03:54,941 - root - INFO - lr: 2.9030e-06 gnorm: 0.54 [2 days, 14:27:26<10:22:06] +[titan] 2025-09-10 14:04:26,500 - root - INFO - step: 34310 loss: 3.1634 memory: 122.03GiB(87.57%) tps: 10,383 tflops: 494.85 mfu: 50.04% global_avg_ntp_loss: 1.0156 global_avg_top_loss: 2.1478 +[titan] 2025-09-10 14:04:26,500 - root - INFO - lr: 2.9014e-06 gnorm: 0.58 [2 days, 14:27:57<10:21:33] +[titan] 2025-09-10 14:04:58,395 - root - INFO - step: 34315 loss: 2.5950 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7024 global_avg_top_loss: 1.8926 +[titan] 2025-09-10 14:04:58,395 - root - INFO - lr: 2.8999e-06 gnorm: 0.58 [2 days, 14:28:29<10:21:01] +[titan] 2025-09-10 14:05:30,398 - root - INFO - step: 34320 loss: 2.6343 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7186 global_avg_top_loss: 1.9157 +[titan] 2025-09-10 14:05:30,399 - root - INFO - lr: 2.8983e-06 gnorm: 0.53 [2 days, 14:29:01<10:20:28] +[titan] 2025-09-10 14:06:02,569 - root - INFO - step: 34325 loss: 2.5249 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.46 mfu: 49.09% global_avg_ntp_loss: 0.6736 global_avg_top_loss: 1.8513 +[titan] 2025-09-10 14:06:02,569 - root - INFO - lr: 2.8968e-06 gnorm: 0.52 [2 days, 14:29:33<10:19:55] +[titan] 2025-09-10 
14:06:34,707 - root - INFO - step: 34330 loss: 3.1743 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.93 mfu: 49.13% global_avg_ntp_loss: 1.0188 global_avg_top_loss: 2.1554 +[titan] 2025-09-10 14:06:34,708 - root - INFO - lr: 2.8952e-06 gnorm: 0.53 [2 days, 14:30:06<10:19:22] +[titan] 2025-09-10 14:07:06,749 - root - INFO - step: 34335 loss: 2.5739 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.6952 global_avg_top_loss: 1.8787 +[titan] 2025-09-10 14:07:06,750 - root - INFO - lr: 2.8937e-06 gnorm: 0.59 [2 days, 14:30:38<10:18:49] +[titan] 2025-09-10 14:07:38,604 - root - INFO - step: 34340 loss: 2.5117 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.27 mfu: 49.57% global_avg_ntp_loss: 0.6657 global_avg_top_loss: 1.8460 +[titan] 2025-09-10 14:07:38,604 - root - INFO - lr: 2.8921e-06 gnorm: 0.58 [2 days, 14:31:09<10:18:16] +[titan] 2025-09-10 14:08:10,623 - root - INFO - step: 34345 loss: 2.4509 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.6367 global_avg_top_loss: 1.8142 +[titan] 2025-09-10 14:08:10,623 - root - INFO - lr: 2.8906e-06 gnorm: 0.52 [2 days, 14:31:42<10:17:43] +[titan] 2025-09-10 14:08:36,289 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:08:42,702 - root - INFO - step: 34350 loss: 2.4436 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.84 mfu: 49.23% global_avg_ntp_loss: 0.6329 global_avg_top_loss: 1.8107 +[titan] 2025-09-10 14:08:42,702 - root - INFO - lr: 2.8890e-06 gnorm: 0.56 [2 days, 14:32:14<10:17:10] +[titan] 2025-09-10 14:09:14,726 - root - INFO - step: 34355 loss: 2.4606 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.6344 global_avg_top_loss: 1.8262 +[titan] 2025-09-10 14:09:14,726 - root - INFO - lr: 2.8875e-06 gnorm: 0.73 [2 days, 14:32:46<10:16:37] +[titan] 2025-09-10 14:09:46,558 - root - INFO - step: 34360 loss: 2.5823 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.62 mfu: 49.61% global_avg_ntp_loss: 0.6944 global_avg_top_loss: 1.8879 +[titan] 2025-09-10 14:09:46,558 - root - INFO - lr: 2.8859e-06 gnorm: 0.53 [2 days, 14:33:17<10:16:04] +[titan] 2025-09-10 14:10:18,634 - root - INFO - step: 34365 loss: 2.8468 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.8473 global_avg_top_loss: 1.9994 +[titan] 2025-09-10 14:10:18,634 - root - INFO - lr: 2.8844e-06 gnorm: 0.55 [2 days, 14:33:50<10:15:32] +[titan] 2025-09-10 14:10:50,664 - root - INFO - step: 34370 loss: 2.5834 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.58 mfu: 49.30% global_avg_ntp_loss: 0.6990 global_avg_top_loss: 1.8844 +[titan] 2025-09-10 14:10:50,664 - root - INFO - lr: 2.8828e-06 gnorm: 0.55 [2 days, 14:34:22<10:14:59] +[titan] 2025-09-10 14:11:22,406 - root - INFO - step: 34375 loss: 2.5398 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 492.00 mfu: 49.75% global_avg_ntp_loss: 0.6772 global_avg_top_loss: 1.8626 +[titan] 2025-09-10 14:11:22,407 - root - INFO - lr: 2.8813e-06 gnorm: 0.54 [2 days, 14:34:53<10:14:26] +[titan] 2025-09-10 14:11:54,398 - root - INFO - step: 34380 loss: 2.6110 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7117 global_avg_top_loss: 1.8993 +[titan] 
2025-09-10 14:11:54,398 - root - INFO - lr: 2.8798e-06 gnorm: 0.58 [2 days, 14:35:25<10:13:53] +[titan] 2025-09-10 14:12:26,264 - root - INFO - step: 34385 loss: 2.6278 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.55% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.9062 +[titan] 2025-09-10 14:12:26,264 - root - INFO - lr: 2.8782e-06 gnorm: 2.21 [2 days, 14:35:57<10:13:20] +[titan] 2025-09-10 14:12:58,312 - root - INFO - step: 34390 loss: 3.0921 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.30 mfu: 49.27% global_avg_ntp_loss: 0.9832 global_avg_top_loss: 2.1089 +[titan] 2025-09-10 14:12:58,312 - root - INFO - lr: 2.8767e-06 gnorm: 0.55 [2 days, 14:36:29<10:12:47] +[titan] 2025-09-10 14:13:30,306 - root - INFO - step: 34395 loss: 2.5039 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.6597 global_avg_top_loss: 1.8442 +[titan] 2025-09-10 14:13:30,306 - root - INFO - lr: 2.8752e-06 gnorm: 0.55 [2 days, 14:37:01<10:12:14] +[titan] 2025-09-10 14:13:55,887 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:14:02,220 - root - INFO - step: 34400 loss: 2.5908 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.6973 global_avg_top_loss: 1.8935 +[titan] 2025-09-10 14:14:02,220 - root - INFO - lr: 2.8736e-06 gnorm: 0.54 [2 days, 14:37:33<10:11:41] +[titan] 2025-09-10 14:14:34,002 - root - INFO - step: 34405 loss: 2.5576 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.39 mfu: 49.69% global_avg_ntp_loss: 0.6883 global_avg_top_loss: 1.8693 +[titan] 2025-09-10 14:14:34,002 - root - INFO - lr: 2.8721e-06 gnorm: 0.52 [2 days, 14:38:05<10:11:08] +[titan] 2025-09-10 14:15:06,014 - root - INFO - step: 34410 loss: 2.6473 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7278 global_avg_top_loss: 1.9195 +[titan] 2025-09-10 14:15:06,014 - root - INFO - lr: 2.8706e-06 gnorm: 0.52 [2 days, 14:38:37<10:10:35] +[titan] 2025-09-10 14:15:37,967 - root - INFO - step: 34415 loss: 2.6085 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7096 global_avg_top_loss: 1.8989 +[titan] 2025-09-10 14:15:37,967 - root - INFO - lr: 2.8690e-06 gnorm: 0.58 [2 days, 14:39:09<10:10:03] +[titan] 2025-09-10 14:16:10,063 - root - INFO - step: 34420 loss: 2.5350 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.6750 global_avg_top_loss: 1.8600 +[titan] 2025-09-10 14:16:10,064 - root - INFO - lr: 2.8675e-06 gnorm: 0.66 [2 days, 14:39:41<10:09:30] +[titan] 2025-09-10 14:16:42,096 - root - INFO - step: 34425 loss: 2.4125 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.6189 global_avg_top_loss: 1.7936 +[titan] 2025-09-10 14:16:42,096 - root - INFO - lr: 2.8660e-06 gnorm: 0.52 [2 days, 14:40:13<10:08:57] +[titan] 2025-09-10 14:17:13,715 - root - INFO - step: 34430 loss: 2.4276 memory: 122.03GiB(87.57%) tps: 10,363 tflops: 493.91 mfu: 49.94% global_avg_ntp_loss: 0.6255 global_avg_top_loss: 1.8020 +[titan] 
2025-09-10 14:17:13,715 - root - INFO - lr: 2.8644e-06 gnorm: 0.58 [2 days, 14:40:45<10:08:24] +[titan] 2025-09-10 14:17:45,777 - root - INFO - step: 34435 loss: 2.4217 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.6169 global_avg_top_loss: 1.8047 +[titan] 2025-09-10 14:17:45,777 - root - INFO - lr: 2.8629e-06 gnorm: 0.66 [2 days, 14:41:17<10:07:51] +[titan] 2025-09-10 14:18:17,888 - root - INFO - step: 34440 loss: 2.4821 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.6513 global_avg_top_loss: 1.8308 +[titan] 2025-09-10 14:18:17,888 - root - INFO - lr: 2.8614e-06 gnorm: 0.52 [2 days, 14:41:49<10:07:18] +[titan] 2025-09-10 14:18:49,899 - root - INFO - step: 34445 loss: 2.3935 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.6114 global_avg_top_loss: 1.7821 +[titan] 2025-09-10 14:18:49,899 - root - INFO - lr: 2.8599e-06 gnorm: 0.53 [2 days, 14:42:21<10:06:45] +[titan] 2025-09-10 14:19:15,392 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:19:21,858 - root - INFO - step: 34450 loss: 2.4758 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.6532 global_avg_top_loss: 1.8226 +[titan] 2025-09-10 14:19:21,859 - root - INFO - lr: 2.8583e-06 gnorm: 0.62 [2 days, 14:42:53<10:06:12] +[titan] 2025-09-10 14:19:53,914 - root - INFO - step: 34455 loss: 2.8263 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.8372 global_avg_top_loss: 1.9891 +[titan] 2025-09-10 14:19:53,914 - root - INFO - lr: 2.8568e-06 gnorm: 0.55 [2 days, 14:43:25<10:05:39] +[titan] 2025-09-10 14:20:25,954 - root - INFO - step: 34460 loss: 2.7884 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.8074 global_avg_top_loss: 1.9810 +[titan] 2025-09-10 14:20:25,954 - root - INFO - lr: 2.8553e-06 gnorm: 0.56 [2 days, 14:43:57<10:05:06] +[titan] 2025-09-10 14:20:57,868 - root - INFO - step: 34465 loss: 2.6186 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.7159 global_avg_top_loss: 1.9028 +[titan] 2025-09-10 14:20:57,868 - root - INFO - lr: 2.8538e-06 gnorm: 0.60 [2 days, 14:44:29<10:04:34] +[titan] 2025-09-10 14:21:29,719 - root - INFO - step: 34470 loss: 3.1716 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.33 mfu: 49.58% global_avg_ntp_loss: 1.0231 global_avg_top_loss: 2.1486 +[titan] 2025-09-10 14:21:29,719 - root - INFO - lr: 2.8523e-06 gnorm: 0.62 [2 days, 14:45:01<10:04:01] +[titan] 2025-09-10 14:22:01,519 - root - INFO - step: 34475 loss: 2.5567 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.6845 global_avg_top_loss: 1.8722 +[titan] 2025-09-10 14:22:01,519 - root - INFO - lr: 2.8507e-06 gnorm: 0.60 [2 days, 14:45:32<10:03:28] +[titan] 2025-09-10 14:22:33,411 - root - INFO - step: 34480 loss: 2.6201 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.7080 global_avg_top_loss: 1.9122 +[titan] 
2025-09-10 14:22:33,412 - root - INFO - lr: 2.8492e-06 gnorm: 0.59 [2 days, 14:46:04<10:02:55] +[titan] 2025-09-10 14:23:05,531 - root - INFO - step: 34485 loss: 2.5392 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.21 mfu: 49.16% global_avg_ntp_loss: 0.6735 global_avg_top_loss: 1.8657 +[titan] 2025-09-10 14:23:05,532 - root - INFO - lr: 2.8477e-06 gnorm: 0.54 [2 days, 14:46:36<10:02:22] +[titan] 2025-09-10 14:23:37,479 - root - INFO - step: 34490 loss: 3.1119 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.9950 global_avg_top_loss: 2.1169 +[titan] 2025-09-10 14:23:37,479 - root - INFO - lr: 2.8462e-06 gnorm: 0.48 [2 days, 14:47:08<10:01:49] +[titan] 2025-09-10 14:24:09,419 - root - INFO - step: 34495 loss: 2.5740 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.6946 global_avg_top_loss: 1.8794 +[titan] 2025-09-10 14:24:09,419 - root - INFO - lr: 2.8447e-06 gnorm: 0.58 [2 days, 14:47:40<10:01:16] +[titan] 2025-09-10 14:24:34,978 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:24:41,410 - root - INFO - step: 34500 loss: 2.4772 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.6474 global_avg_top_loss: 1.8298 +[titan] 2025-09-10 14:24:41,411 - root - INFO - lr: 2.8432e-06 gnorm: 0.66 [2 days, 14:48:12<10:00:43] +[titan] 2025-09-10 14:25:13,412 - root - INFO - step: 34505 loss: 2.4304 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.34% global_avg_ntp_loss: 0.6268 global_avg_top_loss: 1.8036 +[titan] 2025-09-10 14:25:13,412 - root - INFO - lr: 2.8417e-06 gnorm: 0.51 [2 days, 14:48:44<10:00:10] +[titan] 2025-09-10 14:25:45,526 - root - INFO - step: 34510 loss: 2.3789 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 0.6003 global_avg_top_loss: 1.7785 +[titan] 2025-09-10 14:25:45,526 - root - INFO - lr: 2.8402e-06 gnorm: 0.57 [2 days, 14:49:16< 9:59:38] +[titan] 2025-09-10 14:26:17,347 - root - INFO - step: 34515 loss: 2.5025 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.79 mfu: 49.62% global_avg_ntp_loss: 0.6592 global_avg_top_loss: 1.8433 +[titan] 2025-09-10 14:26:17,347 - root - INFO - lr: 2.8387e-06 gnorm: 0.75 [2 days, 14:49:48< 9:59:05] +[titan] 2025-09-10 14:26:49,524 - root - INFO - step: 34520 loss: 2.9686 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.35 mfu: 49.07% global_avg_ntp_loss: 0.9250 global_avg_top_loss: 2.0436 +[titan] 2025-09-10 14:26:49,524 - root - INFO - lr: 2.8372e-06 gnorm: 0.51 [2 days, 14:50:20< 9:58:32] +[titan] 2025-09-10 14:27:21,330 - root - INFO - step: 34525 loss: 2.4785 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.6471 global_avg_top_loss: 1.8313 +[titan] 2025-09-10 14:27:21,330 - root - INFO - lr: 2.8357e-06 gnorm: 0.54 [2 days, 14:50:52< 9:57:59] +[titan] 2025-09-10 14:27:53,403 - root - INFO - step: 34530 loss: 2.5169 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.6668 global_avg_top_loss: 1.8502 +[titan] 
2025-09-10 14:27:53,404 - root - INFO - lr: 2.8342e-06 gnorm: 0.59 [2 days, 14:51:24< 9:57:26] +[titan] 2025-09-10 14:28:25,341 - root - INFO - step: 34535 loss: 2.5763 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.6924 global_avg_top_loss: 1.8839 +[titan] 2025-09-10 14:28:25,341 - root - INFO - lr: 2.8327e-06 gnorm: 0.56 [2 days, 14:51:56< 9:56:53] +[titan] 2025-09-10 14:28:57,235 - root - INFO - step: 34540 loss: 2.5782 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.6943 global_avg_top_loss: 1.8839 +[titan] 2025-09-10 14:28:57,235 - root - INFO - lr: 2.8312e-06 gnorm: 0.55 [2 days, 14:52:28< 9:56:20] +[titan] 2025-09-10 14:29:29,244 - root - INFO - step: 34545 loss: 2.6811 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9389 +[titan] 2025-09-10 14:29:29,244 - root - INFO - lr: 2.8297e-06 gnorm: 0.57 [2 days, 14:53:00< 9:55:47] +[titan] 2025-09-10 14:29:54,643 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:30:01,050 - root - INFO - step: 34550 loss: 2.6447 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.7276 global_avg_top_loss: 1.9171 +[titan] 2025-09-10 14:30:01,050 - root - INFO - lr: 2.8282e-06 gnorm: 0.68 [2 days, 14:53:32< 9:55:14] +[titan] 2025-09-10 14:30:33,290 - root - INFO - step: 34555 loss: 2.4607 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.40 mfu: 48.98% global_avg_ntp_loss: 0.6390 global_avg_top_loss: 1.8217 +[titan] 2025-09-10 14:30:33,290 - root - INFO - lr: 2.8267e-06 gnorm: 0.54 [2 days, 14:54:04< 9:54:41] +[titan] 2025-09-10 14:31:05,259 - root - INFO - step: 34560 loss: 2.5483 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.6800 global_avg_top_loss: 1.8683 +[titan] 2025-09-10 14:31:05,259 - root - INFO - lr: 2.8252e-06 gnorm: 0.52 [2 days, 14:54:36< 9:54:09] +[titan] 2025-09-10 14:31:37,021 - root - INFO - step: 34565 loss: 2.5002 memory: 122.03GiB(87.57%) tps: 10,317 tflops: 491.69 mfu: 49.72% global_avg_ntp_loss: 0.6595 global_avg_top_loss: 1.8408 +[titan] 2025-09-10 14:31:37,022 - root - INFO - lr: 2.8237e-06 gnorm: 0.54 [2 days, 14:55:08< 9:53:36] +[titan] 2025-09-10 14:32:09,007 - root - INFO - step: 34570 loss: 2.5493 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.6819 global_avg_top_loss: 1.8674 +[titan] 2025-09-10 14:32:09,008 - root - INFO - lr: 2.8222e-06 gnorm: 0.53 [2 days, 14:55:40< 9:53:03] +[titan] 2025-09-10 14:32:41,212 - root - INFO - step: 34575 loss: 2.6398 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.7261 global_avg_top_loss: 1.9137 +[titan] 2025-09-10 14:32:41,212 - root - INFO - lr: 2.8207e-06 gnorm: 0.64 [2 days, 14:56:12< 9:52:30] +[titan] 2025-09-10 14:33:12,978 - root - INFO - step: 34580 loss: 2.5474 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.62 mfu: 49.71% global_avg_ntp_loss: 0.6805 global_avg_top_loss: 1.8669 +[titan] 
2025-09-10 14:33:12,979 - root - INFO - lr: 2.8192e-06 gnorm: 0.62 [2 days, 14:56:44< 9:51:57] +[titan] 2025-09-10 14:33:44,810 - root - INFO - step: 34585 loss: 2.4114 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.6183 global_avg_top_loss: 1.7931 +[titan] 2025-09-10 14:33:44,811 - root - INFO - lr: 2.8177e-06 gnorm: 0.51 [2 days, 14:57:16< 9:51:24] +[titan] 2025-09-10 14:34:16,797 - root - INFO - step: 34590 loss: 2.4644 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.6407 global_avg_top_loss: 1.8237 +[titan] 2025-09-10 14:34:16,797 - root - INFO - lr: 2.8162e-06 gnorm: 0.63 [2 days, 14:57:48< 9:50:51] +[titan] 2025-09-10 14:34:48,819 - root - INFO - step: 34595 loss: 2.4558 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.6353 global_avg_top_loss: 1.8206 +[titan] 2025-09-10 14:34:48,819 - root - INFO - lr: 2.8148e-06 gnorm: 0.69 [2 days, 14:58:20< 9:50:18] +[titan] 2025-09-10 14:35:14,292 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:35:20,695 - root - INFO - step: 34600 loss: 2.4386 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.6320 global_avg_top_loss: 1.8065 +[titan] 2025-09-10 14:35:20,696 - root - INFO - lr: 2.8133e-06 gnorm: 0.52 [2 days, 14:58:52< 9:49:45] +[titan] 2025-09-10 14:35:52,613 - root - INFO - step: 34605 loss: 2.4964 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.6574 global_avg_top_loss: 1.8390 +[titan] 2025-09-10 14:35:52,613 - root - INFO - lr: 2.8118e-06 gnorm: 0.57 [2 days, 14:59:23< 9:49:13] +[titan] 2025-09-10 14:36:24,517 - root - INFO - step: 34610 loss: 2.5159 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.6654 global_avg_top_loss: 1.8505 +[titan] 2025-09-10 14:36:24,517 - root - INFO - lr: 2.8103e-06 gnorm: 0.58 [2 days, 14:59:55< 9:48:40] +[titan] 2025-09-10 14:36:56,623 - root - INFO - step: 34615 loss: 2.5451 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 0.6756 global_avg_top_loss: 1.8695 +[titan] 2025-09-10 14:36:56,623 - root - INFO - lr: 2.8088e-06 gnorm: 0.57 [2 days, 15:00:27< 9:48:07] +[titan] 2025-09-10 14:37:28,528 - root - INFO - step: 34620 loss: 2.5849 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.6973 global_avg_top_loss: 1.8876 +[titan] 2025-09-10 14:37:28,528 - root - INFO - lr: 2.8074e-06 gnorm: 0.56 [2 days, 15:00:59< 9:47:34] +[titan] 2025-09-10 14:38:00,418 - root - INFO - step: 34625 loss: 2.5190 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.6685 global_avg_top_loss: 1.8505 +[titan] 2025-09-10 14:38:00,418 - root - INFO - lr: 2.8059e-06 gnorm: 0.58 [2 days, 15:01:31< 9:47:01] +[titan] 2025-09-10 14:38:32,469 - root - INFO - step: 34630 loss: 2.7673 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.8051 global_avg_top_loss: 1.9621 +[titan] 
2025-09-10 14:38:32,469 - root - INFO - lr: 2.8044e-06 gnorm: 0.65 [2 days, 15:02:03< 9:46:28] +[titan] 2025-09-10 14:39:04,300 - root - INFO - step: 34635 loss: 2.5633 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.62 mfu: 49.61% global_avg_ntp_loss: 0.6885 global_avg_top_loss: 1.8748 +[titan] 2025-09-10 14:39:04,301 - root - INFO - lr: 2.8029e-06 gnorm: 0.57 [2 days, 15:02:35< 9:45:55] +[titan] 2025-09-10 14:39:36,211 - root - INFO - step: 34640 loss: 2.5023 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.48% global_avg_ntp_loss: 0.6584 global_avg_top_loss: 1.8438 +[titan] 2025-09-10 14:39:36,211 - root - INFO - lr: 2.8014e-06 gnorm: 0.50 [2 days, 15:03:07< 9:45:22] +[titan] 2025-09-10 14:40:08,155 - root - INFO - step: 34645 loss: 2.4809 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.6497 global_avg_top_loss: 1.8312 +[titan] 2025-09-10 14:40:08,156 - root - INFO - lr: 2.8000e-06 gnorm: 0.52 [2 days, 15:03:39< 9:44:49] +[titan] 2025-09-10 14:40:33,703 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:40:40,058 - root - INFO - step: 34650 loss: 2.6049 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7100 global_avg_top_loss: 1.8949 +[titan] 2025-09-10 14:40:40,059 - root - INFO - lr: 2.7985e-06 gnorm: 0.52 [2 days, 15:04:11< 9:44:17] +[titan] 2025-09-10 14:41:11,966 - root - INFO - step: 34655 loss: 2.6098 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7097 global_avg_top_loss: 1.9002 +[titan] 2025-09-10 14:41:11,966 - root - INFO - lr: 2.7970e-06 gnorm: 0.60 [2 days, 15:04:43< 9:43:44] +[titan] 2025-09-10 14:41:43,777 - root - INFO - step: 34660 loss: 2.5190 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.94 mfu: 49.64% global_avg_ntp_loss: 0.6697 global_avg_top_loss: 1.8493 +[titan] 2025-09-10 14:41:43,777 - root - INFO - lr: 2.7956e-06 gnorm: 0.64 [2 days, 15:05:15< 9:43:11] +[titan] 2025-09-10 14:42:15,656 - root - INFO - step: 34665 loss: 2.4013 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.89 mfu: 49.53% global_avg_ntp_loss: 0.6221 global_avg_top_loss: 1.7792 +[titan] 2025-09-10 14:42:15,657 - root - INFO - lr: 2.7941e-06 gnorm: 0.51 [2 days, 15:05:47< 9:42:38] +[titan] 2025-09-10 14:42:47,624 - root - INFO - step: 34670 loss: 2.4832 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.6531 global_avg_top_loss: 1.8302 +[titan] 2025-09-10 14:42:47,624 - root - INFO - lr: 2.7926e-06 gnorm: 0.56 [2 days, 15:06:18< 9:42:05] +[titan] 2025-09-10 14:43:19,570 - root - INFO - step: 34675 loss: 2.4153 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6146 global_avg_top_loss: 1.8007 +[titan] 2025-09-10 14:43:19,571 - root - INFO - lr: 2.7912e-06 gnorm: 0.73 [2 days, 15:06:50< 9:41:32] +[titan] 2025-09-10 14:43:51,546 - root - INFO - step: 34680 loss: 2.5884 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7005 global_avg_top_loss: 1.8879 +[titan] 
2025-09-10 14:43:51,546 - root - INFO - lr: 2.7897e-06 gnorm: 0.51 [2 days, 15:07:22< 9:40:59] +[titan] 2025-09-10 14:44:23,565 - root - INFO - step: 34685 loss: 2.5077 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8461 +[titan] 2025-09-10 14:44:23,565 - root - INFO - lr: 2.7882e-06 gnorm: 0.55 [2 days, 15:07:54< 9:40:26] +[titan] 2025-09-10 14:44:55,266 - root - INFO - step: 34690 loss: 2.5019 memory: 122.03GiB(87.57%) tps: 10,337 tflops: 492.65 mfu: 49.81% global_avg_ntp_loss: 0.6578 global_avg_top_loss: 1.8441 +[titan] 2025-09-10 14:44:55,266 - root - INFO - lr: 2.7868e-06 gnorm: 0.58 [2 days, 15:08:26< 9:39:53] +[titan] 2025-09-10 14:45:27,030 - root - INFO - step: 34695 loss: 2.4007 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.66 mfu: 49.71% global_avg_ntp_loss: 0.6139 global_avg_top_loss: 1.7868 +[titan] 2025-09-10 14:45:27,030 - root - INFO - lr: 2.7853e-06 gnorm: 0.57 [2 days, 15:08:58< 9:39:20] +[titan] 2025-09-10 14:45:52,476 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:45:58,894 - root - INFO - step: 34700 loss: 2.5208 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.6694 global_avg_top_loss: 1.8514 +[titan] 2025-09-10 14:45:58,895 - root - INFO - lr: 2.7839e-06 gnorm: 0.60 [2 days, 15:09:30< 9:38:48] +[titan] 2025-09-10 14:46:31,035 - root - INFO - step: 34705 loss: 2.5418 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.6829 global_avg_top_loss: 1.8589 +[titan] 2025-09-10 14:46:31,036 - root - INFO - lr: 2.7824e-06 gnorm: 0.55 [2 days, 15:10:02< 9:38:15] +[titan] 2025-09-10 14:47:02,867 - root - INFO - step: 34710 loss: 2.5663 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.63 mfu: 49.61% global_avg_ntp_loss: 0.6915 global_avg_top_loss: 1.8748 +[titan] 2025-09-10 14:47:02,867 - root - INFO - lr: 2.7810e-06 gnorm: 0.59 [2 days, 15:10:34< 9:37:42] +[titan] 2025-09-10 14:47:34,886 - root - INFO - step: 34715 loss: 2.5771 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6958 global_avg_top_loss: 1.8813 +[titan] 2025-09-10 14:47:34,886 - root - INFO - lr: 2.7795e-06 gnorm: 0.53 [2 days, 15:11:06< 9:37:09] +[titan] 2025-09-10 14:48:06,904 - root - INFO - step: 34720 loss: 2.5163 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6678 global_avg_top_loss: 1.8485 +[titan] 2025-09-10 14:48:06,904 - root - INFO - lr: 2.7781e-06 gnorm: 0.54 [2 days, 15:11:38< 9:36:36] +[titan] 2025-09-10 14:48:38,922 - root - INFO - step: 34725 loss: 2.5033 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.6584 global_avg_top_loss: 1.8449 +[titan] 2025-09-10 14:48:38,923 - root - INFO - lr: 2.7766e-06 gnorm: 0.55 [2 days, 15:12:10< 9:36:03] +[titan] 2025-09-10 14:49:10,809 - root - INFO - step: 34730 loss: 2.6614 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9270 +[titan] 
2025-09-10 14:49:10,810 - root - INFO - lr: 2.7752e-06 gnorm: 0.53 [2 days, 15:12:42< 9:35:30] +[titan] 2025-09-10 14:49:42,820 - root - INFO - step: 34735 loss: 2.5226 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.6731 global_avg_top_loss: 1.8496 +[titan] 2025-09-10 14:49:42,820 - root - INFO - lr: 2.7737e-06 gnorm: 0.60 [2 days, 15:13:14< 9:34:57] +[titan] 2025-09-10 14:50:14,808 - root - INFO - step: 34740 loss: 2.5417 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.6809 global_avg_top_loss: 1.8608 +[titan] 2025-09-10 14:50:14,809 - root - INFO - lr: 2.7723e-06 gnorm: 0.65 [2 days, 15:13:46< 9:34:24] +[titan] 2025-09-10 14:50:46,656 - root - INFO - step: 34745 loss: 2.4398 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.6379 global_avg_top_loss: 1.8019 +[titan] 2025-09-10 14:50:46,657 - root - INFO - lr: 2.7708e-06 gnorm: 0.51 [2 days, 15:14:17< 9:33:52] +[titan] 2025-09-10 14:51:12,198 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:51:18,643 - root - INFO - step: 34750 loss: 2.3638 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.5984 global_avg_top_loss: 1.7654 +[titan] 2025-09-10 14:51:18,644 - root - INFO - lr: 2.7694e-06 gnorm: 0.57 [2 days, 15:14:49< 9:33:19] +[titan] 2025-09-10 14:51:50,492 - root - INFO - step: 34755 loss: 2.3831 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.5986 global_avg_top_loss: 1.7845 +[titan] 2025-09-10 14:51:50,493 - root - INFO - lr: 2.7679e-06 gnorm: 0.66 [2 days, 15:15:21< 9:32:46] +[titan] 2025-09-10 14:52:22,659 - root - INFO - step: 34760 loss: 2.5940 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.7005 global_avg_top_loss: 1.8935 +[titan] 2025-09-10 14:52:22,660 - root - INFO - lr: 2.7665e-06 gnorm: 0.54 [2 days, 15:15:54< 9:32:13] +[titan] 2025-09-10 14:52:54,581 - root - INFO - step: 34765 loss: 2.4574 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.6361 global_avg_top_loss: 1.8213 +[titan] 2025-09-10 14:52:54,581 - root - INFO - lr: 2.7650e-06 gnorm: 0.56 [2 days, 15:16:25< 9:31:40] +[titan] 2025-09-10 14:53:26,728 - root - INFO - step: 34770 loss: 2.5932 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.7015 global_avg_top_loss: 1.8917 +[titan] 2025-09-10 14:53:26,729 - root - INFO - lr: 2.7636e-06 gnorm: 0.54 [2 days, 15:16:58< 9:31:07] +[titan] 2025-09-10 14:53:58,723 - root - INFO - step: 34775 loss: 2.9990 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.9339 global_avg_top_loss: 2.0650 +[titan] 2025-09-10 14:53:58,723 - root - INFO - lr: 2.7622e-06 gnorm: 0.59 [2 days, 15:17:30< 9:30:34] +[titan] 2025-09-10 14:54:30,696 - root - INFO - step: 34780 loss: 2.6879 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9214 +[titan] 
2025-09-10 14:54:30,696 - root - INFO - lr: 2.7607e-06 gnorm: 0.53 [2 days, 15:18:02< 9:30:01] +[titan] 2025-09-10 14:55:02,498 - root - INFO - step: 34785 loss: 2.5689 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.07 mfu: 49.65% global_avg_ntp_loss: 0.6909 global_avg_top_loss: 1.8780 +[titan] 2025-09-10 14:55:02,498 - root - INFO - lr: 2.7593e-06 gnorm: 0.55 [2 days, 15:18:33< 9:29:29] +[titan] 2025-09-10 14:55:34,380 - root - INFO - step: 34790 loss: 2.6000 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.85 mfu: 49.53% global_avg_ntp_loss: 0.7119 global_avg_top_loss: 1.8881 +[titan] 2025-09-10 14:55:34,380 - root - INFO - lr: 2.7579e-06 gnorm: 0.60 [2 days, 15:19:05< 9:28:56] +[titan] 2025-09-10 14:56:06,271 - root - INFO - step: 34795 loss: 2.5313 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.6748 global_avg_top_loss: 1.8565 +[titan] 2025-09-10 14:56:06,271 - root - INFO - lr: 2.7564e-06 gnorm: 0.55 [2 days, 15:19:37< 9:28:23] +[titan] 2025-09-10 14:56:31,746 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 14:56:38,049 - root - INFO - step: 34800 loss: 2.5982 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.44 mfu: 49.69% global_avg_ntp_loss: 0.7091 global_avg_top_loss: 1.8891 +[titan] 2025-09-10 14:56:38,050 - root - INFO - lr: 2.7550e-06 gnorm: 0.56 [2 days, 15:20:09< 9:27:50] +[titan] 2025-09-10 14:57:09,972 - root - INFO - step: 34805 loss: 2.6903 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7542 global_avg_top_loss: 1.9361 +[titan] 2025-09-10 14:57:09,972 - root - INFO - lr: 2.7536e-06 gnorm: 0.54 [2 days, 15:20:41< 9:27:17] +[titan] 2025-09-10 14:57:41,955 - root - INFO - step: 34810 loss: 2.5752 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.6921 global_avg_top_loss: 1.8831 +[titan] 2025-09-10 14:57:41,956 - root - INFO - lr: 2.7521e-06 gnorm: 0.54 [2 days, 15:21:13< 9:26:44] +[titan] 2025-09-10 14:58:14,176 - root - INFO - step: 34815 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.70 mfu: 49.01% global_avg_ntp_loss: 0.7010 global_avg_top_loss: 1.8895 +[titan] 2025-09-10 14:58:14,176 - root - INFO - lr: 2.7507e-06 gnorm: 0.61 [2 days, 15:21:45< 9:26:11] +[titan] 2025-09-10 14:58:20,885 - root - INFO - Dumping profiler traces at step 34816 +[titan] 2025-09-10 14:58:20,953 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 14:58:46,426 - root - INFO - step: 34820 loss: 2.5828 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.24 mfu: 48.96% global_avg_ntp_loss: 0.6975 global_avg_top_loss: 1.8853 +[titan] 2025-09-10 14:58:46,427 - root - INFO - lr: 2.7493e-06 gnorm: 0.62 [2 days, 15:22:17< 9:25:38] +[titan] 2025-09-10 14:59:18,211 - root - INFO - step: 34825 loss: 2.7279 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.35 mfu: 49.68% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9451 +[titan] 2025-09-10 14:59:18,211 - root - INFO - lr: 2.7478e-06 gnorm: 0.52 [2 days, 15:22:49< 9:25:06] +[titan] 2025-09-10 
14:59:50,359 - root - INFO - step: 34830 loss: 2.4522 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.6332 global_avg_top_loss: 1.8190 +[titan] 2025-09-10 14:59:50,359 - root - INFO - lr: 2.7464e-06 gnorm: 0.63 [2 days, 15:23:21< 9:24:33] +[titan] 2025-09-10 15:00:22,331 - root - INFO - step: 34835 loss: 2.4028 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.6091 global_avg_top_loss: 1.7937 +[titan] 2025-09-10 15:00:22,331 - root - INFO - lr: 2.7450e-06 gnorm: 0.65 [2 days, 15:23:53< 9:24:00] +[titan] 2025-09-10 15:00:54,564 - root - INFO - step: 34840 loss: 2.6249 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.51 mfu: 48.99% global_avg_ntp_loss: 0.7161 global_avg_top_loss: 1.9088 +[titan] 2025-09-10 15:00:54,564 - root - INFO - lr: 2.7436e-06 gnorm: 0.54 [2 days, 15:24:25< 9:23:27] +[titan] 2025-09-10 15:01:26,577 - root - INFO - step: 34845 loss: 2.3706 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.5969 global_avg_top_loss: 1.7737 +[titan] 2025-09-10 15:01:26,577 - root - INFO - lr: 2.7422e-06 gnorm: 0.59 [2 days, 15:24:57< 9:22:54] +[titan] 2025-09-10 15:01:52,132 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:01:58,557 - root - INFO - step: 34850 loss: 2.5241 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6677 global_avg_top_loss: 1.8564 +[titan] 2025-09-10 15:01:58,558 - root - INFO - lr: 2.7407e-06 gnorm: 0.56 [2 days, 15:25:29< 9:22:21] +[titan] 2025-09-10 15:02:30,446 - root - INFO - step: 34855 loss: 2.9628 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 0.9201 global_avg_top_loss: 2.0427 +[titan] 2025-09-10 15:02:30,446 - root - INFO - lr: 2.7393e-06 gnorm: 0.57 [2 days, 15:26:01< 9:21:48] +[titan] 2025-09-10 15:03:02,473 - root - INFO - step: 34860 loss: 2.5276 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.6757 global_avg_top_loss: 1.8519 +[titan] 2025-09-10 15:03:02,473 - root - INFO - lr: 2.7379e-06 gnorm: 0.54 [2 days, 15:26:33< 9:21:15] +[titan] 2025-09-10 15:03:34,377 - root - INFO - step: 34865 loss: 2.6033 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7111 global_avg_top_loss: 1.8922 +[titan] 2025-09-10 15:03:34,378 - root - INFO - lr: 2.7365e-06 gnorm: 0.58 [2 days, 15:27:05< 9:20:43] +[titan] 2025-09-10 15:04:06,459 - root - INFO - step: 34870 loss: 2.6537 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7336 global_avg_top_loss: 1.9201 +[titan] 2025-09-10 15:04:06,459 - root - INFO - lr: 2.7351e-06 gnorm: 0.63 [2 days, 15:27:37< 9:20:10] +[titan] 2025-09-10 15:04:38,529 - root - INFO - step: 34875 loss: 2.4648 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.6435 global_avg_top_loss: 1.8213 +[titan] 2025-09-10 15:04:38,529 - root - INFO - lr: 2.7337e-06 gnorm: 0.52 [2 days, 15:28:09< 9:19:37] +[titan] 2025-09-10 15:05:10,468 - root - INFO - step: 34880 loss: 2.5155 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.6654 global_avg_top_loss: 1.8501 +[titan] 
2025-09-10 15:05:10,468 - root - INFO - lr: 2.7323e-06 gnorm: 0.56 [2 days, 15:28:41< 9:19:04] +[titan] 2025-09-10 15:05:42,370 - root - INFO - step: 34885 loss: 2.5577 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.6868 global_avg_top_loss: 1.8709 +[titan] 2025-09-10 15:05:42,370 - root - INFO - lr: 2.7308e-06 gnorm: 0.60 [2 days, 15:29:13< 9:18:31] +[titan] 2025-09-10 15:06:14,360 - root - INFO - step: 34890 loss: 2.5833 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.6974 global_avg_top_loss: 1.8859 +[titan] 2025-09-10 15:06:14,360 - root - INFO - lr: 2.7294e-06 gnorm: 0.52 [2 days, 15:29:45< 9:17:58] +[titan] 2025-09-10 15:06:46,331 - root - INFO - step: 34895 loss: 2.5437 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.6821 global_avg_top_loss: 1.8616 +[titan] 2025-09-10 15:06:46,331 - root - INFO - lr: 2.7280e-06 gnorm: 0.68 [2 days, 15:30:17< 9:17:25] +[titan] 2025-09-10 15:07:11,786 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:07:18,261 - root - INFO - step: 34900 loss: 2.5313 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.6766 global_avg_top_loss: 1.8547 +[titan] 2025-09-10 15:07:18,262 - root - INFO - lr: 2.7266e-06 gnorm: 0.63 [2 days, 15:30:49< 9:16:52] +[titan] 2025-09-10 15:07:50,230 - root - INFO - step: 34905 loss: 2.3828 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.6059 global_avg_top_loss: 1.7769 +[titan] 2025-09-10 15:07:50,231 - root - INFO - lr: 2.7252e-06 gnorm: 0.51 [2 days, 15:31:21< 9:16:20] +[titan] 2025-09-10 15:08:22,154 - root - INFO - step: 34910 loss: 2.4222 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 0.6249 global_avg_top_loss: 1.7973 +[titan] 2025-09-10 15:08:22,154 - root - INFO - lr: 2.7238e-06 gnorm: 0.61 [2 days, 15:31:53< 9:15:47] +[titan] 2025-09-10 15:08:53,986 - root - INFO - step: 34915 loss: 2.4337 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.62 mfu: 49.61% global_avg_ntp_loss: 0.6233 global_avg_top_loss: 1.8104 +[titan] 2025-09-10 15:08:53,986 - root - INFO - lr: 2.7224e-06 gnorm: 0.68 [2 days, 15:32:25< 9:15:14] +[titan] 2025-09-10 15:09:26,017 - root - INFO - step: 34920 loss: 2.4636 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.56 mfu: 49.30% global_avg_ntp_loss: 0.6384 global_avg_top_loss: 1.8252 +[titan] 2025-09-10 15:09:26,018 - root - INFO - lr: 2.7210e-06 gnorm: 0.54 [2 days, 15:32:57< 9:14:41] +[titan] 2025-09-10 15:09:57,909 - root - INFO - step: 34925 loss: 2.4843 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.6473 global_avg_top_loss: 1.8371 +[titan] 2025-09-10 15:09:57,910 - root - INFO - lr: 2.7196e-06 gnorm: 0.57 [2 days, 15:33:29< 9:14:08] +[titan] 2025-09-10 15:10:29,966 - root - INFO - step: 34930 loss: 2.4406 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6347 global_avg_top_loss: 1.8059 +[titan] 
2025-09-10 15:10:29,966 - root - INFO - lr: 2.7182e-06 gnorm: 0.71 [2 days, 15:34:01< 9:13:35] +[titan] 2025-09-10 15:11:01,766 - root - INFO - step: 34935 loss: 2.9065 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.8933 global_avg_top_loss: 2.0133 +[titan] 2025-09-10 15:11:01,766 - root - INFO - lr: 2.7168e-06 gnorm: 0.57 [2 days, 15:34:33< 9:13:02] +[titan] 2025-09-10 15:11:33,675 - root - INFO - step: 34940 loss: 2.6125 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7113 global_avg_top_loss: 1.9012 +[titan] 2025-09-10 15:11:33,675 - root - INFO - lr: 2.7154e-06 gnorm: 0.56 [2 days, 15:35:04< 9:12:29] +[titan] 2025-09-10 15:12:05,673 - root - INFO - step: 34945 loss: 2.6841 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7469 global_avg_top_loss: 1.9372 +[titan] 2025-09-10 15:12:05,673 - root - INFO - lr: 2.7140e-06 gnorm: 0.55 [2 days, 15:35:36< 9:11:57] +[titan] 2025-09-10 15:12:31,266 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:12:37,572 - root - INFO - step: 34950 loss: 2.5463 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.6850 global_avg_top_loss: 1.8612 +[titan] 2025-09-10 15:12:37,572 - root - INFO - lr: 2.7126e-06 gnorm: 0.61 [2 days, 15:36:08< 9:11:24] +[titan] 2025-09-10 15:13:09,450 - root - INFO - step: 34955 loss: 2.4675 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.89 mfu: 49.53% global_avg_ntp_loss: 0.6446 global_avg_top_loss: 1.8230 +[titan] 2025-09-10 15:13:09,451 - root - INFO - lr: 2.7112e-06 gnorm: 0.59 [2 days, 15:36:40< 9:10:51] +[titan] 2025-09-10 15:13:41,532 - root - INFO - step: 34960 loss: 2.5341 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.6720 global_avg_top_loss: 1.8621 +[titan] 2025-09-10 15:13:41,532 - root - INFO - lr: 2.7099e-06 gnorm: 0.53 [2 days, 15:37:12< 9:10:18] +[titan] 2025-09-10 15:14:13,318 - root - INFO - step: 34965 loss: 2.4948 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.32 mfu: 49.68% global_avg_ntp_loss: 0.6591 global_avg_top_loss: 1.8357 +[titan] 2025-09-10 15:14:13,319 - root - INFO - lr: 2.7085e-06 gnorm: 0.56 [2 days, 15:37:44< 9:09:45] +[titan] 2025-09-10 15:14:45,220 - root - INFO - step: 34970 loss: 2.6011 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7094 global_avg_top_loss: 1.8917 +[titan] 2025-09-10 15:14:45,220 - root - INFO - lr: 2.7071e-06 gnorm: 0.54 [2 days, 15:38:16< 9:09:12] +[titan] 2025-09-10 15:15:17,180 - root - INFO - step: 34975 loss: 2.5225 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.6811 global_avg_top_loss: 1.8414 +[titan] 2025-09-10 15:15:17,180 - root - INFO - lr: 2.7057e-06 gnorm: 0.70 [2 days, 15:38:48< 9:08:39] +[titan] 2025-09-10 15:15:49,057 - root - INFO - step: 34980 loss: 2.5751 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.6908 global_avg_top_loss: 1.8842 +[titan] 
2025-09-10 15:15:49,057 - root - INFO - lr: 2.7043e-06 gnorm: 0.64 [2 days, 15:39:20< 9:08:06] +[titan] 2025-09-10 15:16:21,069 - root - INFO - step: 34985 loss: 2.4006 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.6116 global_avg_top_loss: 1.7891 +[titan] 2025-09-10 15:16:21,070 - root - INFO - lr: 2.7029e-06 gnorm: 0.53 [2 days, 15:39:52< 9:07:34] +[titan] 2025-09-10 15:16:52,917 - root - INFO - step: 34990 loss: 2.4147 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.6182 global_avg_top_loss: 1.7965 +[titan] 2025-09-10 15:16:52,917 - root - INFO - lr: 2.7015e-06 gnorm: 0.58 [2 days, 15:40:24< 9:07:01] +[titan] 2025-09-10 15:17:24,778 - root - INFO - step: 34995 loss: 2.3757 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.5960 global_avg_top_loss: 1.7798 +[titan] 2025-09-10 15:17:24,779 - root - INFO - lr: 2.7002e-06 gnorm: 0.77 [2 days, 15:40:56< 9:06:28] +[titan] 2025-09-10 15:17:50,330 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-10 15:17:56,775 - root - INFO - step: 35000 loss: 2.5142 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.6653 global_avg_top_loss: 1.8489 +[titan] 2025-09-10 15:17:56,775 - root - INFO - lr: 2.6988e-06 gnorm: 0.54 [2 days, 15:41:28< 9:05:55] +[titan] 2025-09-10 15:17:56,775 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-10 15:18:29,493 - root - INFO - [GC] GC collection invoked by checkpointer. 0.02 seconds. +[titan] 2025-09-10 15:18:29,493 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 32.72 seconds. 
+[titan] 2025-09-10 15:37:08,227 - root - INFO - step: 35005 loss: 2.4816 memory: 122.03GiB(87.57%) tps: 285 tflops: 13.56 mfu: 1.37% global_avg_ntp_loss: 0.6451 global_avg_top_loss: 1.8365 +[titan] 2025-09-10 15:37:08,228 - root - INFO - lr: 2.6974e-06 gnorm: 0.55 [2 days, 16:00:39< 9:08:02] +[titan] 2025-09-10 15:37:37,966 - root - INFO - step: 35010 loss: 2.5626 memory: 122.03GiB(87.57%) tps: 11,019 tflops: 525.15 mfu: 53.10% global_avg_ntp_loss: 0.6885 global_avg_top_loss: 1.8741 +[titan] 2025-09-10 15:37:37,966 - root - INFO - lr: 2.6960e-06 gnorm: 0.58 [2 days, 16:01:09< 9:07:28] +[titan] 2025-09-10 15:38:07,994 - root - INFO - step: 35015 loss: 2.9326 memory: 122.03GiB(87.57%) tps: 10,913 tflops: 520.09 mfu: 52.59% global_avg_ntp_loss: 0.9101 global_avg_top_loss: 2.0225 +[titan] 2025-09-10 15:38:07,994 - root - INFO - lr: 2.6946e-06 gnorm: 0.51 [2 days, 16:01:39< 9:06:55] +[titan] 2025-09-10 15:38:38,150 - root - INFO - step: 35020 loss: 2.5410 memory: 122.03GiB(87.57%) tps: 10,866 tflops: 517.87 mfu: 52.36% global_avg_ntp_loss: 0.6772 global_avg_top_loss: 1.8638 +[titan] 2025-09-10 15:38:38,150 - root - INFO - lr: 2.6933e-06 gnorm: 0.52 [2 days, 16:02:09< 9:06:22] +[titan] 2025-09-10 15:39:08,502 - root - INFO - step: 35025 loss: 2.6117 memory: 122.03GiB(87.57%) tps: 10,796 tflops: 514.54 mfu: 52.03% global_avg_ntp_loss: 0.7119 global_avg_top_loss: 1.8998 +[titan] 2025-09-10 15:39:08,502 - root - INFO - lr: 2.6919e-06 gnorm: 0.60 [2 days, 16:02:39< 9:05:49] +[titan] 2025-09-10 15:39:38,902 - root - INFO - step: 35030 loss: 2.7043 memory: 122.03GiB(87.57%) tps: 10,779 tflops: 513.73 mfu: 51.94% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9377 +[titan] 2025-09-10 15:39:38,902 - root - INFO - lr: 2.6905e-06 gnorm: 0.63 [2 days, 16:03:10< 9:05:15] +[titan] 2025-09-10 15:40:09,586 - root - INFO - step: 35035 loss: 2.4994 memory: 122.03GiB(87.57%) tps: 10,679 tflops: 508.98 mfu: 51.46% global_avg_ntp_loss: 0.6563 global_avg_top_loss: 1.8431 +[titan] 
2025-09-10 15:40:09,586 - root - INFO - lr: 2.6892e-06 gnorm: 0.59 [2 days, 16:03:40< 9:04:42] +[titan] 2025-09-10 15:40:40,521 - root - INFO - step: 35040 loss: 2.4705 memory: 122.03GiB(87.57%) tps: 10,592 tflops: 504.83 mfu: 51.04% global_avg_ntp_loss: 0.6404 global_avg_top_loss: 1.8301 +[titan] 2025-09-10 15:40:40,522 - root - INFO - lr: 2.6878e-06 gnorm: 0.53 [2 days, 16:04:11< 9:04:09] +[titan] 2025-09-10 15:41:11,603 - root - INFO - step: 35045 loss: 2.5614 memory: 122.03GiB(87.57%) tps: 10,543 tflops: 502.46 mfu: 50.80% global_avg_ntp_loss: 0.6853 global_avg_top_loss: 1.8761 +[titan] 2025-09-10 15:41:11,603 - root - INFO - lr: 2.6864e-06 gnorm: 0.58 [2 days, 16:04:42< 9:03:36] +[titan] 2025-09-10 15:41:36,765 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-10 15:41:43,127 - root - INFO - step: 35050 loss: 2.5355 memory: 122.03GiB(87.57%) tps: 10,395 tflops: 495.40 mfu: 50.09% global_avg_ntp_loss: 0.6747 global_avg_top_loss: 1.8609 +[titan] 2025-09-10 15:41:43,128 - root - INFO - lr: 2.6851e-06 gnorm: 0.53 [2 days, 16:05:14< 9:03:03] +[titan] 2025-09-10 15:42:14,470 - root - INFO - step: 35055 loss: 2.5819 memory: 122.03GiB(87.57%) tps: 10,455 tflops: 498.28 mfu: 50.38% global_avg_ntp_loss: 0.6943 global_avg_top_loss: 1.8876 +[titan] 2025-09-10 15:42:14,470 - root - INFO - lr: 2.6837e-06 gnorm: 0.64 [2 days, 16:05:45< 9:02:29] +[titan] 2025-09-10 15:42:46,113 - root - INFO - step: 35060 loss: 2.5758 memory: 122.03GiB(87.57%) tps: 10,356 tflops: 493.54 mfu: 49.90% global_avg_ntp_loss: 0.6972 global_avg_top_loss: 1.8786 +[titan] 2025-09-10 15:42:46,113 - root - INFO - lr: 2.6823e-06 gnorm: 0.73 [2 days, 16:06:17< 9:01:56] +[titan] 2025-09-10 15:43:17,829 - root - INFO - step: 35065 loss: 2.4279 memory: 122.03GiB(87.57%) tps: 10,332 tflops: 492.41 mfu: 49.79% global_avg_ntp_loss: 0.6315 global_avg_top_loss: 1.7964 +[titan] 2025-09-10 15:43:17,829 - root - INFO - lr: 2.6810e-06 gnorm: 0.51 [2 days, 16:06:49< 9:01:23] 
+[titan] 2025-09-10 15:43:49,479 - root - INFO - step: 35070 loss: 2.4362 memory: 122.03GiB(87.57%) tps: 10,353 tflops: 493.43 mfu: 49.89% global_avg_ntp_loss: 0.6272 global_avg_top_loss: 1.8090 +[titan] 2025-09-10 15:43:49,479 - root - INFO - lr: 2.6796e-06 gnorm: 0.58 [2 days, 16:07:20< 9:00:50] +[titan] 2025-09-10 15:44:21,213 - root - INFO - step: 35075 loss: 2.4237 memory: 122.03GiB(87.57%) tps: 10,326 tflops: 492.12 mfu: 49.76% global_avg_ntp_loss: 0.6179 global_avg_top_loss: 1.8058 +[titan] 2025-09-10 15:44:21,214 - root - INFO - lr: 2.6782e-06 gnorm: 0.73 [2 days, 16:07:52< 9:00:17] +[titan] 2025-09-10 15:44:53,080 - root - INFO - step: 35080 loss: 2.5232 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.6694 global_avg_top_loss: 1.8538 +[titan] 2025-09-10 15:44:53,080 - root - INFO - lr: 2.6769e-06 gnorm: 0.55 [2 days, 16:08:24< 8:59:44] +[titan] 2025-09-10 15:45:24,991 - root - INFO - step: 35085 loss: 2.4961 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.6507 global_avg_top_loss: 1.8454 +[titan] 2025-09-10 15:45:24,991 - root - INFO - lr: 2.6755e-06 gnorm: 0.57 [2 days, 16:08:56< 8:59:11] +[titan] 2025-09-10 15:45:57,023 - root - INFO - step: 35090 loss: 2.5564 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.6880 global_avg_top_loss: 1.8684 +[titan] 2025-09-10 15:45:57,024 - root - INFO - lr: 2.6742e-06 gnorm: 0.57 [2 days, 16:09:28< 8:58:38] +[titan] 2025-09-10 15:46:29,036 - root - INFO - step: 35095 loss: 2.8937 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.8886 global_avg_top_loss: 2.0052 +[titan] 2025-09-10 15:46:29,036 - root - INFO - lr: 2.6728e-06 gnorm: 0.54 [2 days, 16:10:00< 8:58:05] +[titan] 2025-09-10 15:46:54,670 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:47:01,096 - root - INFO - step: 35100 loss: 2.5151 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.6652 global_avg_top_loss: 1.8499 +[titan] 2025-09-10 15:47:01,096 - root - INFO - lr: 2.6715e-06 gnorm: 0.58 [2 days, 16:10:32< 8:57:32] +[titan] 2025-09-10 15:47:33,174 - root - INFO - step: 35105 loss: 2.5863 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.6991 global_avg_top_loss: 1.8872 +[titan] 2025-09-10 15:47:33,175 - root - INFO - lr: 2.6701e-06 gnorm: 0.59 [2 days, 16:11:04< 8:56:59] +[titan] 2025-09-10 15:48:05,069 - root - INFO - step: 35110 loss: 3.0308 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.9496 global_avg_top_loss: 2.0812 +[titan] 2025-09-10 15:48:05,069 - root - INFO - lr: 2.6688e-06 gnorm: 0.65 [2 days, 16:11:36< 8:56:26] +[titan] 2025-09-10 15:48:37,090 - root - INFO - step: 35115 loss: 2.5902 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.6940 global_avg_top_loss: 1.8962 +[titan] 2025-09-10 15:48:37,090 - root - INFO - lr: 2.6674e-06 gnorm: 0.62 [2 days, 16:12:08< 8:55:53] +[titan] 2025-09-10 15:49:09,059 - root - INFO - step: 35120 loss: 2.6340 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7334 global_avg_top_loss: 1.9006 +[titan] 2025-09-10 15:49:09,059 - root - INFO - lr: 2.6661e-06 gnorm: 0.56 [2 days, 16:12:40< 8:55:20] +[titan] 2025-09-10 15:49:41,144 - root - INFO - step: 35125 loss: 2.5762 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.6914 global_avg_top_loss: 1.8847 +[titan] 2025-09-10 15:49:41,144 - root - INFO - lr: 2.6647e-06 gnorm: 0.55 [2 days, 16:13:12< 8:54:47] +[titan] 2025-09-10 15:50:13,299 - root - INFO - step: 35130 loss: 2.5745 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.67 mfu: 49.11% global_avg_ntp_loss: 0.6941 global_avg_top_loss: 1.8804 +[titan] 
2025-09-10 15:50:13,300 - root - INFO - lr: 2.6634e-06 gnorm: 0.50 [2 days, 16:13:44< 8:54:14] +[titan] 2025-09-10 15:50:45,476 - root - INFO - step: 35135 loss: 2.5737 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.36 mfu: 49.08% global_avg_ntp_loss: 0.6897 global_avg_top_loss: 1.8840 +[titan] 2025-09-10 15:50:45,476 - root - INFO - lr: 2.6620e-06 gnorm: 0.71 [2 days, 16:14:16< 8:53:41] +[titan] 2025-09-10 15:51:17,458 - root - INFO - step: 35140 loss: 2.5872 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.37% global_avg_ntp_loss: 0.7030 global_avg_top_loss: 1.8842 +[titan] 2025-09-10 15:51:17,458 - root - INFO - lr: 2.6607e-06 gnorm: 0.63 [2 days, 16:14:48< 8:53:08] +[titan] 2025-09-10 15:51:49,498 - root - INFO - step: 35145 loss: 2.5211 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.6737 global_avg_top_loss: 1.8474 +[titan] 2025-09-10 15:51:49,499 - root - INFO - lr: 2.6593e-06 gnorm: 0.50 [2 days, 16:15:20< 8:52:35] +[titan] 2025-09-10 15:52:15,314 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:52:21,664 - root - INFO - step: 35150 loss: 2.3933 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.6092 global_avg_top_loss: 1.7841 +[titan] 2025-09-10 15:52:21,664 - root - INFO - lr: 2.6580e-06 gnorm: 0.62 [2 days, 16:15:52< 8:52:02] +[titan] 2025-09-10 15:52:53,660 - root - INFO - step: 35155 loss: 2.4196 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.6142 global_avg_top_loss: 1.8054 +[titan] 2025-09-10 15:52:53,660 - root - INFO - lr: 2.6567e-06 gnorm: 0.68 [2 days, 16:16:24< 8:51:29] +[titan] 2025-09-10 15:53:25,869 - root - INFO - step: 35160 loss: 2.9686 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.86 mfu: 49.03% global_avg_ntp_loss: 0.9227 global_avg_top_loss: 2.0459 +[titan] 2025-09-10 15:53:25,869 - root - INFO - lr: 2.6553e-06 gnorm: 0.53 [2 days, 16:16:57< 8:50:56] +[titan] 2025-09-10 15:53:57,944 - root - INFO - step: 35165 loss: 2.4412 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.6295 global_avg_top_loss: 1.8118 +[titan] 2025-09-10 15:53:57,945 - root - INFO - lr: 2.6540e-06 gnorm: 0.56 [2 days, 16:17:29< 8:50:23] +[titan] 2025-09-10 15:54:30,262 - root - INFO - step: 35170 loss: 2.4845 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.24 mfu: 48.86% global_avg_ntp_loss: 0.6519 global_avg_top_loss: 1.8327 +[titan] 2025-09-10 15:54:30,262 - root - INFO - lr: 2.6526e-06 gnorm: 0.56 [2 days, 16:18:01< 8:49:50] +[titan] 2025-09-10 15:55:02,523 - root - INFO - step: 35175 loss: 2.4095 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.09 mfu: 48.95% global_avg_ntp_loss: 0.6159 global_avg_top_loss: 1.7935 +[titan] 2025-09-10 15:55:02,523 - root - INFO - lr: 2.6513e-06 gnorm: 0.56 [2 days, 16:18:33< 8:49:17] +[titan] 2025-09-10 15:55:34,652 - root - INFO - step: 35180 loss: 2.5832 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.09 mfu: 49.15% global_avg_ntp_loss: 0.6992 global_avg_top_loss: 1.8841 +[titan] 
2025-09-10 15:55:34,652 - root - INFO - lr: 2.6500e-06 gnorm: 0.58 [2 days, 16:19:05< 8:48:44] +[titan] 2025-09-10 15:56:06,889 - root - INFO - step: 35185 loss: 2.6244 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.44 mfu: 48.98% global_avg_ntp_loss: 0.7155 global_avg_top_loss: 1.9089 +[titan] 2025-09-10 15:56:06,889 - root - INFO - lr: 2.6486e-06 gnorm: 0.57 [2 days, 16:19:38< 8:48:11] +[titan] 2025-09-10 15:56:39,125 - root - INFO - step: 35190 loss: 2.5960 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.47 mfu: 48.99% global_avg_ntp_loss: 0.7087 global_avg_top_loss: 1.8873 +[titan] 2025-09-10 15:56:39,125 - root - INFO - lr: 2.6473e-06 gnorm: 0.65 [2 days, 16:20:10< 8:47:38] +[titan] 2025-09-10 15:57:11,207 - root - INFO - step: 35195 loss: 2.5294 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.6710 global_avg_top_loss: 1.8584 +[titan] 2025-09-10 15:57:11,207 - root - INFO - lr: 2.6460e-06 gnorm: 0.61 [2 days, 16:20:42< 8:47:05] +[titan] 2025-09-10 15:57:37,184 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 15:57:43,643 - root - INFO - step: 35200 loss: 2.4879 memory: 122.03GiB(87.57%) tps: 10,102 tflops: 481.48 mfu: 48.68% global_avg_ntp_loss: 0.6508 global_avg_top_loss: 1.8370 +[titan] 2025-09-10 15:57:43,643 - root - INFO - lr: 2.6447e-06 gnorm: 0.57 [2 days, 16:21:14< 8:46:32] +[titan] 2025-09-10 15:58:15,574 - root - INFO - step: 35205 loss: 2.5659 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.6882 global_avg_top_loss: 1.8778 +[titan] 2025-09-10 15:58:15,574 - root - INFO - lr: 2.6433e-06 gnorm: 0.59 [2 days, 16:21:46< 8:45:58] +[titan] 2025-09-10 15:58:47,738 - root - INFO - step: 35210 loss: 2.6955 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.54 mfu: 49.09% global_avg_ntp_loss: 0.7576 global_avg_top_loss: 1.9379 +[titan] 2025-09-10 15:58:47,739 - root - INFO - lr: 2.6420e-06 gnorm: 0.54 [2 days, 16:22:19< 8:45:25] +[titan] 2025-09-10 15:59:19,755 - root - INFO - step: 35215 loss: 2.6016 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.79 mfu: 49.32% global_avg_ntp_loss: 0.7056 global_avg_top_loss: 1.8960 +[titan] 2025-09-10 15:59:19,755 - root - INFO - lr: 2.6407e-06 gnorm: 0.66 [2 days, 16:22:51< 8:44:52] +[titan] 2025-09-10 15:59:51,725 - root - INFO - step: 35220 loss: 2.6237 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7221 global_avg_top_loss: 1.9016 +[titan] 2025-09-10 15:59:51,726 - root - INFO - lr: 2.6394e-06 gnorm: 0.69 [2 days, 16:23:23< 8:44:19] +[titan] 2025-09-10 16:00:23,975 - root - INFO - step: 35225 loss: 2.3291 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.26 mfu: 48.97% global_avg_ntp_loss: 0.5804 global_avg_top_loss: 1.7487 +[titan] 2025-09-10 16:00:23,975 - root - INFO - lr: 2.6380e-06 gnorm: 0.51 [2 days, 16:23:55< 8:43:46] +[titan] 2025-09-10 16:00:56,062 - root - INFO - step: 35230 loss: 2.4725 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.6432 global_avg_top_loss: 1.8293 +[titan] 
2025-09-10 16:00:56,062 - root - INFO - lr: 2.6367e-06 gnorm: 0.62 [2 days, 16:24:27< 8:43:13] +[titan] 2025-09-10 16:01:28,367 - root - INFO - step: 35235 loss: 2.4449 memory: 122.03GiB(87.57%) tps: 10,143 tflops: 483.43 mfu: 48.88% global_avg_ntp_loss: 0.6259 global_avg_top_loss: 1.8191 +[titan] 2025-09-10 16:01:28,367 - root - INFO - lr: 2.6354e-06 gnorm: 0.65 [2 days, 16:24:59< 8:42:40] +[titan] 2025-09-10 16:02:00,303 - root - INFO - step: 35240 loss: 2.4991 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.6556 global_avg_top_loss: 1.8435 +[titan] 2025-09-10 16:02:00,304 - root - INFO - lr: 2.6341e-06 gnorm: 0.56 [2 days, 16:25:31< 8:42:07] +[titan] 2025-09-10 16:02:32,535 - root - INFO - step: 35245 loss: 2.4363 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.53 mfu: 48.99% global_avg_ntp_loss: 0.6244 global_avg_top_loss: 1.8119 +[titan] 2025-09-10 16:02:32,535 - root - INFO - lr: 2.6328e-06 gnorm: 0.58 [2 days, 16:26:03< 8:41:34] +[titan] 2025-09-10 16:02:58,100 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:03:04,601 - root - INFO - step: 35250 loss: 2.5068 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.6605 global_avg_top_loss: 1.8462 +[titan] 2025-09-10 16:03:04,602 - root - INFO - lr: 2.6315e-06 gnorm: 0.57 [2 days, 16:26:35< 8:41:01] +[titan] 2025-09-10 16:03:36,687 - root - INFO - step: 35255 loss: 2.4105 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.6170 global_avg_top_loss: 1.7934 +[titan] 2025-09-10 16:03:36,688 - root - INFO - lr: 2.6301e-06 gnorm: 0.55 [2 days, 16:27:07< 8:40:28] +[titan] 2025-09-10 16:04:08,886 - root - INFO - step: 35260 loss: 2.6022 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.7015 global_avg_top_loss: 1.9008 +[titan] 2025-09-10 16:04:08,887 - root - INFO - lr: 2.6288e-06 gnorm: 0.57 [2 days, 16:27:40< 8:39:55] +[titan] 2025-09-10 16:04:40,945 - root - INFO - step: 35265 loss: 2.5833 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.6984 global_avg_top_loss: 1.8848 +[titan] 2025-09-10 16:04:40,946 - root - INFO - lr: 2.6275e-06 gnorm: 0.56 [2 days, 16:28:12< 8:39:22] +[titan] 2025-09-10 16:05:13,181 - root - INFO - step: 35270 loss: 2.9912 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.47 mfu: 48.99% global_avg_ntp_loss: 0.9371 global_avg_top_loss: 2.0541 +[titan] 2025-09-10 16:05:13,181 - root - INFO - lr: 2.6262e-06 gnorm: 0.61 [2 days, 16:28:44< 8:38:49] +[titan] 2025-09-10 16:05:45,312 - root - INFO - step: 35275 loss: 2.5175 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.6677 global_avg_top_loss: 1.8498 +[titan] 2025-09-10 16:05:45,312 - root - INFO - lr: 2.6249e-06 gnorm: 0.62 [2 days, 16:29:16< 8:38:16] +[titan] 2025-09-10 16:06:17,484 - root - INFO - step: 35280 loss: 2.5026 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.42 mfu: 49.08% global_avg_ntp_loss: 0.6559 global_avg_top_loss: 1.8467 +[titan] 
2025-09-10 16:06:17,484 - root - INFO - lr: 2.6236e-06 gnorm: 0.56 [2 days, 16:29:48< 8:37:43] +[titan] 2025-09-10 16:06:49,481 - root - INFO - step: 35285 loss: 2.6090 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7085 global_avg_top_loss: 1.9005 +[titan] 2025-09-10 16:06:49,482 - root - INFO - lr: 2.6223e-06 gnorm: 0.58 [2 days, 16:30:20< 8:37:10] +[titan] 2025-09-10 16:07:21,638 - root - INFO - step: 35290 loss: 2.5879 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.7010 global_avg_top_loss: 1.8868 +[titan] 2025-09-10 16:07:21,638 - root - INFO - lr: 2.6210e-06 gnorm: 0.58 [2 days, 16:30:52< 8:36:37] +[titan] 2025-09-10 16:07:53,662 - root - INFO - step: 35295 loss: 2.5065 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.6572 global_avg_top_loss: 1.8493 +[titan] 2025-09-10 16:07:53,662 - root - INFO - lr: 2.6197e-06 gnorm: 0.64 [2 days, 16:31:24< 8:36:04] +[titan] 2025-09-10 16:08:19,336 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:08:25,742 - root - INFO - step: 35300 loss: 2.5408 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.6793 global_avg_top_loss: 1.8616 +[titan] 2025-09-10 16:08:25,742 - root - INFO - lr: 2.6184e-06 gnorm: 0.68 [2 days, 16:31:57< 8:35:31] +[titan] 2025-09-10 16:08:57,885 - root - INFO - step: 35305 loss: 2.3985 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.87 mfu: 49.13% global_avg_ntp_loss: 0.6114 global_avg_top_loss: 1.7871 +[titan] 2025-09-10 16:08:57,885 - root - INFO - lr: 2.6171e-06 gnorm: 0.52 [2 days, 16:32:29< 8:34:58] +[titan] 2025-09-10 16:09:30,166 - root - INFO - step: 35310 loss: 2.4453 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.79 mfu: 48.92% global_avg_ntp_loss: 0.6284 global_avg_top_loss: 1.8169 +[titan] 2025-09-10 16:09:30,166 - root - INFO - lr: 2.6158e-06 gnorm: 0.61 [2 days, 16:33:01< 8:34:25] +[titan] 2025-09-10 16:10:02,146 - root - INFO - step: 35315 loss: 2.3350 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.5771 global_avg_top_loss: 1.7579 +[titan] 2025-09-10 16:10:02,146 - root - INFO - lr: 2.6145e-06 gnorm: 0.67 [2 days, 16:33:33< 8:33:52] +[titan] 2025-09-10 16:10:34,471 - root - INFO - step: 35320 loss: 2.4427 memory: 122.03GiB(87.57%) tps: 10,137 tflops: 483.13 mfu: 48.85% global_avg_ntp_loss: 0.6316 global_avg_top_loss: 1.8110 +[titan] 2025-09-10 16:10:34,471 - root - INFO - lr: 2.6132e-06 gnorm: 0.58 [2 days, 16:34:05< 8:33:19] +[titan] 2025-09-10 16:11:06,808 - root - INFO - step: 35325 loss: 2.4538 memory: 122.03GiB(87.57%) tps: 10,133 tflops: 482.95 mfu: 48.83% global_avg_ntp_loss: 0.6336 global_avg_top_loss: 1.8203 +[titan] 2025-09-10 16:11:06,808 - root - INFO - lr: 2.6119e-06 gnorm: 0.61 [2 days, 16:34:38< 8:32:46] +[titan] 2025-09-10 16:11:26,288 - root - INFO - Dumping profiler traces at step 35328 +[titan] 2025-09-10 16:11:26,347 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-10 
16:11:39,332 - root - INFO - step: 35330 loss: 2.5044 memory: 122.03GiB(87.57%) tps: 10,075 tflops: 480.17 mfu: 48.55% global_avg_ntp_loss: 0.6603 global_avg_top_loss: 1.8441 +[titan] 2025-09-10 16:11:39,332 - root - INFO - lr: 2.6106e-06 gnorm: 0.61 [2 days, 16:35:10< 8:32:13] +[titan] 2025-09-10 16:12:11,375 - root - INFO - step: 35335 loss: 2.3671 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.5995 global_avg_top_loss: 1.7676 +[titan] 2025-09-10 16:12:11,375 - root - INFO - lr: 2.6093e-06 gnorm: 0.56 [2 days, 16:35:42< 8:31:40] +[titan] 2025-09-10 16:12:43,385 - root - INFO - step: 35340 loss: 2.5700 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.6949 global_avg_top_loss: 1.8751 +[titan] 2025-09-10 16:12:43,385 - root - INFO - lr: 2.6080e-06 gnorm: 0.57 [2 days, 16:36:14< 8:31:07] +[titan] 2025-09-10 16:13:15,484 - root - INFO - step: 35345 loss: 2.5833 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.6992 global_avg_top_loss: 1.8842 +[titan] 2025-09-10 16:13:15,484 - root - INFO - lr: 2.6067e-06 gnorm: 0.56 [2 days, 16:36:46< 8:30:34] +[titan] 2025-09-10 16:13:40,992 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:13:47,382 - root - INFO - step: 35350 loss: 3.0764 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.51% global_avg_ntp_loss: 0.9797 global_avg_top_loss: 2.0967 +[titan] 2025-09-10 16:13:47,382 - root - INFO - lr: 2.6054e-06 gnorm: 0.65 [2 days, 16:37:18< 8:30:01] +[titan] 2025-09-10 16:14:19,488 - root - INFO - step: 35355 loss: 2.5101 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.6654 global_avg_top_loss: 1.8447 +[titan] 2025-09-10 16:14:19,488 - root - INFO - lr: 2.6042e-06 gnorm: 0.58 [2 days, 16:37:50< 8:29:28] +[titan] 2025-09-10 16:14:51,675 - root - INFO - step: 35360 loss: 2.5247 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.21 mfu: 49.06% global_avg_ntp_loss: 0.6681 global_avg_top_loss: 1.8566 +[titan] 2025-09-10 16:14:51,675 - root - INFO - lr: 2.6029e-06 gnorm: 0.57 [2 days, 16:38:22< 8:28:55] +[titan] 2025-09-10 16:15:23,956 - root - INFO - step: 35365 loss: 2.5102 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.79 mfu: 48.92% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8486 +[titan] 2025-09-10 16:15:23,956 - root - INFO - lr: 2.6016e-06 gnorm: 0.58 [2 days, 16:38:55< 8:28:22] +[titan] 2025-09-10 16:15:56,097 - root - INFO - step: 35370 loss: 2.6159 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.89 mfu: 49.13% global_avg_ntp_loss: 0.7156 global_avg_top_loss: 1.9003 +[titan] 2025-09-10 16:15:56,097 - root - INFO - lr: 2.6003e-06 gnorm: 0.54 [2 days, 16:39:27< 8:27:49] +[titan] 2025-09-10 16:16:28,200 - root - INFO - step: 35375 loss: 2.5566 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.6851 global_avg_top_loss: 1.8715 +[titan] 2025-09-10 16:16:28,200 - root - INFO - lr: 2.5990e-06 gnorm: 0.62 [2 days, 16:39:59< 8:27:16] +[titan] 2025-09-10 16:17:00,674 - root - INFO - step: 35380 loss: 2.5218 memory: 122.03GiB(87.57%) tps: 10,091 tflops: 480.92 mfu: 48.63% global_avg_ntp_loss: 0.6717 global_avg_top_loss: 1.8501 +[titan] 
2025-09-10 16:17:00,674 - root - INFO - lr: 2.5977e-06 gnorm: 0.67 [2 days, 16:40:31< 8:26:43] +[titan] 2025-09-10 16:17:32,903 - root - INFO - step: 35385 loss: 2.4275 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.56 mfu: 49.00% global_avg_ntp_loss: 0.6277 global_avg_top_loss: 1.7999 +[titan] 2025-09-10 16:17:32,903 - root - INFO - lr: 2.5965e-06 gnorm: 0.53 [2 days, 16:41:04< 8:26:10] +[titan] 2025-09-10 16:18:05,050 - root - INFO - step: 35390 loss: 2.3729 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.81 mfu: 49.12% global_avg_ntp_loss: 0.5938 global_avg_top_loss: 1.7791 +[titan] 2025-09-10 16:18:05,050 - root - INFO - lr: 2.5952e-06 gnorm: 0.67 [2 days, 16:41:36< 8:25:37] +[titan] 2025-09-10 16:18:37,039 - root - INFO - step: 35395 loss: 2.3608 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.5917 global_avg_top_loss: 1.7691 +[titan] 2025-09-10 16:18:37,039 - root - INFO - lr: 2.5939e-06 gnorm: 0.78 [2 days, 16:42:08< 8:25:04] +[titan] 2025-09-10 16:19:02,767 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:19:09,147 - root - INFO - step: 35400 loss: 2.4482 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.6324 global_avg_top_loss: 1.8158 +[titan] 2025-09-10 16:19:09,147 - root - INFO - lr: 2.5926e-06 gnorm: 0.54 [2 days, 16:42:40< 8:24:31] +[titan] 2025-09-10 16:19:41,370 - root - INFO - step: 35405 loss: 2.4825 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.66 mfu: 49.01% global_avg_ntp_loss: 0.6476 global_avg_top_loss: 1.8349 +[titan] 2025-09-10 16:19:41,370 - root - INFO - lr: 2.5914e-06 gnorm: 0.60 [2 days, 16:43:12< 8:23:58] +[titan] 2025-09-10 16:20:13,619 - root - INFO - step: 35410 loss: 2.5115 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.27 mfu: 48.97% global_avg_ntp_loss: 0.6632 global_avg_top_loss: 1.8482 +[titan] 2025-09-10 16:20:13,619 - root - INFO - lr: 2.5901e-06 gnorm: 0.58 [2 days, 16:43:44< 8:23:25] +[titan] 2025-09-10 16:20:45,614 - root - INFO - step: 35415 loss: 2.4100 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.6140 global_avg_top_loss: 1.7960 +[titan] 2025-09-10 16:20:45,614 - root - INFO - lr: 2.5888e-06 gnorm: 0.56 [2 days, 16:44:16< 8:22:52] +[titan] 2025-09-10 16:21:17,694 - root - INFO - step: 35420 loss: 2.6038 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.7032 global_avg_top_loss: 1.9006 +[titan] 2025-09-10 16:21:17,694 - root - INFO - lr: 2.5876e-06 gnorm: 0.64 [2 days, 16:44:48< 8:22:19] +[titan] 2025-09-10 16:21:49,813 - root - INFO - step: 35425 loss: 2.6262 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.7168 global_avg_top_loss: 1.9094 +[titan] 2025-09-10 16:21:49,814 - root - INFO - lr: 2.5863e-06 gnorm: 0.61 [2 days, 16:45:21< 8:21:46] +[titan] 2025-09-10 16:22:21,871 - root - INFO - step: 35430 loss: 3.0951 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.17 mfu: 49.26% global_avg_ntp_loss: 0.9853 global_avg_top_loss: 2.1098 +[titan] 
2025-09-10 16:22:21,871 - root - INFO - lr: 2.5850e-06 gnorm: 0.62 [2 days, 16:45:53< 8:21:13] +[titan] 2025-09-10 16:22:53,990 - root - INFO - step: 35435 loss: 2.5669 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 0.6841 global_avg_top_loss: 1.8827 +[titan] 2025-09-10 16:22:53,990 - root - INFO - lr: 2.5838e-06 gnorm: 0.60 [2 days, 16:46:25< 8:20:40] +[titan] 2025-09-10 16:23:26,387 - root - INFO - step: 35440 loss: 2.4313 memory: 122.03GiB(87.57%) tps: 10,115 tflops: 482.05 mfu: 48.74% global_avg_ntp_loss: 0.6265 global_avg_top_loss: 1.8048 +[titan] 2025-09-10 16:23:26,387 - root - INFO - lr: 2.5825e-06 gnorm: 0.55 [2 days, 16:46:57< 8:20:07] +[titan] 2025-09-10 16:23:58,747 - root - INFO - step: 35445 loss: 2.5720 memory: 122.03GiB(87.57%) tps: 10,126 tflops: 482.61 mfu: 48.80% global_avg_ntp_loss: 0.6890 global_avg_top_loss: 1.8831 +[titan] 2025-09-10 16:23:58,747 - root - INFO - lr: 2.5812e-06 gnorm: 0.58 [2 days, 16:47:30< 8:19:34] +[titan] 2025-09-10 16:24:24,441 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:24:30,826 - root - INFO - step: 35450 loss: 2.6308 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7185 global_avg_top_loss: 1.9123 +[titan] 2025-09-10 16:24:30,826 - root - INFO - lr: 2.5800e-06 gnorm: 0.54 [2 days, 16:48:02< 8:19:01] +[titan] 2025-09-10 16:25:03,051 - root - INFO - step: 35455 loss: 2.5309 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.63 mfu: 49.00% global_avg_ntp_loss: 0.6744 global_avg_top_loss: 1.8565 +[titan] 2025-09-10 16:25:03,051 - root - INFO - lr: 2.5787e-06 gnorm: 0.64 [2 days, 16:48:34< 8:18:28] +[titan] 2025-09-10 16:25:35,217 - root - INFO - step: 35460 loss: 2.5574 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.6867 global_avg_top_loss: 1.8706 +[titan] 2025-09-10 16:25:35,217 - root - INFO - lr: 2.5774e-06 gnorm: 0.77 [2 days, 16:49:06< 8:17:55] +[titan] 2025-09-10 16:26:07,340 - root - INFO - step: 35465 loss: 2.3993 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.6145 global_avg_top_loss: 1.7848 +[titan] 2025-09-10 16:26:07,340 - root - INFO - lr: 2.5762e-06 gnorm: 0.52 [2 days, 16:49:38< 8:17:22] +[titan] 2025-09-10 16:26:39,521 - root - INFO - step: 35470 loss: 2.3748 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.30 mfu: 49.07% global_avg_ntp_loss: 0.5981 global_avg_top_loss: 1.7767 +[titan] 2025-09-10 16:26:39,521 - root - INFO - lr: 2.5749e-06 gnorm: 0.67 [2 days, 16:50:10< 8:16:49] +[titan] 2025-09-10 16:27:11,847 - root - INFO - step: 35475 loss: 2.3701 memory: 122.03GiB(87.57%) tps: 10,137 tflops: 483.10 mfu: 48.85% global_avg_ntp_loss: 0.5902 global_avg_top_loss: 1.7799 +[titan] 2025-09-10 16:27:11,848 - root - INFO - lr: 2.5737e-06 gnorm: 0.66 [2 days, 16:50:43< 8:16:16] +[titan] 2025-09-10 16:27:44,109 - root - INFO - step: 35480 loss: 2.4596 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.08 mfu: 48.95% global_avg_ntp_loss: 0.6398 global_avg_top_loss: 1.8199 +[titan] 
2025-09-10 16:27:44,109 - root - INFO - lr: 2.5724e-06 gnorm: 0.54 [2 days, 16:51:15< 8:15:43] +[titan] 2025-09-10 16:28:16,471 - root - INFO - step: 35485 loss: 2.7948 memory: 122.03GiB(87.57%) tps: 10,126 tflops: 482.58 mfu: 48.80% global_avg_ntp_loss: 0.8278 global_avg_top_loss: 1.9671 +[titan] 2025-09-10 16:28:16,471 - root - INFO - lr: 2.5712e-06 gnorm: 0.61 [2 days, 16:51:47< 8:15:10] +[titan] 2025-09-10 16:28:48,906 - root - INFO - step: 35490 loss: 2.5204 memory: 122.03GiB(87.57%) tps: 10,103 tflops: 481.49 mfu: 48.68% global_avg_ntp_loss: 0.6663 global_avg_top_loss: 1.8541 +[titan] 2025-09-10 16:28:48,906 - root - INFO - lr: 2.5699e-06 gnorm: 0.60 [2 days, 16:52:20< 8:14:37] +[titan] 2025-09-10 16:29:21,083 - root - INFO - step: 35495 loss: 2.4322 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.35 mfu: 49.07% global_avg_ntp_loss: 0.6255 global_avg_top_loss: 1.8068 +[titan] 2025-09-10 16:29:21,083 - root - INFO - lr: 2.5687e-06 gnorm: 0.58 [2 days, 16:52:52< 8:14:04] +[titan] 2025-09-10 16:29:46,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:29:53,157 - root - INFO - step: 35500 loss: 2.5046 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.6593 global_avg_top_loss: 1.8453 +[titan] 2025-09-10 16:29:53,157 - root - INFO - lr: 2.5674e-06 gnorm: 0.58 [2 days, 16:53:24< 8:13:31] +[titan] 2025-09-10 16:30:25,460 - root - INFO - step: 35505 loss: 2.6794 memory: 122.03GiB(87.57%) tps: 10,144 tflops: 483.46 mfu: 48.88% global_avg_ntp_loss: 0.7433 global_avg_top_loss: 1.9362 +[titan] 2025-09-10 16:30:25,460 - root - INFO - lr: 2.5662e-06 gnorm: 0.61 [2 days, 16:53:56< 8:12:58] +[titan] 2025-09-10 16:30:57,617 - root - INFO - step: 35510 loss: 2.7028 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.7608 global_avg_top_loss: 1.9420 +[titan] 2025-09-10 16:30:57,617 - root - INFO - lr: 2.5649e-06 gnorm: 0.60 [2 days, 16:54:28< 8:12:25] +[titan] 2025-09-10 16:31:29,730 - root - INFO - step: 35515 loss: 2.4804 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6492 global_avg_top_loss: 1.8312 +[titan] 2025-09-10 16:31:29,730 - root - INFO - lr: 2.5637e-06 gnorm: 0.62 [2 days, 16:55:00< 8:11:52] +[titan] 2025-09-10 16:32:01,940 - root - INFO - step: 35520 loss: 2.3997 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.85 mfu: 49.02% global_avg_ntp_loss: 0.6129 global_avg_top_loss: 1.7867 +[titan] 2025-09-10 16:32:01,940 - root - INFO - lr: 2.5624e-06 gnorm: 0.52 [2 days, 16:55:33< 8:11:19] +[titan] 2025-09-10 16:32:34,160 - root - INFO - step: 35525 loss: 2.5912 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8873 +[titan] 2025-09-10 16:32:34,160 - root - INFO - lr: 2.5612e-06 gnorm: 0.63 [2 days, 16:56:05< 8:10:46] +[titan] 2025-09-10 16:33:06,433 - root - INFO - step: 35530 loss: 2.5136 memory: 122.03GiB(87.57%) tps: 10,154 tflops: 483.91 mfu: 48.93% global_avg_ntp_loss: 0.6666 global_avg_top_loss: 1.8470 +[titan] 
2025-09-10 16:33:06,433 - root - INFO - lr: 2.5600e-06 gnorm: 0.54 [2 days, 16:56:37< 8:10:13] +[titan] 2025-09-10 16:33:38,513 - root - INFO - step: 35535 loss: 2.4414 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.6321 global_avg_top_loss: 1.8093 +[titan] 2025-09-10 16:33:38,513 - root - INFO - lr: 2.5587e-06 gnorm: 0.64 [2 days, 16:57:09< 8:09:40] +[titan] 2025-09-10 16:34:10,575 - root - INFO - step: 35540 loss: 2.5449 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.6803 global_avg_top_loss: 1.8646 +[titan] 2025-09-10 16:34:10,575 - root - INFO - lr: 2.5575e-06 gnorm: 0.70 [2 days, 16:57:41< 8:09:07] +[titan] 2025-09-10 16:34:42,581 - root - INFO - step: 35545 loss: 2.4411 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.94 mfu: 49.34% global_avg_ntp_loss: 0.6311 global_avg_top_loss: 1.8100 +[titan] 2025-09-10 16:34:42,581 - root - INFO - lr: 2.5563e-06 gnorm: 0.54 [2 days, 16:58:13< 8:08:34] +[titan] 2025-09-10 16:35:08,199 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:35:14,671 - root - INFO - step: 35550 loss: 2.3833 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.6018 global_avg_top_loss: 1.7815 +[titan] 2025-09-10 16:35:14,672 - root - INFO - lr: 2.5550e-06 gnorm: 0.60 [2 days, 16:58:45< 8:08:01] +[titan] 2025-09-10 16:35:46,730 - root - INFO - step: 35555 loss: 2.3191 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.5703 global_avg_top_loss: 1.7488 +[titan] 2025-09-10 16:35:46,730 - root - INFO - lr: 2.5538e-06 gnorm: 0.73 [2 days, 16:59:17< 8:07:28] +[titan] 2025-09-10 16:36:18,865 - root - INFO - step: 35560 loss: 2.5011 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.99 mfu: 49.14% global_avg_ntp_loss: 0.6568 global_avg_top_loss: 1.8444 +[titan] 2025-09-10 16:36:18,865 - root - INFO - lr: 2.5525e-06 gnorm: 0.56 [2 days, 16:59:50< 8:06:55] +[titan] 2025-09-10 16:36:51,216 - root - INFO - step: 35565 loss: 2.4283 memory: 122.03GiB(87.57%) tps: 10,129 tflops: 482.75 mfu: 48.81% global_avg_ntp_loss: 0.6200 global_avg_top_loss: 1.8084 +[titan] 2025-09-10 16:36:51,216 - root - INFO - lr: 2.5513e-06 gnorm: 0.58 [2 days, 17:00:22< 8:06:22] +[titan] 2025-09-10 16:37:23,633 - root - INFO - step: 35570 loss: 2.5077 memory: 122.03GiB(87.57%) tps: 10,108 tflops: 481.75 mfu: 48.71% global_avg_ntp_loss: 0.6634 global_avg_top_loss: 1.8443 +[titan] 2025-09-10 16:37:23,634 - root - INFO - lr: 2.5501e-06 gnorm: 0.60 [2 days, 17:00:54< 8:05:49] +[titan] 2025-09-10 16:37:55,846 - root - INFO - step: 35575 loss: 2.4735 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.82 mfu: 49.02% global_avg_ntp_loss: 0.6449 global_avg_top_loss: 1.8286 +[titan] 2025-09-10 16:37:55,846 - root - INFO - lr: 2.5489e-06 gnorm: 0.59 [2 days, 17:01:27< 8:05:16] +[titan] 2025-09-10 16:38:27,905 - root - INFO - step: 35580 loss: 2.5906 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.7025 global_avg_top_loss: 1.8882 +[titan] 
2025-09-10 16:38:27,905 - root - INFO - lr: 2.5476e-06 gnorm: 0.58 [2 days, 17:01:59< 8:04:43] +[titan] 2025-09-10 16:39:00,176 - root - INFO - step: 35585 loss: 2.5873 memory: 122.03GiB(87.57%) tps: 10,154 tflops: 483.93 mfu: 48.93% global_avg_ntp_loss: 0.7034 global_avg_top_loss: 1.8839 +[titan] 2025-09-10 16:39:00,177 - root - INFO - lr: 2.5464e-06 gnorm: 0.60 [2 days, 17:02:31< 8:04:10] +[titan] 2025-09-10 16:39:32,766 - root - INFO - step: 35590 loss: 2.6809 memory: 122.03GiB(87.57%) tps: 10,055 tflops: 479.21 mfu: 48.45% global_avg_ntp_loss: 0.7466 global_avg_top_loss: 1.9343 +[titan] 2025-09-10 16:39:32,766 - root - INFO - lr: 2.5452e-06 gnorm: 0.66 [2 days, 17:03:04< 8:03:38] +[titan] 2025-09-10 16:40:04,799 - root - INFO - step: 35595 loss: 2.8161 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.53 mfu: 49.30% global_avg_ntp_loss: 0.8248 global_avg_top_loss: 1.9913 +[titan] 2025-09-10 16:40:04,799 - root - INFO - lr: 2.5440e-06 gnorm: 0.61 [2 days, 17:03:36< 8:03:05] +[titan] 2025-09-10 16:40:30,404 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:40:36,870 - root - INFO - step: 35600 loss: 2.5220 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.6681 global_avg_top_loss: 1.8539 +[titan] 2025-09-10 16:40:36,870 - root - INFO - lr: 2.5427e-06 gnorm: 0.57 [2 days, 17:04:08< 8:02:32] +[titan] 2025-09-10 16:41:08,992 - root - INFO - step: 35605 loss: 2.6257 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.19 mfu: 49.16% global_avg_ntp_loss: 0.7340 global_avg_top_loss: 1.8916 +[titan] 2025-09-10 16:41:08,992 - root - INFO - lr: 2.5415e-06 gnorm: 0.59 [2 days, 17:04:40< 8:01:59] +[titan] 2025-09-10 16:41:41,210 - root - INFO - step: 35610 loss: 3.0522 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.73 mfu: 49.01% global_avg_ntp_loss: 0.9596 global_avg_top_loss: 2.0926 +[titan] 2025-09-10 16:41:41,210 - root - INFO - lr: 2.5403e-06 gnorm: 0.56 [2 days, 17:05:12< 8:01:26] +[titan] 2025-09-10 16:42:13,386 - root - INFO - step: 35615 loss: 2.5288 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.36 mfu: 49.08% global_avg_ntp_loss: 0.6740 global_avg_top_loss: 1.8549 +[titan] 2025-09-10 16:42:13,386 - root - INFO - lr: 2.5391e-06 gnorm: 0.61 [2 days, 17:05:44< 8:00:53] +[titan] 2025-09-10 16:42:45,523 - root - INFO - step: 35620 loss: 2.5951 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.7053 global_avg_top_loss: 1.8899 +[titan] 2025-09-10 16:42:45,523 - root - INFO - lr: 2.5379e-06 gnorm: 0.69 [2 days, 17:06:16< 8:00:20] +[titan] 2025-09-10 16:43:17,535 - root - INFO - step: 35625 loss: 2.4524 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.6356 global_avg_top_loss: 1.8168 +[titan] 2025-09-10 16:43:17,535 - root - INFO - lr: 2.5367e-06 gnorm: 0.53 [2 days, 17:06:48< 7:59:47] +[titan] 2025-09-10 16:43:49,830 - root - INFO - step: 35630 loss: 2.4052 memory: 122.03GiB(87.57%) tps: 10,146 tflops: 483.57 mfu: 48.90% global_avg_ntp_loss: 0.6134 global_avg_top_loss: 1.7919 +[titan] 
2025-09-10 16:43:49,831 - root - INFO - lr: 2.5354e-06 gnorm: 0.60 [2 days, 17:07:21< 7:59:14] +[titan] 2025-09-10 16:44:22,363 - root - INFO - step: 35635 loss: 2.2925 memory: 122.03GiB(87.57%) tps: 10,072 tflops: 480.05 mfu: 48.54% global_avg_ntp_loss: 0.5564 global_avg_top_loss: 1.7361 +[titan] 2025-09-10 16:44:22,363 - root - INFO - lr: 2.5342e-06 gnorm: 0.70 [2 days, 17:07:53< 7:58:41] +[titan] 2025-09-10 16:44:54,553 - root - INFO - step: 35640 loss: 2.5080 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.16 mfu: 49.06% global_avg_ntp_loss: 0.6602 global_avg_top_loss: 1.8478 +[titan] 2025-09-10 16:44:54,553 - root - INFO - lr: 2.5330e-06 gnorm: 0.56 [2 days, 17:08:25< 7:58:08] +[titan] 2025-09-10 16:45:26,574 - root - INFO - step: 35645 loss: 2.4130 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.6165 global_avg_top_loss: 1.7966 +[titan] 2025-09-10 16:45:26,574 - root - INFO - lr: 2.5318e-06 gnorm: 0.59 [2 days, 17:08:57< 7:57:35] +[titan] 2025-09-10 16:45:52,468 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:45:58,959 - root - INFO - step: 35650 loss: 2.4273 memory: 122.03GiB(87.57%) tps: 10,119 tflops: 482.24 mfu: 48.76% global_avg_ntp_loss: 0.6239 global_avg_top_loss: 1.8034 +[titan] 2025-09-10 16:45:58,959 - root - INFO - lr: 2.5306e-06 gnorm: 0.58 [2 days, 17:09:30< 7:57:02] +[titan] 2025-09-10 16:46:30,958 - root - INFO - step: 35655 loss: 2.4271 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.6239 global_avg_top_loss: 1.8032 +[titan] 2025-09-10 16:46:30,958 - root - INFO - lr: 2.5294e-06 gnorm: 0.63 [2 days, 17:10:02< 7:56:29] +[titan] 2025-09-10 16:47:03,241 - root - INFO - step: 35660 loss: 2.4960 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.77 mfu: 48.91% global_avg_ntp_loss: 0.6560 global_avg_top_loss: 1.8399 +[titan] 2025-09-10 16:47:03,241 - root - INFO - lr: 2.5282e-06 gnorm: 0.57 [2 days, 17:10:34< 7:55:56] +[titan] 2025-09-10 16:47:35,382 - root - INFO - step: 35665 loss: 2.5385 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.89 mfu: 49.13% global_avg_ntp_loss: 0.6805 global_avg_top_loss: 1.8580 +[titan] 2025-09-10 16:47:35,382 - root - INFO - lr: 2.5270e-06 gnorm: 0.57 [2 days, 17:11:06< 7:55:23] +[titan] 2025-09-10 16:48:07,697 - root - INFO - step: 35670 loss: 3.0531 memory: 122.03GiB(87.57%) tps: 10,140 tflops: 483.28 mfu: 48.87% global_avg_ntp_loss: 0.9622 global_avg_top_loss: 2.0908 +[titan] 2025-09-10 16:48:07,698 - root - INFO - lr: 2.5258e-06 gnorm: 0.72 [2 days, 17:11:38< 7:54:50] +[titan] 2025-09-10 16:48:39,980 - root - INFO - step: 35675 loss: 2.5379 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.76 mfu: 48.91% global_avg_ntp_loss: 0.6746 global_avg_top_loss: 1.8633 +[titan] 2025-09-10 16:48:39,981 - root - INFO - lr: 2.5246e-06 gnorm: 0.58 [2 days, 17:12:11< 7:54:17] +[titan] 2025-09-10 16:49:12,200 - root - INFO - step: 35680 loss: 2.5094 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.6609 global_avg_top_loss: 1.8485 +[titan] 
2025-09-10 16:49:12,200 - root - INFO - lr: 2.5234e-06 gnorm: 0.57 [2 days, 17:12:43< 7:53:44] +[titan] 2025-09-10 16:49:44,425 - root - INFO - step: 35685 loss: 2.6113 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.64 mfu: 49.00% global_avg_ntp_loss: 0.7092 global_avg_top_loss: 1.9021 +[titan] 2025-09-10 16:49:44,425 - root - INFO - lr: 2.5222e-06 gnorm: 0.58 [2 days, 17:13:15< 7:53:11] +[titan] 2025-09-10 16:50:16,399 - root - INFO - step: 35690 loss: 2.5552 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.6852 global_avg_top_loss: 1.8701 +[titan] 2025-09-10 16:50:16,399 - root - INFO - lr: 2.5210e-06 gnorm: 0.58 [2 days, 17:13:47< 7:52:38] +[titan] 2025-09-10 16:50:48,844 - root - INFO - step: 35695 loss: 2.6156 memory: 122.03GiB(87.57%) tps: 10,100 tflops: 481.34 mfu: 48.67% global_avg_ntp_loss: 0.7102 global_avg_top_loss: 1.9054 +[titan] 2025-09-10 16:50:48,845 - root - INFO - lr: 2.5198e-06 gnorm: 0.75 [2 days, 17:14:20< 7:52:05] +[titan] 2025-09-10 16:51:14,904 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:51:21,487 - root - INFO - step: 35700 loss: 2.5080 memory: 122.03GiB(87.57%) tps: 10,039 tflops: 478.43 mfu: 48.38% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8463 +[titan] 2025-09-10 16:51:21,487 - root - INFO - lr: 2.5186e-06 gnorm: 0.69 [2 days, 17:14:52< 7:51:32] +[titan] 2025-09-10 16:51:53,577 - root - INFO - step: 35705 loss: 2.5396 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.6837 global_avg_top_loss: 1.8559 +[titan] 2025-09-10 16:51:53,577 - root - INFO - lr: 2.5174e-06 gnorm: 0.54 [2 days, 17:15:24< 7:50:59] +[titan] 2025-09-10 16:52:25,797 - root - INFO - step: 35710 loss: 2.3636 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.69 mfu: 49.01% global_avg_ntp_loss: 0.5924 global_avg_top_loss: 1.7712 +[titan] 2025-09-10 16:52:25,798 - root - INFO - lr: 2.5162e-06 gnorm: 0.60 [2 days, 17:15:57< 7:50:26] +[titan] 2025-09-10 16:52:57,722 - root - INFO - step: 35715 loss: 2.3714 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.5942 global_avg_top_loss: 1.7772 +[titan] 2025-09-10 16:52:57,723 - root - INFO - lr: 2.5150e-06 gnorm: 0.76 [2 days, 17:16:28< 7:49:53] +[titan] 2025-09-10 16:53:30,200 - root - INFO - step: 35720 loss: 2.4713 memory: 122.03GiB(87.57%) tps: 10,090 tflops: 480.86 mfu: 48.62% global_avg_ntp_loss: 0.6451 global_avg_top_loss: 1.8262 +[titan] 2025-09-10 16:53:30,200 - root - INFO - lr: 2.5138e-06 gnorm: 0.56 [2 days, 17:17:01< 7:49:20] +[titan] 2025-09-10 16:54:02,386 - root - INFO - step: 35725 loss: 2.4657 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.22 mfu: 49.06% global_avg_ntp_loss: 0.6375 global_avg_top_loss: 1.8282 +[titan] 2025-09-10 16:54:02,386 - root - INFO - lr: 2.5126e-06 gnorm: 0.88 [2 days, 17:17:33< 7:48:47] +[titan] 2025-09-10 16:54:34,637 - root - INFO - step: 35730 loss: 2.3586 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.24 mfu: 48.96% global_avg_ntp_loss: 0.5955 global_avg_top_loss: 1.7631 +[titan] 
2025-09-10 16:54:34,637 - root - INFO - lr: 2.5114e-06 gnorm: 0.56 [2 days, 17:18:05< 7:48:14] +[titan] 2025-09-10 16:55:06,901 - root - INFO - step: 35735 loss: 2.4988 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.04 mfu: 48.94% global_avg_ntp_loss: 0.6540 global_avg_top_loss: 1.8447 +[titan] 2025-09-10 16:55:06,902 - root - INFO - lr: 2.5103e-06 gnorm: 0.59 [2 days, 17:18:38< 7:47:41] +[titan] 2025-09-10 16:55:38,875 - root - INFO - step: 35740 loss: 2.4889 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.6539 global_avg_top_loss: 1.8350 +[titan] 2025-09-10 16:55:38,876 - root - INFO - lr: 2.5091e-06 gnorm: 0.57 [2 days, 17:19:10< 7:47:08] +[titan] 2025-09-10 16:56:11,248 - root - INFO - step: 35745 loss: 2.5412 memory: 122.03GiB(87.57%) tps: 10,122 tflops: 482.42 mfu: 48.78% global_avg_ntp_loss: 0.6790 global_avg_top_loss: 1.8621 +[titan] 2025-09-10 16:56:11,248 - root - INFO - lr: 2.5079e-06 gnorm: 0.60 [2 days, 17:19:42< 7:46:35] +[titan] 2025-09-10 16:56:37,194 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 16:56:43,648 - root - INFO - step: 35750 loss: 2.7200 memory: 122.03GiB(87.57%) tps: 10,114 tflops: 482.01 mfu: 48.74% global_avg_ntp_loss: 0.7664 global_avg_top_loss: 1.9536 +[titan] 2025-09-10 16:56:43,649 - root - INFO - lr: 2.5067e-06 gnorm: 0.61 [2 days, 17:20:14< 7:46:02] +[titan] 2025-09-10 16:57:15,637 - root - INFO - step: 35755 loss: 2.5435 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.36% global_avg_ntp_loss: 0.6784 global_avg_top_loss: 1.8651 +[titan] 2025-09-10 16:57:15,637 - root - INFO - lr: 2.5055e-06 gnorm: 0.60 [2 days, 17:20:46< 7:45:29] +[titan] 2025-09-10 16:57:47,927 - root - INFO - step: 35760 loss: 2.5351 memory: 122.03GiB(87.57%) tps: 10,148 tflops: 483.66 mfu: 48.90% global_avg_ntp_loss: 0.6707 global_avg_top_loss: 1.8644 +[titan] 2025-09-10 16:57:47,927 - root - INFO - lr: 2.5044e-06 gnorm: 0.58 [2 days, 17:21:19< 7:44:56] +[titan] 2025-09-10 16:58:20,177 - root - INFO - step: 35765 loss: 2.4644 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.24 mfu: 48.96% global_avg_ntp_loss: 0.6426 global_avg_top_loss: 1.8218 +[titan] 2025-09-10 16:58:20,178 - root - INFO - lr: 2.5032e-06 gnorm: 0.59 [2 days, 17:21:51< 7:44:23] +[titan] 2025-09-10 16:58:52,310 - root - INFO - step: 35770 loss: 2.5586 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.6879 global_avg_top_loss: 1.8707 +[titan] 2025-09-10 16:58:52,311 - root - INFO - lr: 2.5020e-06 gnorm: 0.55 [2 days, 17:22:23< 7:43:50] +[titan] 2025-09-10 16:59:24,516 - root - INFO - step: 35775 loss: 2.5874 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.93 mfu: 49.03% global_avg_ntp_loss: 0.6999 global_avg_top_loss: 1.8874 +[titan] 2025-09-10 16:59:24,516 - root - INFO - lr: 2.5008e-06 gnorm: 0.74 [2 days, 17:22:55< 7:43:17] +[titan] 2025-09-10 16:59:56,563 - root - INFO - step: 35780 loss: 2.5821 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.6961 global_avg_top_loss: 1.8860 +[titan] 
2025-09-10 16:59:56,563 - root - INFO - lr: 2.4996e-06 gnorm: 0.71 [2 days, 17:23:27< 7:42:44] +[titan] 2025-09-10 17:00:28,753 - root - INFO - step: 35785 loss: 2.4120 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.6194 global_avg_top_loss: 1.7926 +[titan] 2025-09-10 17:00:28,753 - root - INFO - lr: 2.4985e-06 gnorm: 0.53 [2 days, 17:23:59< 7:42:11] +[titan] 2025-09-10 17:01:01,238 - root - INFO - step: 35790 loss: 2.3560 memory: 122.03GiB(87.57%) tps: 10,087 tflops: 480.74 mfu: 48.61% global_avg_ntp_loss: 0.5931 global_avg_top_loss: 1.7629 +[titan] 2025-09-10 17:01:01,239 - root - INFO - lr: 2.4973e-06 gnorm: 0.64 [2 days, 17:24:32< 7:41:38] +[titan] 2025-09-10 17:01:33,411 - root - INFO - step: 35795 loss: 2.3759 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.43 mfu: 49.08% global_avg_ntp_loss: 0.5946 global_avg_top_loss: 1.7813 +[titan] 2025-09-10 17:01:33,411 - root - INFO - lr: 2.4961e-06 gnorm: 0.72 [2 days, 17:25:04< 7:41:05] +[titan] 2025-09-10 17:01:59,208 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:02:05,781 - root - INFO - step: 35800 loss: 3.0188 memory: 122.03GiB(87.57%) tps: 10,123 tflops: 482.46 mfu: 48.78% global_avg_ntp_loss: 0.9464 global_avg_top_loss: 2.0724 +[titan] 2025-09-10 17:02:05,781 - root - INFO - lr: 2.4950e-06 gnorm: 0.55 [2 days, 17:25:37< 7:40:32] +[titan] 2025-09-10 17:02:38,008 - root - INFO - step: 35805 loss: 2.4224 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 0.6173 global_avg_top_loss: 1.8051 +[titan] 2025-09-10 17:02:38,008 - root - INFO - lr: 2.4938e-06 gnorm: 0.61 [2 days, 17:26:09< 7:39:59] +[titan] 2025-09-10 17:03:10,177 - root - INFO - step: 35810 loss: 2.3936 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.6114 global_avg_top_loss: 1.7822 +[titan] 2025-09-10 17:03:10,178 - root - INFO - lr: 2.4926e-06 gnorm: 0.55 [2 days, 17:26:41< 7:39:26] +[titan] 2025-09-10 17:03:42,502 - root - INFO - step: 35815 loss: 2.4134 memory: 122.03GiB(87.57%) tps: 10,137 tflops: 483.13 mfu: 48.85% global_avg_ntp_loss: 0.6179 global_avg_top_loss: 1.7954 +[titan] 2025-09-10 17:03:42,502 - root - INFO - lr: 2.4915e-06 gnorm: 0.62 [2 days, 17:27:13< 7:38:53] +[titan] 2025-09-10 17:04:14,927 - root - INFO - step: 35820 loss: 2.5316 memory: 122.03GiB(87.57%) tps: 10,106 tflops: 481.64 mfu: 48.70% global_avg_ntp_loss: 0.6708 global_avg_top_loss: 1.8609 +[titan] 2025-09-10 17:04:14,927 - root - INFO - lr: 2.4903e-06 gnorm: 0.60 [2 days, 17:27:46< 7:38:20] +[titan] 2025-09-10 17:04:47,014 - root - INFO - step: 35825 loss: 2.9024 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.8707 global_avg_top_loss: 2.0318 +[titan] 2025-09-10 17:04:47,014 - root - INFO - lr: 2.4891e-06 gnorm: 0.64 [2 days, 17:28:18< 7:37:47] +[titan] 2025-09-10 17:05:19,172 - root - INFO - step: 35830 loss: 2.6354 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.7234 global_avg_top_loss: 1.9121 +[titan] 
2025-09-10 17:05:19,172 - root - INFO - lr: 2.4880e-06 gnorm: 0.70 [2 days, 17:28:50< 7:37:14] +[titan] 2025-09-10 17:05:51,395 - root - INFO - step: 35835 loss: 2.5337 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.66 mfu: 49.01% global_avg_ntp_loss: 0.6703 global_avg_top_loss: 1.8634 +[titan] 2025-09-10 17:05:51,395 - root - INFO - lr: 2.4868e-06 gnorm: 0.64 [2 days, 17:29:22< 7:36:42] +[titan] 2025-09-10 17:06:23,519 - root - INFO - step: 35840 loss: 2.5711 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.6875 global_avg_top_loss: 1.8836 +[titan] 2025-09-10 17:06:23,519 - root - INFO - lr: 2.4857e-06 gnorm: 0.56 [2 days, 17:29:54< 7:36:09] +[titan] 2025-09-10 17:06:23,819 - root - INFO - Dumping profiler traces at step 35840 +[titan] 2025-09-10 17:06:23,888 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 17:06:55,807 - root - INFO - step: 35845 loss: 2.4734 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.69 mfu: 48.91% global_avg_ntp_loss: 0.6448 global_avg_top_loss: 1.8286 +[titan] 2025-09-10 17:06:55,807 - root - INFO - lr: 2.4845e-06 gnorm: 0.62 [2 days, 17:30:27< 7:35:36] +[titan] 2025-09-10 17:07:21,414 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:07:27,870 - root - INFO - step: 35850 loss: 2.6186 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.7113 global_avg_top_loss: 1.9074 +[titan] 2025-09-10 17:07:27,870 - root - INFO - lr: 2.4834e-06 gnorm: 0.57 [2 days, 17:30:59< 7:35:03] +[titan] 2025-09-10 17:08:00,323 - root - INFO - step: 35855 loss: 2.5292 memory: 122.03GiB(87.57%) tps: 10,097 tflops: 481.22 mfu: 48.66% global_avg_ntp_loss: 0.6714 global_avg_top_loss: 1.8578 +[titan] 2025-09-10 17:08:00,323 - root - INFO - lr: 2.4822e-06 gnorm: 0.69 [2 days, 17:31:31< 7:34:30] +[titan] 2025-09-10 17:08:32,248 - root - INFO - step: 35860 loss: 2.5298 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.6734 global_avg_top_loss: 1.8564 +[titan] 2025-09-10 17:08:32,248 - root - INFO - lr: 2.4811e-06 gnorm: 0.69 [2 days, 17:32:03< 7:33:57] +[titan] 2025-09-10 17:09:04,665 - root - INFO - step: 35865 loss: 2.4196 memory: 122.03GiB(87.57%) tps: 10,108 tflops: 481.76 mfu: 48.71% global_avg_ntp_loss: 0.6242 global_avg_top_loss: 1.7954 +[titan] 2025-09-10 17:09:04,665 - root - INFO - lr: 2.4799e-06 gnorm: 0.54 [2 days, 17:32:35< 7:33:24] +[titan] 2025-09-10 17:09:36,596 - root - INFO - step: 35870 loss: 2.3486 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.5826 global_avg_top_loss: 1.7660 +[titan] 2025-09-10 17:09:36,596 - root - INFO - lr: 2.4788e-06 gnorm: 0.64 [2 days, 17:33:07< 7:32:51] +[titan] 2025-09-10 17:10:09,143 - root - INFO - step: 35875 loss: 2.4064 memory: 122.03GiB(87.57%) tps: 10,068 tflops: 479.84 mfu: 48.52% global_avg_ntp_loss: 0.6091 global_avg_top_loss: 1.7973 +[titan] 2025-09-10 17:10:09,143 - root - INFO - lr: 2.4776e-06 gnorm: 0.77 [2 days, 17:33:40< 7:32:18] +[titan] 2025-09-10 17:10:41,257 - root - INFO - step: 35880 loss: 2.3271 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.29 mfu: 49.17% global_avg_ntp_loss: 0.5826 global_avg_top_loss: 1.7445 +[titan] 
2025-09-10 17:10:41,258 - root - INFO - lr: 2.4765e-06 gnorm: 0.51 [2 days, 17:34:12< 7:31:45] +[titan] 2025-09-10 17:11:13,519 - root - INFO - step: 35885 loss: 2.3995 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.09 mfu: 48.95% global_avg_ntp_loss: 0.6076 global_avg_top_loss: 1.7918 +[titan] 2025-09-10 17:11:13,519 - root - INFO - lr: 2.4753e-06 gnorm: 0.58 [2 days, 17:34:44< 7:31:12] +[titan] 2025-09-10 17:11:45,574 - root - INFO - step: 35890 loss: 2.5711 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.6892 global_avg_top_loss: 1.8819 +[titan] 2025-09-10 17:11:45,574 - root - INFO - lr: 2.4742e-06 gnorm: 0.61 [2 days, 17:35:16< 7:30:39] +[titan] 2025-09-10 17:12:17,733 - root - INFO - step: 35895 loss: 2.3785 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.6001 global_avg_top_loss: 1.7784 +[titan] 2025-09-10 17:12:17,733 - root - INFO - lr: 2.4730e-06 gnorm: 0.56 [2 days, 17:35:48< 7:30:06] +[titan] 2025-09-10 17:12:43,363 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:12:49,903 - root - INFO - step: 35900 loss: 2.5643 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.46 mfu: 49.09% global_avg_ntp_loss: 0.6875 global_avg_top_loss: 1.8768 +[titan] 2025-09-10 17:12:49,903 - root - INFO - lr: 2.4719e-06 gnorm: 0.57 [2 days, 17:36:21< 7:29:33] +[titan] 2025-09-10 17:13:22,091 - root - INFO - step: 35905 loss: 2.6372 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.18 mfu: 49.06% global_avg_ntp_loss: 0.7277 global_avg_top_loss: 1.9095 +[titan] 2025-09-10 17:13:22,091 - root - INFO - lr: 2.4707e-06 gnorm: 0.66 [2 days, 17:36:53< 7:29:00] +[titan] 2025-09-10 17:13:54,389 - root - INFO - step: 35910 loss: 2.8248 memory: 122.03GiB(87.57%) tps: 10,146 tflops: 483.54 mfu: 48.89% global_avg_ntp_loss: 0.8082 global_avg_top_loss: 2.0167 +[titan] 2025-09-10 17:13:54,389 - root - INFO - lr: 2.4696e-06 gnorm: 0.65 [2 days, 17:37:25< 7:28:27] +[titan] 2025-09-10 17:14:26,466 - root - INFO - step: 35915 loss: 2.5096 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.6608 global_avg_top_loss: 1.8488 +[titan] 2025-09-10 17:14:26,466 - root - INFO - lr: 2.4685e-06 gnorm: 0.57 [2 days, 17:37:57< 7:27:54] +[titan] 2025-09-10 17:14:58,576 - root - INFO - step: 35920 loss: 2.5000 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.6536 global_avg_top_loss: 1.8463 +[titan] 2025-09-10 17:14:58,576 - root - INFO - lr: 2.4673e-06 gnorm: 0.58 [2 days, 17:38:29< 7:27:21] +[titan] 2025-09-10 17:15:30,852 - root - INFO - step: 35925 loss: 2.3887 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.85 mfu: 48.92% global_avg_ntp_loss: 0.6071 global_avg_top_loss: 1.7816 +[titan] 2025-09-10 17:15:30,853 - root - INFO - lr: 2.4662e-06 gnorm: 0.60 [2 days, 17:39:02< 7:26:48] +[titan] 2025-09-10 17:16:03,065 - root - INFO - step: 35930 loss: 2.6294 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.81 mfu: 49.02% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9035 +[titan] 
2025-09-10 17:16:03,065 - root - INFO - lr: 2.4651e-06 gnorm: 0.54 [2 days, 17:39:34< 7:26:15] +[titan] 2025-09-10 17:16:35,068 - root - INFO - step: 35935 loss: 2.5567 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.6859 global_avg_top_loss: 1.8708 +[titan] 2025-09-10 17:16:35,068 - root - INFO - lr: 2.4639e-06 gnorm: 0.73 [2 days, 17:40:06< 7:25:42] +[titan] 2025-09-10 17:17:07,249 - root - INFO - step: 35940 loss: 2.5177 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.29 mfu: 49.07% global_avg_ntp_loss: 0.6684 global_avg_top_loss: 1.8493 +[titan] 2025-09-10 17:17:07,249 - root - INFO - lr: 2.4628e-06 gnorm: 0.71 [2 days, 17:40:38< 7:25:09] +[titan] 2025-09-10 17:17:39,705 - root - INFO - step: 35945 loss: 2.3978 memory: 122.03GiB(87.57%) tps: 10,096 tflops: 481.18 mfu: 48.65% global_avg_ntp_loss: 0.6102 global_avg_top_loss: 1.7876 +[titan] 2025-09-10 17:17:39,705 - root - INFO - lr: 2.4617e-06 gnorm: 0.56 [2 days, 17:41:10< 7:24:36] +[titan] 2025-09-10 17:18:05,435 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:18:11,827 - root - INFO - step: 35950 loss: 2.4600 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 0.6378 global_avg_top_loss: 1.8222 +[titan] 2025-09-10 17:18:11,827 - root - INFO - lr: 2.4605e-06 gnorm: 0.60 [2 days, 17:41:43< 7:24:03] +[titan] 2025-09-10 17:18:43,821 - root - INFO - step: 35955 loss: 2.3415 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.5791 global_avg_top_loss: 1.7625 +[titan] 2025-09-10 17:18:43,821 - root - INFO - lr: 2.4594e-06 gnorm: 0.79 [2 days, 17:42:15< 7:23:30] +[titan] 2025-09-10 17:19:16,024 - root - INFO - step: 35960 loss: 2.4266 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.96 mfu: 49.04% global_avg_ntp_loss: 0.6241 global_avg_top_loss: 1.8025 +[titan] 2025-09-10 17:19:16,024 - root - INFO - lr: 2.4583e-06 gnorm: 0.55 [2 days, 17:42:47< 7:22:57] +[titan] 2025-09-10 17:19:48,347 - root - INFO - step: 35965 loss: 2.4896 memory: 122.03GiB(87.57%) tps: 10,138 tflops: 483.17 mfu: 48.85% global_avg_ntp_loss: 0.6487 global_avg_top_loss: 1.8409 +[titan] 2025-09-10 17:19:48,347 - root - INFO - lr: 2.4572e-06 gnorm: 0.61 [2 days, 17:43:19< 7:22:24] +[titan] 2025-09-10 17:20:20,552 - root - INFO - step: 35970 loss: 2.5474 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.93 mfu: 49.03% global_avg_ntp_loss: 0.6795 global_avg_top_loss: 1.8679 +[titan] 2025-09-10 17:20:20,552 - root - INFO - lr: 2.4560e-06 gnorm: 0.61 [2 days, 17:43:51< 7:21:51] +[titan] 2025-09-10 17:20:52,647 - root - INFO - step: 35975 loss: 2.2821 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.5562 global_avg_top_loss: 1.7259 +[titan] 2025-09-10 17:20:52,648 - root - INFO - lr: 2.4549e-06 gnorm: 0.60 [2 days, 17:44:23< 7:21:18] +[titan] 2025-09-10 17:21:24,946 - root - INFO - step: 35980 loss: 2.5666 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.52 mfu: 48.89% global_avg_ntp_loss: 0.6894 global_avg_top_loss: 1.8772 +[titan] 
2025-09-10 17:21:24,946 - root - INFO - lr: 2.4538e-06 gnorm: 0.58 [2 days, 17:44:56< 7:20:45] +[titan] 2025-09-10 17:21:56,887 - root - INFO - step: 35985 loss: 2.6377 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.7301 global_avg_top_loss: 1.9076 +[titan] 2025-09-10 17:21:56,888 - root - INFO - lr: 2.4527e-06 gnorm: 0.59 [2 days, 17:45:28< 7:20:12] +[titan] 2025-09-10 17:22:29,138 - root - INFO - step: 35990 loss: 2.6634 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.24 mfu: 48.96% global_avg_ntp_loss: 0.7413 global_avg_top_loss: 1.9222 +[titan] 2025-09-10 17:22:29,138 - root - INFO - lr: 2.4516e-06 gnorm: 0.74 [2 days, 17:46:00< 7:19:39] +[titan] 2025-09-10 17:23:01,521 - root - INFO - step: 35995 loss: 2.5767 memory: 122.03GiB(87.57%) tps: 10,119 tflops: 482.28 mfu: 48.76% global_avg_ntp_loss: 0.6919 global_avg_top_loss: 1.8848 +[titan] 2025-09-10 17:23:01,521 - root - INFO - lr: 2.4505e-06 gnorm: 0.62 [2 days, 17:46:32< 7:19:06] +[titan] 2025-09-10 17:23:27,106 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:23:33,501 - root - INFO - step: 36000 loss: 2.5104 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8487 +[titan] 2025-09-10 17:23:33,501 - root - INFO - lr: 2.4493e-06 gnorm: 0.57 [2 days, 17:47:04< 7:18:33] +[titan] 2025-09-10 17:24:05,606 - root - INFO - step: 36005 loss: 2.4369 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.18% global_avg_ntp_loss: 0.6289 global_avg_top_loss: 1.8080 +[titan] 2025-09-10 17:24:05,606 - root - INFO - lr: 2.4482e-06 gnorm: 0.62 [2 days, 17:47:36< 7:18:00] +[titan] 2025-09-10 17:24:37,830 - root - INFO - step: 36010 loss: 2.6117 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.64 mfu: 49.00% global_avg_ntp_loss: 0.7108 global_avg_top_loss: 1.9009 +[titan] 2025-09-10 17:24:37,830 - root - INFO - lr: 2.4471e-06 gnorm: 0.58 [2 days, 17:48:09< 7:17:27] +[titan] 2025-09-10 17:25:10,050 - root - INFO - step: 36015 loss: 2.5087 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.6620 global_avg_top_loss: 1.8467 +[titan] 2025-09-10 17:25:10,050 - root - INFO - lr: 2.4460e-06 gnorm: 0.64 [2 days, 17:48:41< 7:16:54] +[titan] 2025-09-10 17:25:42,148 - root - INFO - step: 36020 loss: 2.5915 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7028 global_avg_top_loss: 1.8887 +[titan] 2025-09-10 17:25:42,148 - root - INFO - lr: 2.4449e-06 gnorm: 0.65 [2 days, 17:49:13< 7:16:21] +[titan] 2025-09-10 17:26:14,312 - root - INFO - step: 36025 loss: 2.5001 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.10% global_avg_ntp_loss: 0.6583 global_avg_top_loss: 1.8419 +[titan] 2025-09-10 17:26:14,312 - root - INFO - lr: 2.4438e-06 gnorm: 0.55 [2 days, 17:49:45< 7:15:48] +[titan] 2025-09-10 17:26:46,271 - root - INFO - step: 36030 loss: 2.3661 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.5941 global_avg_top_loss: 1.7721 +[titan] 
2025-09-10 17:26:46,271 - root - INFO - lr: 2.4427e-06 gnorm: 0.65 [2 days, 17:50:17< 7:15:15] +[titan] 2025-09-10 17:27:18,247 - root - INFO - step: 36035 loss: 2.3813 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.5985 global_avg_top_loss: 1.7828 +[titan] 2025-09-10 17:27:18,248 - root - INFO - lr: 2.4416e-06 gnorm: 0.71 [2 days, 17:50:49< 7:14:43] +[titan] 2025-09-10 17:27:50,218 - root - INFO - step: 36040 loss: 2.4700 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.6455 global_avg_top_loss: 1.8245 +[titan] 2025-09-10 17:27:50,218 - root - INFO - lr: 2.4405e-06 gnorm: 0.57 [2 days, 17:51:21< 7:14:10] +[titan] 2025-09-10 17:28:22,434 - root - INFO - step: 36045 loss: 2.4606 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.76 mfu: 49.02% global_avg_ntp_loss: 0.6416 global_avg_top_loss: 1.8190 +[titan] 2025-09-10 17:28:22,434 - root - INFO - lr: 2.4394e-06 gnorm: 0.56 [2 days, 17:51:53< 7:13:37] +[titan] 2025-09-10 17:28:48,057 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:28:54,455 - root - INFO - step: 36050 loss: 2.4830 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.6567 global_avg_top_loss: 1.8263 +[titan] 2025-09-10 17:28:54,456 - root - INFO - lr: 2.4383e-06 gnorm: 0.72 [2 days, 17:52:25< 7:13:04] +[titan] 2025-09-10 17:29:26,740 - root - INFO - step: 36055 loss: 2.4021 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.73 mfu: 48.91% global_avg_ntp_loss: 0.6087 global_avg_top_loss: 1.7934 +[titan] 2025-09-10 17:29:26,741 - root - INFO - lr: 2.4372e-06 gnorm: 0.61 [2 days, 17:52:57< 7:12:31] +[titan] 2025-09-10 17:29:58,854 - root - INFO - step: 36060 loss: 2.5103 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6617 global_avg_top_loss: 1.8486 +[titan] 2025-09-10 17:29:58,854 - root - INFO - lr: 2.4361e-06 gnorm: 0.58 [2 days, 17:53:30< 7:11:58] +[titan] 2025-09-10 17:30:31,145 - root - INFO - step: 36065 loss: 2.5619 memory: 122.03GiB(87.57%) tps: 10,148 tflops: 483.63 mfu: 48.90% global_avg_ntp_loss: 0.6951 global_avg_top_loss: 1.8668 +[titan] 2025-09-10 17:30:31,145 - root - INFO - lr: 2.4350e-06 gnorm: 0.59 [2 days, 17:54:02< 7:11:25] +[titan] 2025-09-10 17:31:03,326 - root - INFO - step: 36070 loss: 2.5843 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.29 mfu: 49.07% global_avg_ntp_loss: 0.6973 global_avg_top_loss: 1.8870 +[titan] 2025-09-10 17:31:03,326 - root - INFO - lr: 2.4339e-06 gnorm: 0.67 [2 days, 17:54:34< 7:10:52] +[titan] 2025-09-10 17:31:35,977 - root - INFO - step: 36075 loss: 2.5390 memory: 122.03GiB(87.57%) tps: 10,036 tflops: 478.30 mfu: 48.36% global_avg_ntp_loss: 0.6739 global_avg_top_loss: 1.8651 +[titan] 2025-09-10 17:31:35,977 - root - INFO - lr: 2.4328e-06 gnorm: 0.60 [2 days, 17:55:07< 7:10:19] +[titan] 2025-09-10 17:32:08,158 - root - INFO - step: 36080 loss: 2.5252 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.30 mfu: 49.07% global_avg_ntp_loss: 0.6639 global_avg_top_loss: 1.8613 +[titan] 
2025-09-10 17:32:08,158 - root - INFO - lr: 2.4317e-06 gnorm: 0.56 [2 days, 17:55:39< 7:09:46] +[titan] 2025-09-10 17:32:40,178 - root - INFO - step: 36085 loss: 2.5771 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.6907 global_avg_top_loss: 1.8864 +[titan] 2025-09-10 17:32:40,178 - root - INFO - lr: 2.4306e-06 gnorm: 0.66 [2 days, 17:56:11< 7:09:13] +[titan] 2025-09-10 17:33:12,176 - root - INFO - step: 36090 loss: 2.7131 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7783 global_avg_top_loss: 1.9348 +[titan] 2025-09-10 17:33:12,176 - root - INFO - lr: 2.4295e-06 gnorm: 0.57 [2 days, 17:56:43< 7:08:40] +[titan] 2025-09-10 17:33:44,309 - root - INFO - step: 36095 loss: 2.5989 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.7018 global_avg_top_loss: 1.8972 +[titan] 2025-09-10 17:33:44,309 - root - INFO - lr: 2.4284e-06 gnorm: 0.65 [2 days, 17:57:15< 7:08:07] +[titan] 2025-09-10 17:34:09,830 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:34:16,247 - root - INFO - step: 36100 loss: 2.5940 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.98 mfu: 49.44% global_avg_ntp_loss: 0.7000 global_avg_top_loss: 1.8940 +[titan] 2025-09-10 17:34:16,248 - root - INFO - lr: 2.4273e-06 gnorm: 0.72 [2 days, 17:57:47< 7:07:34] +[titan] 2025-09-10 17:34:48,558 - root - INFO - step: 36105 loss: 2.4737 memory: 122.03GiB(87.57%) tps: 10,142 tflops: 483.35 mfu: 48.87% global_avg_ntp_loss: 0.6518 global_avg_top_loss: 1.8219 +[titan] 2025-09-10 17:34:48,558 - root - INFO - lr: 2.4262e-06 gnorm: 0.54 [2 days, 17:58:19< 7:07:01] +[titan] 2025-09-10 17:35:20,708 - root - INFO - step: 36110 loss: 2.6904 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.76 mfu: 49.12% global_avg_ntp_loss: 0.7704 global_avg_top_loss: 1.9200 +[titan] 2025-09-10 17:35:20,708 - root - INFO - lr: 2.4252e-06 gnorm: 0.63 [2 days, 17:58:51< 7:06:28] +[titan] 2025-09-10 17:35:52,929 - root - INFO - step: 36115 loss: 2.3445 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.70 mfu: 49.01% global_avg_ntp_loss: 0.5823 global_avg_top_loss: 1.7622 +[titan] 2025-09-10 17:35:52,929 - root - INFO - lr: 2.4241e-06 gnorm: 0.70 [2 days, 17:59:24< 7:05:55] +[titan] 2025-09-10 17:36:25,015 - root - INFO - step: 36120 loss: 2.5193 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.6671 global_avg_top_loss: 1.8522 +[titan] 2025-09-10 17:36:25,015 - root - INFO - lr: 2.4230e-06 gnorm: 0.57 [2 days, 17:59:56< 7:05:22] +[titan] 2025-09-10 17:36:57,111 - root - INFO - step: 36125 loss: 2.4011 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.58 mfu: 49.20% global_avg_ntp_loss: 0.6084 global_avg_top_loss: 1.7927 +[titan] 2025-09-10 17:36:57,111 - root - INFO - lr: 2.4219e-06 gnorm: 0.62 [2 days, 18:00:28< 7:04:49] +[titan] 2025-09-10 17:37:29,176 - root - INFO - step: 36130 loss: 2.5091 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.6660 global_avg_top_loss: 1.8431 +[titan] 
2025-09-10 17:37:29,176 - root - INFO - lr: 2.4208e-06 gnorm: 0.60 [2 days, 18:01:00< 7:04:16] +[titan] 2025-09-10 17:38:01,237 - root - INFO - step: 36135 loss: 2.3309 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.5820 global_avg_top_loss: 1.7489 +[titan] 2025-09-10 17:38:01,238 - root - INFO - lr: 2.4198e-06 gnorm: 0.60 [2 days, 18:01:32< 7:03:43] +[titan] 2025-09-10 17:38:33,409 - root - INFO - step: 36140 loss: 2.5928 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.43 mfu: 49.08% global_avg_ntp_loss: 0.6981 global_avg_top_loss: 1.8947 +[titan] 2025-09-10 17:38:33,409 - root - INFO - lr: 2.4187e-06 gnorm: 0.60 [2 days, 18:02:04< 7:03:10] +[titan] 2025-09-10 17:39:05,316 - root - INFO - step: 36145 loss: 2.5999 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7055 global_avg_top_loss: 1.8943 +[titan] 2025-09-10 17:39:05,316 - root - INFO - lr: 2.4176e-06 gnorm: 0.62 [2 days, 18:02:36< 7:02:37] +[titan] 2025-09-10 17:39:31,036 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:39:37,497 - root - INFO - step: 36150 loss: 2.8209 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.30 mfu: 49.07% global_avg_ntp_loss: 0.8337 global_avg_top_loss: 1.9872 +[titan] 2025-09-10 17:39:37,497 - root - INFO - lr: 2.4165e-06 gnorm: 0.69 [2 days, 18:03:08< 7:02:04] +[titan] 2025-09-10 17:40:09,925 - root - INFO - step: 36155 loss: 2.5746 memory: 122.03GiB(87.57%) tps: 10,105 tflops: 481.59 mfu: 48.70% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8824 +[titan] 2025-09-10 17:40:09,925 - root - INFO - lr: 2.4155e-06 gnorm: 0.60 [2 days, 18:03:41< 7:01:31] +[titan] 2025-09-10 17:40:41,937 - root - INFO - step: 36160 loss: 2.4668 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.6400 global_avg_top_loss: 1.8268 +[titan] 2025-09-10 17:40:41,937 - root - INFO - lr: 2.4144e-06 gnorm: 0.57 [2 days, 18:04:13< 7:00:58] +[titan] 2025-09-10 17:41:14,018 - root - INFO - step: 36165 loss: 2.4920 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.6564 global_avg_top_loss: 1.8356 +[titan] 2025-09-10 17:41:14,018 - root - INFO - lr: 2.4133e-06 gnorm: 0.61 [2 days, 18:04:45< 7:00:25] +[titan] 2025-09-10 17:41:46,122 - root - INFO - step: 36170 loss: 2.6141 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7141 global_avg_top_loss: 1.9001 +[titan] 2025-09-10 17:41:46,122 - root - INFO - lr: 2.4122e-06 gnorm: 0.56 [2 days, 18:05:17< 6:59:52] +[titan] 2025-09-10 17:42:18,260 - root - INFO - step: 36175 loss: 2.5758 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.94 mfu: 49.13% global_avg_ntp_loss: 0.6939 global_avg_top_loss: 1.8819 +[titan] 2025-09-10 17:42:18,261 - root - INFO - lr: 2.4112e-06 gnorm: 0.75 [2 days, 18:05:49< 6:59:19] +[titan] 2025-09-10 17:42:50,433 - root - INFO - step: 36180 loss: 2.7009 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.41 mfu: 49.08% global_avg_ntp_loss: 0.7706 global_avg_top_loss: 1.9303 +[titan] 
2025-09-10 17:42:50,434 - root - INFO - lr: 2.4101e-06 gnorm: 0.73 [2 days, 18:06:21< 6:58:46] +[titan] 2025-09-10 17:43:22,483 - root - INFO - step: 36185 loss: 2.3985 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.6122 global_avg_top_loss: 1.7864 +[titan] 2025-09-10 17:43:22,483 - root - INFO - lr: 2.4090e-06 gnorm: 0.57 [2 days, 18:06:53< 6:58:13] +[titan] 2025-09-10 17:43:54,678 - root - INFO - step: 36190 loss: 2.3364 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.5789 global_avg_top_loss: 1.7575 +[titan] 2025-09-10 17:43:54,678 - root - INFO - lr: 2.4080e-06 gnorm: 0.65 [2 days, 18:07:25< 6:57:40] +[titan] 2025-09-10 17:44:26,883 - root - INFO - step: 36195 loss: 2.3450 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.93 mfu: 49.03% global_avg_ntp_loss: 0.5808 global_avg_top_loss: 1.7643 +[titan] 2025-09-10 17:44:26,883 - root - INFO - lr: 2.4069e-06 gnorm: 0.74 [2 days, 18:07:58< 6:57:07] +[titan] 2025-09-10 17:44:52,726 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:44:59,135 - root - INFO - step: 36200 loss: 2.4704 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.23 mfu: 48.96% global_avg_ntp_loss: 0.6440 global_avg_top_loss: 1.8264 +[titan] 2025-09-10 17:44:59,135 - root - INFO - lr: 2.4059e-06 gnorm: 0.56 [2 days, 18:08:30< 6:56:35] +[titan] 2025-09-10 17:45:31,279 - root - INFO - step: 36205 loss: 2.5251 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.85 mfu: 49.13% global_avg_ntp_loss: 0.6689 global_avg_top_loss: 1.8563 +[titan] 2025-09-10 17:45:31,279 - root - INFO - lr: 2.4048e-06 gnorm: 0.62 [2 days, 18:09:02< 6:56:02] +[titan] 2025-09-10 17:46:03,498 - root - INFO - step: 36210 loss: 2.5172 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.72 mfu: 49.01% global_avg_ntp_loss: 0.6625 global_avg_top_loss: 1.8547 +[titan] 2025-09-10 17:46:03,498 - root - INFO - lr: 2.4037e-06 gnorm: 0.73 [2 days, 18:09:34< 6:55:29] +[titan] 2025-09-10 17:46:35,834 - root - INFO - step: 36215 loss: 2.4146 memory: 122.03GiB(87.57%) tps: 10,134 tflops: 482.97 mfu: 48.83% global_avg_ntp_loss: 0.6177 global_avg_top_loss: 1.7969 +[titan] 2025-09-10 17:46:35,834 - root - INFO - lr: 2.4027e-06 gnorm: 0.58 [2 days, 18:10:07< 6:54:56] +[titan] 2025-09-10 17:47:08,149 - root - INFO - step: 36220 loss: 2.5749 memory: 122.03GiB(87.57%) tps: 10,140 tflops: 483.28 mfu: 48.87% global_avg_ntp_loss: 0.6889 global_avg_top_loss: 1.8860 +[titan] 2025-09-10 17:47:08,149 - root - INFO - lr: 2.4016e-06 gnorm: 0.58 [2 days, 18:10:39< 6:54:23] +[titan] 2025-09-10 17:47:40,392 - root - INFO - step: 36225 loss: 2.5700 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.36 mfu: 48.97% global_avg_ntp_loss: 0.6898 global_avg_top_loss: 1.8802 +[titan] 2025-09-10 17:47:40,392 - root - INFO - lr: 2.4006e-06 gnorm: 0.61 [2 days, 18:11:11< 6:53:50] +[titan] 2025-09-10 17:48:12,427 - root - INFO - step: 36230 loss: 2.6284 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7260 global_avg_top_loss: 1.9024 +[titan] 
2025-09-10 17:48:12,427 - root - INFO - lr: 2.3995e-06 gnorm: 0.63 [2 days, 18:11:43< 6:53:17] +[titan] 2025-09-10 17:48:44,392 - root - INFO - step: 36235 loss: 2.5448 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.6802 global_avg_top_loss: 1.8645 +[titan] 2025-09-10 17:48:44,392 - root - INFO - lr: 2.3985e-06 gnorm: 0.62 [2 days, 18:12:15< 6:52:44] +[titan] 2025-09-10 17:49:16,597 - root - INFO - step: 36240 loss: 2.4381 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.93 mfu: 49.03% global_avg_ntp_loss: 0.6336 global_avg_top_loss: 1.8044 +[titan] 2025-09-10 17:49:16,597 - root - INFO - lr: 2.3974e-06 gnorm: 0.57 [2 days, 18:12:47< 6:52:11] +[titan] 2025-09-10 17:49:48,695 - root - INFO - step: 36245 loss: 2.4225 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.6212 global_avg_top_loss: 1.8012 +[titan] 2025-09-10 17:49:48,695 - root - INFO - lr: 2.3964e-06 gnorm: 0.61 [2 days, 18:13:19< 6:51:38] +[titan] 2025-09-10 17:50:14,455 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:50:20,821 - root - INFO - step: 36250 loss: 3.0134 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.12 mfu: 49.15% global_avg_ntp_loss: 0.9334 global_avg_top_loss: 2.0800 +[titan] 2025-09-10 17:50:20,822 - root - INFO - lr: 2.3953e-06 gnorm: 0.56 [2 days, 18:13:52< 6:51:05] +[titan] 2025-09-10 17:50:52,754 - root - INFO - step: 36255 loss: 2.4769 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.6477 global_avg_top_loss: 1.8292 +[titan] 2025-09-10 17:50:52,754 - root - INFO - lr: 2.3943e-06 gnorm: 0.66 [2 days, 18:14:23< 6:50:32] +[titan] 2025-09-10 17:51:24,809 - root - INFO - step: 36260 loss: 2.5628 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.6856 global_avg_top_loss: 1.8772 +[titan] 2025-09-10 17:51:24,809 - root - INFO - lr: 2.3932e-06 gnorm: 0.72 [2 days, 18:14:55< 6:49:59] +[titan] 2025-09-10 17:51:57,108 - root - INFO - step: 36265 loss: 2.8870 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.51 mfu: 48.89% global_avg_ntp_loss: 0.8844 global_avg_top_loss: 2.0026 +[titan] 2025-09-10 17:51:57,108 - root - INFO - lr: 2.3922e-06 gnorm: 0.54 [2 days, 18:15:28< 6:49:26] +[titan] 2025-09-10 17:52:29,324 - root - INFO - step: 36270 loss: 2.4042 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.77 mfu: 49.02% global_avg_ntp_loss: 0.6107 global_avg_top_loss: 1.7935 +[titan] 2025-09-10 17:52:29,324 - root - INFO - lr: 2.3912e-06 gnorm: 0.67 [2 days, 18:16:00< 6:48:53] +[titan] 2025-09-10 17:53:01,605 - root - INFO - step: 36275 loss: 2.3775 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.78 mfu: 48.92% global_avg_ntp_loss: 0.5951 global_avg_top_loss: 1.7824 +[titan] 2025-09-10 17:53:01,606 - root - INFO - lr: 2.3901e-06 gnorm: 0.74 [2 days, 18:16:32< 6:48:20] +[titan] 2025-09-10 17:53:34,029 - root - INFO - step: 36280 loss: 2.4479 memory: 122.03GiB(87.57%) tps: 10,107 tflops: 481.67 mfu: 48.70% global_avg_ntp_loss: 0.6329 global_avg_top_loss: 1.8150 +[titan] 
2025-09-10 17:53:34,029 - root - INFO - lr: 2.3891e-06 gnorm: 0.56 [2 days, 18:17:05< 6:47:47] +[titan] 2025-09-10 17:54:06,281 - root - INFO - step: 36285 loss: 2.4242 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.22 mfu: 48.96% global_avg_ntp_loss: 0.6228 global_avg_top_loss: 1.8014 +[titan] 2025-09-10 17:54:06,281 - root - INFO - lr: 2.3880e-06 gnorm: 0.59 [2 days, 18:17:37< 6:47:14] +[titan] 2025-09-10 17:54:38,482 - root - INFO - step: 36290 loss: 2.5323 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.98 mfu: 49.04% global_avg_ntp_loss: 0.6737 global_avg_top_loss: 1.8586 +[titan] 2025-09-10 17:54:38,483 - root - INFO - lr: 2.3870e-06 gnorm: 0.63 [2 days, 18:18:09< 6:46:41] +[titan] 2025-09-10 17:55:10,585 - root - INFO - step: 36295 loss: 2.4233 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.6250 global_avg_top_loss: 1.7983 +[titan] 2025-09-10 17:55:10,586 - root - INFO - lr: 2.3860e-06 gnorm: 0.59 [2 days, 18:18:41< 6:46:08] +[titan] 2025-09-10 17:55:36,155 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 17:55:42,847 - root - INFO - step: 36300 loss: 2.6409 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.08 mfu: 48.95% global_avg_ntp_loss: 0.7278 global_avg_top_loss: 1.9131 +[titan] 2025-09-10 17:55:42,847 - root - INFO - lr: 2.3849e-06 gnorm: 0.58 [2 days, 18:19:14< 6:45:35] +[titan] 2025-09-10 17:56:15,057 - root - INFO - step: 36305 loss: 2.5445 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.86 mfu: 49.02% global_avg_ntp_loss: 0.6788 global_avg_top_loss: 1.8657 +[titan] 2025-09-10 17:56:15,057 - root - INFO - lr: 2.3839e-06 gnorm: 0.63 [2 days, 18:19:46< 6:45:02] +[titan] 2025-09-10 17:56:46,954 - root - INFO - step: 36310 loss: 3.0984 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.9928 global_avg_top_loss: 2.1056 +[titan] 2025-09-10 17:56:46,955 - root - INFO - lr: 2.3829e-06 gnorm: 0.59 [2 days, 18:20:18< 6:44:29] +[titan] 2025-09-10 17:57:19,291 - root - INFO - step: 36315 loss: 2.6493 memory: 122.03GiB(87.57%) tps: 10,134 tflops: 482.97 mfu: 48.83% global_avg_ntp_loss: 0.7301 global_avg_top_loss: 1.9192 +[titan] 2025-09-10 17:57:19,291 - root - INFO - lr: 2.3818e-06 gnorm: 0.61 [2 days, 18:20:50< 6:43:56] +[titan] 2025-09-10 17:57:51,508 - root - INFO - step: 36320 loss: 2.5625 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 0.6838 global_avg_top_loss: 1.8787 +[titan] 2025-09-10 17:57:51,508 - root - INFO - lr: 2.3808e-06 gnorm: 0.57 [2 days, 18:21:22< 6:43:23] +[titan] 2025-09-10 17:58:23,674 - root - INFO - step: 36325 loss: 2.4713 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.6442 global_avg_top_loss: 1.8271 +[titan] 2025-09-10 17:58:23,674 - root - INFO - lr: 2.3798e-06 gnorm: 0.57 [2 days, 18:21:54< 6:42:51] +[titan] 2025-09-10 17:58:56,164 - root - INFO - step: 36330 loss: 2.5338 memory: 122.03GiB(87.57%) tps: 10,086 tflops: 480.67 mfu: 48.60% global_avg_ntp_loss: 0.6715 global_avg_top_loss: 1.8623 +[titan] 
2025-09-10 17:58:56,165 - root - INFO - lr: 2.3788e-06 gnorm: 0.56 [2 days, 18:22:27< 6:42:18] +[titan] 2025-09-10 17:59:28,311 - root - INFO - step: 36335 loss: 2.4411 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.81 mfu: 49.12% global_avg_ntp_loss: 0.6321 global_avg_top_loss: 1.8090 +[titan] 2025-09-10 17:59:28,311 - root - INFO - lr: 2.3777e-06 gnorm: 0.64 [2 days, 18:22:59< 6:41:45] +[titan] 2025-09-10 18:00:00,700 - root - INFO - step: 36340 loss: 2.5943 memory: 122.03GiB(87.57%) tps: 10,117 tflops: 482.17 mfu: 48.75% global_avg_ntp_loss: 0.7051 global_avg_top_loss: 1.8892 +[titan] 2025-09-10 18:00:00,700 - root - INFO - lr: 2.3767e-06 gnorm: 0.73 [2 days, 18:23:31< 6:41:12] +[titan] 2025-09-10 18:00:32,789 - root - INFO - step: 36345 loss: 2.8657 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.8785 global_avg_top_loss: 1.9872 +[titan] 2025-09-10 18:00:32,790 - root - INFO - lr: 2.3757e-06 gnorm: 0.56 [2 days, 18:24:03< 6:40:39] +[titan] 2025-09-10 18:00:58,729 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:01:05,096 - root - INFO - step: 36350 loss: 2.3818 memory: 122.03GiB(87.57%) tps: 10,143 tflops: 483.40 mfu: 48.88% global_avg_ntp_loss: 0.6042 global_avg_top_loss: 1.7776 +[titan] 2025-09-10 18:01:05,097 - root - INFO - lr: 2.3747e-06 gnorm: 0.62 [2 days, 18:24:36< 6:40:06] +[titan] 2025-09-10 18:01:18,334 - root - INFO - Dumping profiler traces at step 36352 +[titan] 2025-09-10 18:01:18,405 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 18:01:37,626 - root - INFO - step: 36355 loss: 2.3734 memory: 122.03GiB(87.57%) tps: 10,073 tflops: 480.09 mfu: 48.54% global_avg_ntp_loss: 0.5904 global_avg_top_loss: 1.7831 +[titan] 2025-09-10 18:01:37,627 - root - INFO - lr: 2.3737e-06 gnorm: 0.80 [2 days, 18:25:08< 6:39:33] +[titan] 2025-09-10 18:02:09,774 - root - INFO - step: 36360 loss: 2.4939 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.6569 global_avg_top_loss: 1.8370 +[titan] 2025-09-10 18:02:09,775 - root - INFO - lr: 2.3726e-06 gnorm: 0.56 [2 days, 18:25:40< 6:39:00] +[titan] 2025-09-10 18:02:41,805 - root - INFO - step: 36365 loss: 2.4027 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.58 mfu: 49.30% global_avg_ntp_loss: 0.6101 global_avg_top_loss: 1.7926 +[titan] 2025-09-10 18:02:41,805 - root - INFO - lr: 2.3716e-06 gnorm: 0.61 [2 days, 18:26:12< 6:38:27] +[titan] 2025-09-10 18:03:14,026 - root - INFO - step: 36370 loss: 2.5456 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.6762 global_avg_top_loss: 1.8694 +[titan] 2025-09-10 18:03:14,027 - root - INFO - lr: 2.3706e-06 gnorm: 0.62 [2 days, 18:26:45< 6:37:54] +[titan] 2025-09-10 18:03:46,166 - root - INFO - step: 36375 loss: 2.4280 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.92 mfu: 49.13% global_avg_ntp_loss: 0.6286 global_avg_top_loss: 1.7994 +[titan] 2025-09-10 18:03:46,166 - root - INFO - lr: 2.3696e-06 gnorm: 0.57 [2 days, 18:27:17< 6:37:21] +[titan] 2025-09-10 
18:04:18,327 - root - INFO - step: 36380 loss: 2.6132 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.60 mfu: 49.10% global_avg_ntp_loss: 0.7060 global_avg_top_loss: 1.9072 +[titan] 2025-09-10 18:04:18,327 - root - INFO - lr: 2.3686e-06 gnorm: 0.61 [2 days, 18:27:49< 6:36:48] +[titan] 2025-09-10 18:04:50,563 - root - INFO - step: 36385 loss: 2.5585 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.46 mfu: 48.99% global_avg_ntp_loss: 0.6886 global_avg_top_loss: 1.8699 +[titan] 2025-09-10 18:04:50,563 - root - INFO - lr: 2.3676e-06 gnorm: 0.78 [2 days, 18:28:21< 6:36:15] +[titan] 2025-09-10 18:05:22,536 - root - INFO - step: 36390 loss: 3.1171 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.9928 global_avg_top_loss: 2.1243 +[titan] 2025-09-10 18:05:22,536 - root - INFO - lr: 2.3666e-06 gnorm: 0.65 [2 days, 18:28:53< 6:35:42] +[titan] 2025-09-10 18:05:54,782 - root - INFO - step: 36395 loss: 2.5079 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.6630 global_avg_top_loss: 1.8449 +[titan] 2025-09-10 18:05:54,783 - root - INFO - lr: 2.3656e-06 gnorm: 0.60 [2 days, 18:29:25< 6:35:09] +[titan] 2025-09-10 18:06:20,491 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:06:26,887 - root - INFO - step: 36400 loss: 2.5026 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 0.6592 global_avg_top_loss: 1.8434 +[titan] 2025-09-10 18:06:26,887 - root - INFO - lr: 2.3645e-06 gnorm: 0.57 [2 days, 18:29:58< 6:34:36] +[titan] 2025-09-10 18:06:58,976 - root - INFO - step: 36405 loss: 2.5193 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.6668 global_avg_top_loss: 1.8524 +[titan] 2025-09-10 18:06:58,977 - root - INFO - lr: 2.3635e-06 gnorm: 0.68 [2 days, 18:30:30< 6:34:03] +[titan] 2025-09-10 18:07:31,070 - root - INFO - step: 36410 loss: 2.5915 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.61 mfu: 49.20% global_avg_ntp_loss: 0.6988 global_avg_top_loss: 1.8927 +[titan] 2025-09-10 18:07:31,071 - root - INFO - lr: 2.3625e-06 gnorm: 0.59 [2 days, 18:31:02< 6:33:30] +[titan] 2025-09-10 18:08:03,071 - root - INFO - step: 36415 loss: 2.5202 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.6650 global_avg_top_loss: 1.8552 +[titan] 2025-09-10 18:08:03,071 - root - INFO - lr: 2.3615e-06 gnorm: 0.70 [2 days, 18:31:34< 6:32:57] +[titan] 2025-09-10 18:08:35,306 - root - INFO - step: 36420 loss: 2.5800 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.48 mfu: 48.99% global_avg_ntp_loss: 0.6948 global_avg_top_loss: 1.8853 +[titan] 2025-09-10 18:08:35,306 - root - INFO - lr: 2.3605e-06 gnorm: 0.70 [2 days, 18:32:06< 6:32:24] +[titan] 2025-09-10 18:09:07,523 - root - INFO - step: 36425 loss: 2.8834 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 0.8852 global_avg_top_loss: 1.9982 +[titan] 2025-09-10 18:09:07,523 - root - INFO - lr: 2.3595e-06 gnorm: 0.56 [2 days, 18:32:38< 6:31:51] +[titan] 2025-09-10 18:09:39,629 - root - INFO - step: 36430 loss: 2.3640 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.5949 global_avg_top_loss: 1.7691 +[titan] 
2025-09-10 18:09:39,630 - root - INFO - lr: 2.3585e-06 gnorm: 0.63 [2 days, 18:33:10< 6:31:18] +[titan] 2025-09-10 18:10:11,793 - root - INFO - step: 36435 loss: 2.3752 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.09% global_avg_ntp_loss: 0.5966 global_avg_top_loss: 1.7787 +[titan] 2025-09-10 18:10:11,794 - root - INFO - lr: 2.3575e-06 gnorm: 0.76 [2 days, 18:33:42< 6:30:46] +[titan] 2025-09-10 18:10:44,138 - root - INFO - step: 36440 loss: 2.9353 memory: 122.03GiB(87.57%) tps: 10,131 tflops: 482.84 mfu: 48.82% global_avg_ntp_loss: 0.9084 global_avg_top_loss: 2.0269 +[titan] 2025-09-10 18:10:44,138 - root - INFO - lr: 2.3565e-06 gnorm: 0.54 [2 days, 18:34:15< 6:30:13] +[titan] 2025-09-10 18:11:16,432 - root - INFO - step: 36445 loss: 2.4336 memory: 122.03GiB(87.57%) tps: 10,147 tflops: 483.59 mfu: 48.90% global_avg_ntp_loss: 0.6265 global_avg_top_loss: 1.8071 +[titan] 2025-09-10 18:11:16,432 - root - INFO - lr: 2.3555e-06 gnorm: 0.58 [2 days, 18:34:47< 6:29:40] +[titan] 2025-09-10 18:11:42,293 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:11:48,763 - root - INFO - step: 36450 loss: 2.5524 memory: 122.03GiB(87.57%) tps: 10,135 tflops: 483.04 mfu: 48.84% global_avg_ntp_loss: 0.6824 global_avg_top_loss: 1.8700 +[titan] 2025-09-10 18:11:48,763 - root - INFO - lr: 2.3546e-06 gnorm: 0.63 [2 days, 18:35:19< 6:29:07] +[titan] 2025-09-10 18:12:20,821 - root - INFO - step: 36455 loss: 2.9039 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.8910 global_avg_top_loss: 2.0128 +[titan] 2025-09-10 18:12:20,822 - root - INFO - lr: 2.3536e-06 gnorm: 0.61 [2 days, 18:35:51< 6:28:34] +[titan] 2025-09-10 18:12:52,866 - root - INFO - step: 36460 loss: 2.5076 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.6612 global_avg_top_loss: 1.8464 +[titan] 2025-09-10 18:12:52,866 - root - INFO - lr: 2.3526e-06 gnorm: 0.63 [2 days, 18:36:24< 6:28:01] +[titan] 2025-09-10 18:13:24,908 - root - INFO - step: 36465 loss: 2.5597 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.6857 global_avg_top_loss: 1.8740 +[titan] 2025-09-10 18:13:24,908 - root - INFO - lr: 2.3516e-06 gnorm: 0.60 [2 days, 18:36:56< 6:27:28] +[titan] 2025-09-10 18:13:57,014 - root - INFO - step: 36470 loss: 3.1757 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 1.0218 global_avg_top_loss: 2.1539 +[titan] 2025-09-10 18:13:57,014 - root - INFO - lr: 2.3506e-06 gnorm: 0.59 [2 days, 18:37:28< 6:26:55] +[titan] 2025-09-10 18:14:28,901 - root - INFO - step: 36475 loss: 2.5523 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6758 global_avg_top_loss: 1.8764 +[titan] 2025-09-10 18:14:28,901 - root - INFO - lr: 2.3496e-06 gnorm: 0.64 [2 days, 18:38:00< 6:26:22] +[titan] 2025-09-10 18:15:01,124 - root - INFO - step: 36480 loss: 2.4937 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.65 mfu: 49.00% global_avg_ntp_loss: 0.6519 global_avg_top_loss: 1.8418 +[titan] 
2025-09-10 18:15:01,124 - root - INFO - lr: 2.3486e-06 gnorm: 0.59 [2 days, 18:38:32< 6:25:49] +[titan] 2025-09-10 18:15:33,490 - root - INFO - step: 36485 loss: 2.4535 memory: 122.03GiB(87.57%) tps: 10,124 tflops: 482.52 mfu: 48.79% global_avg_ntp_loss: 0.6351 global_avg_top_loss: 1.8184 +[titan] 2025-09-10 18:15:33,490 - root - INFO - lr: 2.3476e-06 gnorm: 0.63 [2 days, 18:39:04< 6:25:16] +[titan] 2025-09-10 18:16:05,455 - root - INFO - step: 36490 loss: 2.8161 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.8281 global_avg_top_loss: 1.9880 +[titan] 2025-09-10 18:16:05,455 - root - INFO - lr: 2.3467e-06 gnorm: 0.57 [2 days, 18:39:36< 6:24:43] +[titan] 2025-09-10 18:16:37,387 - root - INFO - step: 36495 loss: 2.4849 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.6506 global_avg_top_loss: 1.8342 +[titan] 2025-09-10 18:16:37,387 - root - INFO - lr: 2.3457e-06 gnorm: 0.71 [2 days, 18:40:08< 6:24:10] +[titan] 2025-09-10 18:17:03,013 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:17:09,437 - root - INFO - step: 36500 loss: 2.5384 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.6785 global_avg_top_loss: 1.8599 +[titan] 2025-09-10 18:17:09,437 - root - INFO - lr: 2.3447e-06 gnorm: 0.70 [2 days, 18:40:40< 6:23:37] +[titan] 2025-09-10 18:17:41,608 - root - INFO - step: 36505 loss: 2.8314 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.45 mfu: 49.08% global_avg_ntp_loss: 0.8618 global_avg_top_loss: 1.9696 +[titan] 2025-09-10 18:17:41,608 - root - INFO - lr: 2.3437e-06 gnorm: 0.56 [2 days, 18:41:12< 6:23:04] +[titan] 2025-09-10 18:18:13,798 - root - INFO - step: 36510 loss: 2.4664 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.16 mfu: 49.06% global_avg_ntp_loss: 0.6406 global_avg_top_loss: 1.8257 +[titan] 2025-09-10 18:18:13,799 - root - INFO - lr: 2.3427e-06 gnorm: 0.66 [2 days, 18:41:44< 6:22:31] +[titan] 2025-09-10 18:18:46,083 - root - INFO - step: 36515 loss: 2.3623 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.73 mfu: 48.91% global_avg_ntp_loss: 0.5874 global_avg_top_loss: 1.7749 +[titan] 2025-09-10 18:18:46,083 - root - INFO - lr: 2.3418e-06 gnorm: 0.88 [2 days, 18:42:17< 6:21:58] +[titan] 2025-09-10 18:19:18,220 - root - INFO - step: 36520 loss: 2.4605 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.6396 global_avg_top_loss: 1.8210 +[titan] 2025-09-10 18:19:18,220 - root - INFO - lr: 2.3408e-06 gnorm: 0.56 [2 days, 18:42:49< 6:21:25] +[titan] 2025-09-10 18:19:50,239 - root - INFO - step: 36525 loss: 2.4401 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.6269 global_avg_top_loss: 1.8132 +[titan] 2025-09-10 18:19:50,239 - root - INFO - lr: 2.3398e-06 gnorm: 0.59 [2 days, 18:43:21< 6:20:52] +[titan] 2025-09-10 18:20:22,300 - root - INFO - step: 36530 loss: 2.3647 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.6001 global_avg_top_loss: 1.7646 +[titan] 
2025-09-10 18:20:22,300 - root - INFO - lr: 2.3389e-06 gnorm: 0.58 [2 days, 18:43:53< 6:20:19] +[titan] 2025-09-10 18:20:54,412 - root - INFO - step: 36535 loss: 2.8384 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 0.8647 global_avg_top_loss: 1.9737 +[titan] 2025-09-10 18:20:54,412 - root - INFO - lr: 2.3379e-06 gnorm: 0.59 [2 days, 18:44:25< 6:19:46] +[titan] 2025-09-10 18:21:26,491 - root - INFO - step: 36540 loss: 2.4336 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.6271 global_avg_top_loss: 1.8065 +[titan] 2025-09-10 18:21:26,492 - root - INFO - lr: 2.3369e-06 gnorm: 0.57 [2 days, 18:44:57< 6:19:13] +[titan] 2025-09-10 18:21:58,511 - root - INFO - step: 36545 loss: 2.6625 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9330 +[titan] 2025-09-10 18:21:58,511 - root - INFO - lr: 2.3360e-06 gnorm: 0.63 [2 days, 18:45:29< 6:18:41] +[titan] 2025-09-10 18:22:24,054 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:22:30,544 - root - INFO - step: 36550 loss: 2.5456 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.6799 global_avg_top_loss: 1.8657 +[titan] 2025-09-10 18:22:30,544 - root - INFO - lr: 2.3350e-06 gnorm: 0.70 [2 days, 18:46:01< 6:18:08] +[titan] 2025-09-10 18:23:02,977 - root - INFO - step: 36555 loss: 2.5058 memory: 122.03GiB(87.57%) tps: 10,103 tflops: 481.52 mfu: 48.69% global_avg_ntp_loss: 0.6619 global_avg_top_loss: 1.8438 +[titan] 2025-09-10 18:23:02,977 - root - INFO - lr: 2.3340e-06 gnorm: 0.59 [2 days, 18:46:34< 6:17:35] +[titan] 2025-09-10 18:23:34,826 - root - INFO - step: 36560 loss: 2.5376 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.6627 global_avg_top_loss: 1.8749 +[titan] 2025-09-10 18:23:34,826 - root - INFO - lr: 2.3331e-06 gnorm: 0.62 [2 days, 18:47:05< 6:17:02] +[titan] 2025-09-10 18:24:06,910 - root - INFO - step: 36565 loss: 2.5003 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.6587 global_avg_top_loss: 1.8416 +[titan] 2025-09-10 18:24:06,911 - root - INFO - lr: 2.3321e-06 gnorm: 0.57 [2 days, 18:47:38< 6:16:29] +[titan] 2025-09-10 18:24:39,093 - root - INFO - step: 36570 loss: 2.5556 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.26 mfu: 49.07% global_avg_ntp_loss: 0.6853 global_avg_top_loss: 1.8703 +[titan] 2025-09-10 18:24:39,094 - root - INFO - lr: 2.3311e-06 gnorm: 0.57 [2 days, 18:48:10< 6:15:56] +[titan] 2025-09-10 18:25:11,095 - root - INFO - step: 36575 loss: 2.5625 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.6862 global_avg_top_loss: 1.8763 +[titan] 2025-09-10 18:25:11,095 - root - INFO - lr: 2.3302e-06 gnorm: 0.76 [2 days, 18:48:42< 6:15:23] +[titan] 2025-09-10 18:25:43,009 - root - INFO - step: 36580 loss: 2.5381 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.6796 global_avg_top_loss: 1.8585 +[titan] 
2025-09-10 18:25:43,010 - root - INFO - lr: 2.3292e-06 gnorm: 0.73 [2 days, 18:49:14< 6:14:50] +[titan] 2025-09-10 18:26:14,965 - root - INFO - step: 36585 loss: 2.3685 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.5984 global_avg_top_loss: 1.7701 +[titan] 2025-09-10 18:26:14,965 - root - INFO - lr: 2.3283e-06 gnorm: 0.56 [2 days, 18:49:46< 6:14:17] +[titan] 2025-09-10 18:26:47,193 - root - INFO - step: 36590 loss: 2.3930 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.6041 global_avg_top_loss: 1.7890 +[titan] 2025-09-10 18:26:47,193 - root - INFO - lr: 2.3273e-06 gnorm: 0.63 [2 days, 18:50:18< 6:13:44] +[titan] 2025-09-10 18:27:19,345 - root - INFO - step: 36595 loss: 2.3857 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.73 mfu: 49.11% global_avg_ntp_loss: 0.5971 global_avg_top_loss: 1.7886 +[titan] 2025-09-10 18:27:19,345 - root - INFO - lr: 2.3264e-06 gnorm: 0.80 [2 days, 18:50:50< 6:13:11] +[titan] 2025-09-10 18:27:45,081 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:27:51,459 - root - INFO - step: 36600 loss: 2.5231 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 0.6665 global_avg_top_loss: 1.8566 +[titan] 2025-09-10 18:27:51,460 - root - INFO - lr: 2.3254e-06 gnorm: 0.59 [2 days, 18:51:22< 6:12:38] +[titan] 2025-09-10 18:28:23,695 - root - INFO - step: 36605 loss: 2.5606 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.48 mfu: 48.99% global_avg_ntp_loss: 0.6859 global_avg_top_loss: 1.8747 +[titan] 2025-09-10 18:28:23,695 - root - INFO - lr: 2.3245e-06 gnorm: 0.62 [2 days, 18:51:54< 6:12:05] +[titan] 2025-09-10 18:28:55,820 - root - INFO - step: 36610 loss: 2.4608 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.13 mfu: 49.15% global_avg_ntp_loss: 0.6389 global_avg_top_loss: 1.8220 +[titan] 2025-09-10 18:28:55,820 - root - INFO - lr: 2.3235e-06 gnorm: 0.61 [2 days, 18:52:26< 6:11:32] +[titan] 2025-09-10 18:29:27,852 - root - INFO - step: 36615 loss: 2.8607 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.8683 global_avg_top_loss: 1.9924 +[titan] 2025-09-10 18:29:27,852 - root - INFO - lr: 2.3226e-06 gnorm: 0.61 [2 days, 18:52:58< 6:10:59] +[titan] 2025-09-10 18:29:59,898 - root - INFO - step: 36620 loss: 2.5007 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.6588 global_avg_top_loss: 1.8419 +[titan] 2025-09-10 18:29:59,898 - root - INFO - lr: 2.3216e-06 gnorm: 0.61 [2 days, 18:53:31< 6:10:26] +[titan] 2025-09-10 18:30:31,868 - root - INFO - step: 36625 loss: 2.5752 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.6962 global_avg_top_loss: 1.8789 +[titan] 2025-09-10 18:30:31,868 - root - INFO - lr: 2.3207e-06 gnorm: 0.64 [2 days, 18:54:03< 6:09:53] +[titan] 2025-09-10 18:31:03,831 - root - INFO - step: 36630 loss: 2.6438 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7273 global_avg_top_loss: 1.9164 +[titan] 
2025-09-10 18:31:03,831 - root - INFO - lr: 2.3197e-06 gnorm: 0.70 [2 days, 18:54:34< 6:09:20] +[titan] 2025-09-10 18:31:35,887 - root - INFO - step: 36635 loss: 2.5736 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6926 global_avg_top_loss: 1.8810 +[titan] 2025-09-10 18:31:35,887 - root - INFO - lr: 2.3188e-06 gnorm: 0.63 [2 days, 18:55:07< 6:08:47] +[titan] 2025-09-10 18:32:07,851 - root - INFO - step: 36640 loss: 2.4046 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.6134 global_avg_top_loss: 1.7912 +[titan] 2025-09-10 18:32:07,852 - root - INFO - lr: 2.3178e-06 gnorm: 0.58 [2 days, 18:55:38< 6:08:14] +[titan] 2025-09-10 18:32:40,058 - root - INFO - step: 36645 loss: 2.6101 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.91 mfu: 49.03% global_avg_ntp_loss: 0.7060 global_avg_top_loss: 1.9041 +[titan] 2025-09-10 18:32:40,058 - root - INFO - lr: 2.3169e-06 gnorm: 0.64 [2 days, 18:56:11< 6:07:41] +[titan] 2025-09-10 18:33:05,782 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:33:12,266 - root - INFO - step: 36650 loss: 2.6266 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.89 mfu: 49.03% global_avg_ntp_loss: 0.7176 global_avg_top_loss: 1.9089 +[titan] 2025-09-10 18:33:12,266 - root - INFO - lr: 2.3160e-06 gnorm: 0.56 [2 days, 18:56:43< 6:07:08] +[titan] 2025-09-10 18:33:44,549 - root - INFO - step: 36655 loss: 2.5451 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.75 mfu: 48.91% global_avg_ntp_loss: 0.6849 global_avg_top_loss: 1.8602 +[titan] 2025-09-10 18:33:44,550 - root - INFO - lr: 2.3150e-06 gnorm: 0.71 [2 days, 18:57:15< 6:06:36] +[titan] 2025-09-10 18:34:16,707 - root - INFO - step: 36660 loss: 2.5146 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.6652 global_avg_top_loss: 1.8495 +[titan] 2025-09-10 18:34:16,708 - root - INFO - lr: 2.3141e-06 gnorm: 0.74 [2 days, 18:57:47< 6:06:03] +[titan] 2025-09-10 18:34:48,859 - root - INFO - step: 36665 loss: 2.4314 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.73 mfu: 49.11% global_avg_ntp_loss: 0.6291 global_avg_top_loss: 1.8022 +[titan] 2025-09-10 18:34:48,860 - root - INFO - lr: 2.3132e-06 gnorm: 0.57 [2 days, 18:58:19< 6:05:30] +[titan] 2025-09-10 18:35:20,789 - root - INFO - step: 36670 loss: 2.4287 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.6210 global_avg_top_loss: 1.8077 +[titan] 2025-09-10 18:35:20,789 - root - INFO - lr: 2.3122e-06 gnorm: 0.72 [2 days, 18:58:51< 6:04:57] +[titan] 2025-09-10 18:35:52,800 - root - INFO - step: 36675 loss: 2.3177 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.5713 global_avg_top_loss: 1.7464 +[titan] 2025-09-10 18:35:52,800 - root - INFO - lr: 2.3113e-06 gnorm: 0.76 [2 days, 18:59:23< 6:04:24] +[titan] 2025-09-10 18:36:24,996 - root - INFO - step: 36680 loss: 2.4871 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.07 mfu: 49.05% global_avg_ntp_loss: 0.6473 global_avg_top_loss: 1.8398 +[titan] 
2025-09-10 18:36:24,996 - root - INFO - lr: 2.3104e-06 gnorm: 0.60 [2 days, 18:59:56< 6:03:51] +[titan] 2025-09-10 18:36:57,070 - root - INFO - step: 36685 loss: 2.6310 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.7395 global_avg_top_loss: 1.8914 +[titan] 2025-09-10 18:36:57,070 - root - INFO - lr: 2.3094e-06 gnorm: 0.58 [2 days, 19:00:28< 6:03:18] +[titan] 2025-09-10 18:37:29,234 - root - INFO - step: 36690 loss: 2.4610 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.10% global_avg_ntp_loss: 0.6385 global_avg_top_loss: 1.8225 +[titan] 2025-09-10 18:37:29,234 - root - INFO - lr: 2.3085e-06 gnorm: 0.69 [2 days, 19:01:00< 6:02:45] +[titan] 2025-09-10 18:38:01,473 - root - INFO - step: 36695 loss: 2.3284 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.42 mfu: 48.98% global_avg_ntp_loss: 0.5806 global_avg_top_loss: 1.7478 +[titan] 2025-09-10 18:38:01,473 - root - INFO - lr: 2.3076e-06 gnorm: 0.57 [2 days, 19:01:32< 6:02:12] +[titan] 2025-09-10 18:38:27,165 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:38:33,648 - root - INFO - step: 36700 loss: 2.5286 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.38 mfu: 49.08% global_avg_ntp_loss: 0.6695 global_avg_top_loss: 1.8591 +[titan] 2025-09-10 18:38:33,648 - root - INFO - lr: 2.3067e-06 gnorm: 0.58 [2 days, 19:02:04< 6:01:39] +[titan] 2025-09-10 18:39:05,568 - root - INFO - step: 36705 loss: 2.6464 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.7310 global_avg_top_loss: 1.9154 +[titan] 2025-09-10 18:39:05,568 - root - INFO - lr: 2.3057e-06 gnorm: 0.58 [2 days, 19:02:36< 6:01:06] +[titan] 2025-09-10 18:39:37,548 - root - INFO - step: 36710 loss: 2.9733 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.9252 global_avg_top_loss: 2.0481 +[titan] 2025-09-10 18:39:37,549 - root - INFO - lr: 2.3048e-06 gnorm: 0.73 [2 days, 19:03:08< 6:00:33] +[titan] 2025-09-10 18:40:09,959 - root - INFO - step: 36715 loss: 2.4791 memory: 122.03GiB(87.57%) tps: 10,110 tflops: 481.85 mfu: 48.72% global_avg_ntp_loss: 0.6472 global_avg_top_loss: 1.8319 +[titan] 2025-09-10 18:40:09,960 - root - INFO - lr: 2.3039e-06 gnorm: 0.57 [2 days, 19:03:41< 6:00:00] +[titan] 2025-09-10 18:40:42,217 - root - INFO - step: 36720 loss: 2.4772 memory: 122.03GiB(87.57%) tps: 10,158 tflops: 484.14 mfu: 48.95% global_avg_ntp_loss: 0.6439 global_avg_top_loss: 1.8333 +[titan] 2025-09-10 18:40:42,217 - root - INFO - lr: 2.3030e-06 gnorm: 0.58 [2 days, 19:04:13< 5:59:27] +[titan] 2025-09-10 18:41:14,104 - root - INFO - step: 36725 loss: 2.4320 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.6228 global_avg_top_loss: 1.8093 +[titan] 2025-09-10 18:41:14,105 - root - INFO - lr: 2.3020e-06 gnorm: 0.72 [2 days, 19:04:45< 5:58:54] +[titan] 2025-09-10 18:41:46,548 - root - INFO - step: 36730 loss: 2.5514 memory: 122.03GiB(87.57%) tps: 10,100 tflops: 481.37 mfu: 48.67% global_avg_ntp_loss: 0.6838 global_avg_top_loss: 1.8676 +[titan] 
2025-09-10 18:41:46,548 - root - INFO - lr: 2.3011e-06 gnorm: 0.58 [2 days, 19:05:17< 5:58:21] +[titan] 2025-09-10 18:42:18,675 - root - INFO - step: 36735 loss: 2.5152 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.10 mfu: 49.15% global_avg_ntp_loss: 0.6662 global_avg_top_loss: 1.8490 +[titan] 2025-09-10 18:42:18,675 - root - INFO - lr: 2.3002e-06 gnorm: 0.68 [2 days, 19:05:49< 5:57:48] +[titan] 2025-09-10 18:42:50,629 - root - INFO - step: 36740 loss: 2.9724 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.9210 global_avg_top_loss: 2.0513 +[titan] 2025-09-10 18:42:50,629 - root - INFO - lr: 2.2993e-06 gnorm: 0.83 [2 days, 19:06:21< 5:57:15] +[titan] 2025-09-10 18:43:22,660 - root - INFO - step: 36745 loss: 2.4421 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.6314 global_avg_top_loss: 1.8107 +[titan] 2025-09-10 18:43:22,660 - root - INFO - lr: 2.2984e-06 gnorm: 0.59 [2 days, 19:06:53< 5:56:42] +[titan] 2025-09-10 18:43:48,480 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:43:54,904 - root - INFO - step: 36750 loss: 2.3837 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.34 mfu: 48.97% global_avg_ntp_loss: 0.6018 global_avg_top_loss: 1.7819 +[titan] 2025-09-10 18:43:54,904 - root - INFO - lr: 2.2975e-06 gnorm: 0.67 [2 days, 19:07:26< 5:56:10] +[titan] 2025-09-10 18:44:26,928 - root - INFO - step: 36755 loss: 2.2666 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.5523 global_avg_top_loss: 1.7143 +[titan] 2025-09-10 18:44:26,928 - root - INFO - lr: 2.2966e-06 gnorm: 0.77 [2 days, 19:07:58< 5:55:37] +[titan] 2025-09-10 18:44:59,071 - root - INFO - step: 36760 loss: 2.5283 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.88 mfu: 49.13% global_avg_ntp_loss: 0.6675 global_avg_top_loss: 1.8608 +[titan] 2025-09-10 18:44:59,071 - root - INFO - lr: 2.2957e-06 gnorm: 0.56 [2 days, 19:08:30< 5:55:04] +[titan] 2025-09-10 18:45:31,256 - root - INFO - step: 36765 loss: 2.3671 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.6048 global_avg_top_loss: 1.7624 +[titan] 2025-09-10 18:45:31,256 - root - INFO - lr: 2.2948e-06 gnorm: 0.60 [2 days, 19:09:02< 5:54:31] +[titan] 2025-09-10 18:46:03,284 - root - INFO - step: 36770 loss: 2.5077 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.6600 global_avg_top_loss: 1.8477 +[titan] 2025-09-10 18:46:03,284 - root - INFO - lr: 2.2939e-06 gnorm: 0.60 [2 days, 19:09:34< 5:53:58] +[titan] 2025-09-10 18:46:35,261 - root - INFO - step: 36775 loss: 2.6830 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7673 global_avg_top_loss: 1.9157 +[titan] 2025-09-10 18:46:35,262 - root - INFO - lr: 2.2929e-06 gnorm: 0.57 [2 days, 19:10:06< 5:53:25] +[titan] 2025-09-10 18:47:07,370 - root - INFO - step: 36780 loss: 2.5546 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.6814 global_avg_top_loss: 1.8732 +[titan] 
2025-09-10 18:47:07,370 - root - INFO - lr: 2.2920e-06 gnorm: 0.62 [2 days, 19:10:38< 5:52:52] +[titan] 2025-09-10 18:47:39,400 - root - INFO - step: 36785 loss: 2.5046 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.58 mfu: 49.30% global_avg_ntp_loss: 0.6613 global_avg_top_loss: 1.8433 +[titan] 2025-09-10 18:47:39,400 - root - INFO - lr: 2.2911e-06 gnorm: 0.58 [2 days, 19:11:10< 5:52:19] +[titan] 2025-09-10 18:48:11,559 - root - INFO - step: 36790 loss: 2.5854 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.6987 global_avg_top_loss: 1.8867 +[titan] 2025-09-10 18:48:11,560 - root - INFO - lr: 2.2902e-06 gnorm: 0.65 [2 days, 19:11:42< 5:51:46] +[titan] 2025-09-10 18:48:43,575 - root - INFO - step: 36795 loss: 2.4711 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.6448 global_avg_top_loss: 1.8264 +[titan] 2025-09-10 18:48:43,575 - root - INFO - lr: 2.2893e-06 gnorm: 0.66 [2 days, 19:12:14< 5:51:13] +[titan] 2025-09-10 18:49:09,108 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:49:15,712 - root - INFO - step: 36800 loss: 2.4147 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.6171 global_avg_top_loss: 1.7976 +[titan] 2025-09-10 18:49:15,712 - root - INFO - lr: 2.2884e-06 gnorm: 0.57 [2 days, 19:12:46< 5:50:40] +[titan] 2025-09-10 18:49:47,731 - root - INFO - step: 36805 loss: 2.4787 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6490 global_avg_top_loss: 1.8296 +[titan] 2025-09-10 18:49:47,731 - root - INFO - lr: 2.2876e-06 gnorm: 0.69 [2 days, 19:13:18< 5:50:07] +[titan] 2025-09-10 18:50:19,828 - root - INFO - step: 36810 loss: 2.5841 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7028 global_avg_top_loss: 1.8813 +[titan] 2025-09-10 18:50:19,828 - root - INFO - lr: 2.2867e-06 gnorm: 0.59 [2 days, 19:13:50< 5:49:34] +[titan] 2025-09-10 18:50:52,042 - root - INFO - step: 36815 loss: 2.5854 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.79 mfu: 49.02% global_avg_ntp_loss: 0.6969 global_avg_top_loss: 1.8885 +[titan] 2025-09-10 18:50:52,043 - root - INFO - lr: 2.2858e-06 gnorm: 0.73 [2 days, 19:14:23< 5:49:01] +[titan] 2025-09-10 18:51:24,486 - root - INFO - step: 36820 loss: 2.5419 memory: 122.03GiB(87.57%) tps: 10,100 tflops: 481.37 mfu: 48.67% global_avg_ntp_loss: 0.6763 global_avg_top_loss: 1.8656 +[titan] 2025-09-10 18:51:24,486 - root - INFO - lr: 2.2849e-06 gnorm: 0.78 [2 days, 19:14:55< 5:48:28] +[titan] 2025-09-10 18:51:56,453 - root - INFO - step: 36825 loss: 2.4398 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.6346 global_avg_top_loss: 1.8052 +[titan] 2025-09-10 18:51:56,453 - root - INFO - lr: 2.2840e-06 gnorm: 0.57 [2 days, 19:15:27< 5:47:55] +[titan] 2025-09-10 18:52:28,651 - root - INFO - step: 36830 loss: 2.4117 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.04 mfu: 49.04% global_avg_ntp_loss: 0.6146 global_avg_top_loss: 1.7971 +[titan] 
2025-09-10 18:52:28,651 - root - INFO - lr: 2.2831e-06 gnorm: 0.65 [2 days, 19:15:59< 5:47:22] +[titan] 2025-09-10 18:53:00,576 - root - INFO - step: 36835 loss: 2.2908 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.5549 global_avg_top_loss: 1.7359 +[titan] 2025-09-10 18:53:00,577 - root - INFO - lr: 2.2822e-06 gnorm: 0.88 [2 days, 19:16:31< 5:46:50] +[titan] 2025-09-10 18:53:32,731 - root - INFO - step: 36840 loss: 2.4213 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.69 mfu: 49.11% global_avg_ntp_loss: 0.6203 global_avg_top_loss: 1.8010 +[titan] 2025-09-10 18:53:32,732 - root - INFO - lr: 2.2813e-06 gnorm: 0.59 [2 days, 19:17:03< 5:46:17] +[titan] 2025-09-10 18:54:04,978 - root - INFO - step: 36845 loss: 2.3533 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.5905 global_avg_top_loss: 1.7627 +[titan] 2025-09-10 18:54:04,978 - root - INFO - lr: 2.2804e-06 gnorm: 0.62 [2 days, 19:17:36< 5:45:44] +[titan] 2025-09-10 18:54:30,652 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:54:37,037 - root - INFO - step: 36850 loss: 2.4913 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.6547 global_avg_top_loss: 1.8366 +[titan] 2025-09-10 18:54:37,037 - root - INFO - lr: 2.2796e-06 gnorm: 0.64 [2 days, 19:18:08< 5:45:11] +[titan] 2025-09-10 18:55:09,081 - root - INFO - step: 36855 loss: 2.4223 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.6187 global_avg_top_loss: 1.8035 +[titan] 2025-09-10 18:55:09,081 - root - INFO - lr: 2.2787e-06 gnorm: 0.61 [2 days, 19:18:40< 5:44:38] +[titan] 2025-09-10 18:55:41,184 - root - INFO - step: 36860 loss: 2.5397 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.6752 global_avg_top_loss: 1.8644 +[titan] 2025-09-10 18:55:41,184 - root - INFO - lr: 2.2778e-06 gnorm: 0.62 [2 days, 19:19:12< 5:44:05] +[titan] 2025-09-10 18:56:07,308 - root - INFO - Dumping profiler traces at step 36864 +[titan] 2025-09-10 18:56:07,383 - root - INFO - Finished dumping profiler traces in 0.08 seconds +[titan] 2025-09-10 18:56:13,841 - root - INFO - step: 36865 loss: 2.6246 memory: 122.03GiB(87.57%) tps: 10,034 tflops: 478.22 mfu: 48.35% global_avg_ntp_loss: 0.7148 global_avg_top_loss: 1.9098 +[titan] 2025-09-10 18:56:13,841 - root - INFO - lr: 2.2769e-06 gnorm: 0.64 [2 days, 19:19:44< 5:43:32] +[titan] 2025-09-10 18:56:46,163 - root - INFO - step: 36870 loss: 2.6480 memory: 122.03GiB(87.57%) tps: 10,138 tflops: 483.17 mfu: 48.85% global_avg_ntp_loss: 0.7281 global_avg_top_loss: 1.9200 +[titan] 2025-09-10 18:56:46,164 - root - INFO - lr: 2.2760e-06 gnorm: 0.70 [2 days, 19:20:17< 5:42:59] +[titan] 2025-09-10 18:57:18,104 - root - INFO - step: 36875 loss: 2.5042 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.6577 global_avg_top_loss: 1.8465 +[titan] 2025-09-10 18:57:18,104 - root - INFO - lr: 2.2752e-06 gnorm: 0.62 [2 days, 19:20:49< 5:42:26] +[titan] 2025-09-10 
18:57:50,216 - root - INFO - step: 36880 loss: 2.6099 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.34 mfu: 49.17% global_avg_ntp_loss: 0.7160 global_avg_top_loss: 1.8939 +[titan] 2025-09-10 18:57:50,216 - root - INFO - lr: 2.2743e-06 gnorm: 0.60 [2 days, 19:21:21< 5:41:53] +[titan] 2025-09-10 18:58:22,464 - root - INFO - step: 36885 loss: 2.4856 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.28 mfu: 48.97% global_avg_ntp_loss: 0.6512 global_avg_top_loss: 1.8344 +[titan] 2025-09-10 18:58:22,464 - root - INFO - lr: 2.2734e-06 gnorm: 0.63 [2 days, 19:21:53< 5:41:20] +[titan] 2025-09-10 18:58:54,551 - root - INFO - step: 36890 loss: 2.7820 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.8073 global_avg_top_loss: 1.9747 +[titan] 2025-09-10 18:58:54,551 - root - INFO - lr: 2.2725e-06 gnorm: 0.56 [2 days, 19:22:25< 5:40:47] +[titan] 2025-09-10 18:59:26,716 - root - INFO - step: 36895 loss: 2.5118 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.54 mfu: 49.09% global_avg_ntp_loss: 0.6659 global_avg_top_loss: 1.8459 +[titan] 2025-09-10 18:59:26,716 - root - INFO - lr: 2.2717e-06 gnorm: 0.67 [2 days, 19:22:57< 5:40:14] +[titan] 2025-09-10 18:59:52,317 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 18:59:58,857 - root - INFO - step: 36900 loss: 2.4829 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.89 mfu: 49.13% global_avg_ntp_loss: 0.6571 global_avg_top_loss: 1.8258 +[titan] 2025-09-10 18:59:58,858 - root - INFO - lr: 2.2708e-06 gnorm: 0.75 [2 days, 19:23:29< 5:39:41] +[titan] 2025-09-10 19:00:30,918 - root - INFO - step: 36905 loss: 2.4111 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.6148 global_avg_top_loss: 1.7963 +[titan] 2025-09-10 19:00:30,918 - root - INFO - lr: 2.2699e-06 gnorm: 0.57 [2 days, 19:24:02< 5:39:08] +[titan] 2025-09-10 19:01:03,044 - root - INFO - step: 36910 loss: 2.3852 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.12 mfu: 49.15% global_avg_ntp_loss: 0.6039 global_avg_top_loss: 1.7813 +[titan] 2025-09-10 19:01:03,044 - root - INFO - lr: 2.2691e-06 gnorm: 0.71 [2 days, 19:24:34< 5:38:35] +[titan] 2025-09-10 19:01:35,224 - root - INFO - step: 36915 loss: 2.3091 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.31 mfu: 49.07% global_avg_ntp_loss: 0.5638 global_avg_top_loss: 1.7453 +[titan] 2025-09-10 19:01:35,224 - root - INFO - lr: 2.2682e-06 gnorm: 0.78 [2 days, 19:25:06< 5:38:03] +[titan] 2025-09-10 19:02:07,229 - root - INFO - step: 36920 loss: 2.4963 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.6535 global_avg_top_loss: 1.8428 +[titan] 2025-09-10 19:02:07,229 - root - INFO - lr: 2.2673e-06 gnorm: 0.58 [2 days, 19:25:38< 5:37:30] +[titan] 2025-09-10 19:02:39,448 - root - INFO - step: 36925 loss: 2.8161 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.72 mfu: 49.01% global_avg_ntp_loss: 0.8416 global_avg_top_loss: 1.9745 +[titan] 2025-09-10 19:02:39,448 - root - INFO - lr: 2.2665e-06 gnorm: 0.63 [2 days, 19:26:10< 5:36:57] +[titan] 2025-09-10 19:03:11,515 - root - INFO - step: 36930 loss: 2.5064 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.6582 global_avg_top_loss: 1.8482 +[titan] 
2025-09-10 19:03:11,515 - root - INFO - lr: 2.2656e-06 gnorm: 0.61 [2 days, 19:26:42< 5:36:24] +[titan] 2025-09-10 19:03:43,678 - root - INFO - step: 36935 loss: 2.4026 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.57 mfu: 49.10% global_avg_ntp_loss: 0.6125 global_avg_top_loss: 1.7901 +[titan] 2025-09-10 19:03:43,678 - root - INFO - lr: 2.2647e-06 gnorm: 0.62 [2 days, 19:27:14< 5:35:51] +[titan] 2025-09-10 19:04:15,920 - root - INFO - step: 36940 loss: 2.4797 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.37 mfu: 48.98% global_avg_ntp_loss: 0.6472 global_avg_top_loss: 1.8325 +[titan] 2025-09-10 19:04:15,920 - root - INFO - lr: 2.2639e-06 gnorm: 0.60 [2 days, 19:27:47< 5:35:18] +[titan] 2025-09-10 19:04:48,042 - root - INFO - step: 36945 loss: 2.6491 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.19 mfu: 49.16% global_avg_ntp_loss: 0.7295 global_avg_top_loss: 1.9196 +[titan] 2025-09-10 19:04:48,042 - root - INFO - lr: 2.2630e-06 gnorm: 0.66 [2 days, 19:28:19< 5:34:45] +[titan] 2025-09-10 19:05:13,655 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:05:20,112 - root - INFO - step: 36950 loss: 2.6526 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.7287 global_avg_top_loss: 1.9239 +[titan] 2025-09-10 19:05:20,113 - root - INFO - lr: 2.2622e-06 gnorm: 0.70 [2 days, 19:28:51< 5:34:12] +[titan] 2025-09-10 19:05:52,059 - root - INFO - step: 36955 loss: 2.4818 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6492 global_avg_top_loss: 1.8326 +[titan] 2025-09-10 19:05:52,059 - root - INFO - lr: 2.2613e-06 gnorm: 0.59 [2 days, 19:29:23< 5:33:39] +[titan] 2025-09-10 19:06:24,213 - root - INFO - step: 36960 loss: 2.4561 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.70 mfu: 49.11% global_avg_ntp_loss: 0.6326 global_avg_top_loss: 1.8234 +[titan] 2025-09-10 19:06:24,213 - root - INFO - lr: 2.2605e-06 gnorm: 0.69 [2 days, 19:29:55< 5:33:06] +[titan] 2025-09-10 19:06:56,262 - root - INFO - step: 36965 loss: 2.5354 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.6835 global_avg_top_loss: 1.8519 +[titan] 2025-09-10 19:06:56,263 - root - INFO - lr: 2.2596e-06 gnorm: 0.61 [2 days, 19:30:27< 5:32:33] +[titan] 2025-09-10 19:07:28,414 - root - INFO - step: 36970 loss: 2.6428 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7268 global_avg_top_loss: 1.9160 +[titan] 2025-09-10 19:07:28,414 - root - INFO - lr: 2.2588e-06 gnorm: 0.59 [2 days, 19:30:59< 5:32:00] +[titan] 2025-09-10 19:08:00,509 - root - INFO - step: 36975 loss: 2.6267 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7170 global_avg_top_loss: 1.9096 +[titan] 2025-09-10 19:08:00,509 - root - INFO - lr: 2.2579e-06 gnorm: 0.74 [2 days, 19:31:31< 5:31:27] +[titan] 2025-09-10 19:08:32,513 - root - INFO - step: 36980 loss: 2.5454 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.6862 global_avg_top_loss: 1.8592 +[titan] 
2025-09-10 19:08:32,513 - root - INFO - lr: 2.2571e-06 gnorm: 0.69 [2 days, 19:32:03< 5:30:54] +[titan] 2025-09-10 19:09:04,522 - root - INFO - step: 36985 loss: 2.3213 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.5771 global_avg_top_loss: 1.7442 +[titan] 2025-09-10 19:09:04,523 - root - INFO - lr: 2.2562e-06 gnorm: 0.56 [2 days, 19:32:35< 5:30:21] +[titan] 2025-09-10 19:09:36,729 - root - INFO - step: 36990 loss: 2.3764 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.90 mfu: 49.03% global_avg_ntp_loss: 0.5955 global_avg_top_loss: 1.7809 +[titan] 2025-09-10 19:09:36,729 - root - INFO - lr: 2.2554e-06 gnorm: 0.68 [2 days, 19:33:07< 5:29:49] +[titan] 2025-09-10 19:10:08,734 - root - INFO - step: 36995 loss: 2.3201 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.5694 global_avg_top_loss: 1.7508 +[titan] 2025-09-10 19:10:08,734 - root - INFO - lr: 2.2545e-06 gnorm: 0.75 [2 days, 19:33:39< 5:29:16] +[titan] 2025-09-10 19:10:34,383 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:10:40,815 - root - INFO - step: 37000 loss: 2.4252 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.81 mfu: 49.22% global_avg_ntp_loss: 0.6207 global_avg_top_loss: 1.8045 +[titan] 2025-09-10 19:10:40,815 - root - INFO - lr: 2.2537e-06 gnorm: 0.58 [2 days, 19:34:11< 5:28:43] +[titan] 2025-09-10 19:11:13,204 - root - INFO - step: 37005 loss: 2.4205 memory: 122.03GiB(87.57%) tps: 10,117 tflops: 482.17 mfu: 48.75% global_avg_ntp_loss: 0.6170 global_avg_top_loss: 1.8035 +[titan] 2025-09-10 19:11:13,204 - root - INFO - lr: 2.2528e-06 gnorm: 0.60 [2 days, 19:34:44< 5:28:10] +[titan] 2025-09-10 19:11:45,340 - root - INFO - step: 37010 loss: 2.4561 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.98 mfu: 49.14% global_avg_ntp_loss: 0.6395 global_avg_top_loss: 1.8165 +[titan] 2025-09-10 19:11:45,340 - root - INFO - lr: 2.2520e-06 gnorm: 0.60 [2 days, 19:35:16< 5:27:37] +[titan] 2025-09-10 19:12:17,494 - root - INFO - step: 37015 loss: 2.3534 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.70 mfu: 49.11% global_avg_ntp_loss: 0.5904 global_avg_top_loss: 1.7630 +[titan] 2025-09-10 19:12:17,494 - root - INFO - lr: 2.2512e-06 gnorm: 0.58 [2 days, 19:35:48< 5:27:04] +[titan] 2025-09-10 19:12:49,902 - root - INFO - step: 37020 loss: 2.5383 memory: 122.03GiB(87.57%) tps: 10,111 tflops: 481.89 mfu: 48.73% global_avg_ntp_loss: 0.6725 global_avg_top_loss: 1.8658 +[titan] 2025-09-10 19:12:49,902 - root - INFO - lr: 2.2503e-06 gnorm: 0.60 [2 days, 19:36:21< 5:26:31] +[titan] 2025-09-10 19:13:22,023 - root - INFO - step: 37025 loss: 2.5990 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.20 mfu: 49.16% global_avg_ntp_loss: 0.7072 global_avg_top_loss: 1.8917 +[titan] 2025-09-10 19:13:22,023 - root - INFO - lr: 2.2495e-06 gnorm: 0.61 [2 days, 19:36:53< 5:25:58] +[titan] 2025-09-10 19:13:54,252 - root - INFO - step: 37030 loss: 2.5670 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.57 mfu: 49.00% global_avg_ntp_loss: 0.6910 global_avg_top_loss: 1.8760 +[titan] 
2025-09-10 19:13:54,252 - root - INFO - lr: 2.2487e-06 gnorm: 0.69 [2 days, 19:37:25< 5:25:25] +[titan] 2025-09-10 19:14:26,528 - root - INFO - step: 37035 loss: 2.4955 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.86 mfu: 48.92% global_avg_ntp_loss: 0.6514 global_avg_top_loss: 1.8440 +[titan] 2025-09-10 19:14:26,529 - root - INFO - lr: 2.2478e-06 gnorm: 0.65 [2 days, 19:37:57< 5:24:52] +[titan] 2025-09-10 19:14:58,489 - root - INFO - step: 37040 loss: 2.5111 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.6573 global_avg_top_loss: 1.8538 +[titan] 2025-09-10 19:14:58,489 - root - INFO - lr: 2.2470e-06 gnorm: 0.64 [2 days, 19:38:29< 5:24:19] +[titan] 2025-09-10 19:15:30,557 - root - INFO - step: 37045 loss: 2.5538 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.6768 global_avg_top_loss: 1.8770 +[titan] 2025-09-10 19:15:30,557 - root - INFO - lr: 2.2462e-06 gnorm: 0.72 [2 days, 19:39:01< 5:23:46] +[titan] 2025-09-10 19:15:56,233 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:16:02,769 - root - INFO - step: 37050 loss: 2.5232 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.81 mfu: 49.02% global_avg_ntp_loss: 0.6688 global_avg_top_loss: 1.8543 +[titan] 2025-09-10 19:16:02,770 - root - INFO - lr: 2.2453e-06 gnorm: 0.58 [2 days, 19:39:33< 5:23:13] +[titan] 2025-09-10 19:16:34,969 - root - INFO - step: 37055 loss: 2.5388 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.01 mfu: 49.04% global_avg_ntp_loss: 0.6748 global_avg_top_loss: 1.8640 +[titan] 2025-09-10 19:16:34,969 - root - INFO - lr: 2.2445e-06 gnorm: 0.66 [2 days, 19:40:06< 5:22:40] +[titan] 2025-09-10 19:17:07,133 - root - INFO - step: 37060 loss: 2.5828 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.10% global_avg_ntp_loss: 0.7006 global_avg_top_loss: 1.8822 +[titan] 2025-09-10 19:17:07,133 - root - INFO - lr: 2.2437e-06 gnorm: 0.72 [2 days, 19:40:38< 5:22:08] +[titan] 2025-09-10 19:17:39,270 - root - INFO - step: 37065 loss: 2.3165 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.5733 global_avg_top_loss: 1.7432 +[titan] 2025-09-10 19:17:39,270 - root - INFO - lr: 2.2429e-06 gnorm: 0.54 [2 days, 19:41:10< 5:21:35] +[titan] 2025-09-10 19:18:11,537 - root - INFO - step: 37070 loss: 2.4166 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 484.00 mfu: 48.94% global_avg_ntp_loss: 0.6133 global_avg_top_loss: 1.8034 +[titan] 2025-09-10 19:18:11,537 - root - INFO - lr: 2.2420e-06 gnorm: 0.71 [2 days, 19:41:42< 5:21:02] +[titan] 2025-09-10 19:18:43,573 - root - INFO - step: 37075 loss: 2.3048 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.5619 global_avg_top_loss: 1.7429 +[titan] 2025-09-10 19:18:43,574 - root - INFO - lr: 2.2412e-06 gnorm: 0.74 [2 days, 19:42:14< 5:20:29] +[titan] 2025-09-10 19:19:15,553 - root - INFO - step: 37080 loss: 2.8875 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.8840 global_avg_top_loss: 2.0035 +[titan] 
2025-09-10 19:19:15,553 - root - INFO - lr: 2.2404e-06 gnorm: 0.58 [2 days, 19:42:46< 5:19:56] +[titan] 2025-09-10 19:19:47,928 - root - INFO - step: 37085 loss: 2.4873 memory: 122.03GiB(87.57%) tps: 10,121 tflops: 482.38 mfu: 48.77% global_avg_ntp_loss: 0.6432 global_avg_top_loss: 1.8441 +[titan] 2025-09-10 19:19:47,928 - root - INFO - lr: 2.2396e-06 gnorm: 0.62 [2 days, 19:43:19< 5:19:23] +[titan] 2025-09-10 19:20:20,078 - root - INFO - step: 37090 loss: 2.4442 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.76 mfu: 49.12% global_avg_ntp_loss: 0.6313 global_avg_top_loss: 1.8130 +[titan] 2025-09-10 19:20:20,078 - root - INFO - lr: 2.2388e-06 gnorm: 0.60 [2 days, 19:43:51< 5:18:50] +[titan] 2025-09-10 19:20:52,516 - root - INFO - step: 37095 loss: 2.8535 memory: 122.03GiB(87.57%) tps: 10,102 tflops: 481.45 mfu: 48.68% global_avg_ntp_loss: 0.8673 global_avg_top_loss: 1.9861 +[titan] 2025-09-10 19:20:52,516 - root - INFO - lr: 2.2379e-06 gnorm: 0.63 [2 days, 19:44:23< 5:18:17] +[titan] 2025-09-10 19:21:18,197 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:21:24,586 - root - INFO - step: 37100 loss: 2.5436 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.6734 global_avg_top_loss: 1.8702 +[titan] 2025-09-10 19:21:24,586 - root - INFO - lr: 2.2371e-06 gnorm: 0.65 [2 days, 19:44:55< 5:17:44] +[titan] 2025-09-10 19:21:56,795 - root - INFO - step: 37105 loss: 2.4813 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.88 mfu: 49.03% global_avg_ntp_loss: 0.6512 global_avg_top_loss: 1.8300 +[titan] 2025-09-10 19:21:56,795 - root - INFO - lr: 2.2363e-06 gnorm: 0.60 [2 days, 19:45:27< 5:17:11] +[titan] 2025-09-10 19:22:29,070 - root - INFO - step: 37110 loss: 2.6574 memory: 122.03GiB(87.57%) tps: 10,153 tflops: 483.87 mfu: 48.92% global_avg_ntp_loss: 0.7315 global_avg_top_loss: 1.9259 +[titan] 2025-09-10 19:22:29,071 - root - INFO - lr: 2.2355e-06 gnorm: 0.69 [2 days, 19:46:00< 5:16:38] +[titan] 2025-09-10 19:23:01,269 - root - INFO - step: 37115 loss: 2.4426 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.6281 global_avg_top_loss: 1.8146 +[titan] 2025-09-10 19:23:01,270 - root - INFO - lr: 2.2347e-06 gnorm: 0.67 [2 days, 19:46:32< 5:16:05] +[titan] 2025-09-10 19:23:33,530 - root - INFO - step: 37120 loss: 2.5508 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.10 mfu: 48.95% global_avg_ntp_loss: 0.6789 global_avg_top_loss: 1.8719 +[titan] 2025-09-10 19:23:33,530 - root - INFO - lr: 2.2339e-06 gnorm: 0.63 [2 days, 19:47:04< 5:15:32] +[titan] 2025-09-10 19:24:05,710 - root - INFO - step: 37125 loss: 2.4215 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.31 mfu: 49.07% global_avg_ntp_loss: 0.6174 global_avg_top_loss: 1.8041 +[titan] 2025-09-10 19:24:05,710 - root - INFO - lr: 2.2331e-06 gnorm: 0.66 [2 days, 19:47:36< 5:15:00] +[titan] 2025-09-10 19:24:37,708 - root - INFO - step: 37130 loss: 2.5843 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.6961 global_avg_top_loss: 1.8881 +[titan] 
2025-09-10 19:24:37,709 - root - INFO - lr: 2.2323e-06 gnorm: 0.61 [2 days, 19:48:08< 5:14:27] +[titan] 2025-09-10 19:25:09,660 - root - INFO - step: 37135 loss: 2.5568 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.6829 global_avg_top_loss: 1.8738 +[titan] 2025-09-10 19:25:09,661 - root - INFO - lr: 2.2315e-06 gnorm: 0.64 [2 days, 19:48:40< 5:13:54] +[titan] 2025-09-10 19:25:41,614 - root - INFO - step: 37140 loss: 2.4903 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.6536 global_avg_top_loss: 1.8367 +[titan] 2025-09-10 19:25:41,614 - root - INFO - lr: 2.2307e-06 gnorm: 0.73 [2 days, 19:49:12< 5:13:21] +[titan] 2025-09-10 19:26:13,829 - root - INFO - step: 37145 loss: 2.3683 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.77 mfu: 49.02% global_avg_ntp_loss: 0.5952 global_avg_top_loss: 1.7730 +[titan] 2025-09-10 19:26:13,830 - root - INFO - lr: 2.2299e-06 gnorm: 0.59 [2 days, 19:49:44< 5:12:48] +[titan] 2025-09-10 19:26:39,443 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:26:45,874 - root - INFO - step: 37150 loss: 2.3904 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.6028 global_avg_top_loss: 1.7876 +[titan] 2025-09-10 19:26:45,874 - root - INFO - lr: 2.2291e-06 gnorm: 0.72 [2 days, 19:50:16< 5:12:15] +[titan] 2025-09-10 19:27:17,993 - root - INFO - step: 37155 loss: 2.3340 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 0.5771 global_avg_top_loss: 1.7570 +[titan] 2025-09-10 19:27:17,994 - root - INFO - lr: 2.2283e-06 gnorm: 0.91 [2 days, 19:50:49< 5:11:42] +[titan] 2025-09-10 19:27:50,097 - root - INFO - step: 37160 loss: 2.2777 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.5597 global_avg_top_loss: 1.7180 +[titan] 2025-09-10 19:27:50,097 - root - INFO - lr: 2.2275e-06 gnorm: 0.55 [2 days, 19:51:21< 5:11:09] +[titan] 2025-09-10 19:28:22,258 - root - INFO - step: 37165 loss: 2.6007 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.59 mfu: 49.10% global_avg_ntp_loss: 0.7219 global_avg_top_loss: 1.8788 +[titan] 2025-09-10 19:28:22,258 - root - INFO - lr: 2.2267e-06 gnorm: 0.60 [2 days, 19:51:53< 5:10:36] +[titan] 2025-09-10 19:28:54,435 - root - INFO - step: 37170 loss: 2.4918 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.36 mfu: 49.08% global_avg_ntp_loss: 0.6550 global_avg_top_loss: 1.8368 +[titan] 2025-09-10 19:28:54,435 - root - INFO - lr: 2.2259e-06 gnorm: 0.69 [2 days, 19:52:25< 5:10:03] +[titan] 2025-09-10 19:29:26,541 - root - INFO - step: 37175 loss: 2.8816 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.8808 global_avg_top_loss: 2.0008 +[titan] 2025-09-10 19:29:26,541 - root - INFO - lr: 2.2251e-06 gnorm: 0.66 [2 days, 19:52:57< 5:09:30] +[titan] 2025-09-10 19:29:58,567 - root - INFO - step: 37180 loss: 2.4901 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.6519 global_avg_top_loss: 1.8382 +[titan] 
2025-09-10 19:29:58,567 - root - INFO - lr: 2.2243e-06 gnorm: 0.60 [2 days, 19:53:29< 5:08:57] +[titan] 2025-09-10 19:30:30,541 - root - INFO - step: 37185 loss: 2.5910 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8901 +[titan] 2025-09-10 19:30:30,541 - root - INFO - lr: 2.2235e-06 gnorm: 0.63 [2 days, 19:54:01< 5:08:24] +[titan] 2025-09-10 19:31:02,688 - root - INFO - step: 37190 loss: 2.9052 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.8744 global_avg_top_loss: 2.0308 +[titan] 2025-09-10 19:31:02,689 - root - INFO - lr: 2.2227e-06 gnorm: 0.70 [2 days, 19:54:33< 5:07:51] +[titan] 2025-09-10 19:31:34,772 - root - INFO - step: 37195 loss: 2.4900 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.6532 global_avg_top_loss: 1.8368 +[titan] 2025-09-10 19:31:34,772 - root - INFO - lr: 2.2219e-06 gnorm: 1.07 [2 days, 19:55:05< 5:07:19] +[titan] 2025-09-10 19:32:00,710 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:32:07,063 - root - INFO - step: 37200 loss: 2.4763 memory: 122.03GiB(87.57%) tps: 10,148 tflops: 483.64 mfu: 48.90% global_avg_ntp_loss: 0.6473 global_avg_top_loss: 1.8291 +[titan] 2025-09-10 19:32:07,063 - root - INFO - lr: 2.2211e-06 gnorm: 0.58 [2 days, 19:55:38< 5:06:46] +[titan] 2025-09-10 19:32:39,200 - root - INFO - step: 37205 loss: 2.4736 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.95 mfu: 49.14% global_avg_ntp_loss: 0.6444 global_avg_top_loss: 1.8291 +[titan] 2025-09-10 19:32:39,200 - root - INFO - lr: 2.2203e-06 gnorm: 0.60 [2 days, 19:56:10< 5:06:13] +[titan] 2025-09-10 19:33:11,372 - root - INFO - step: 37210 loss: 2.5382 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.43 mfu: 49.08% global_avg_ntp_loss: 0.6762 global_avg_top_loss: 1.8620 +[titan] 2025-09-10 19:33:11,372 - root - INFO - lr: 2.2196e-06 gnorm: 0.57 [2 days, 19:56:42< 5:05:40] +[titan] 2025-09-10 19:33:43,597 - root - INFO - step: 37215 loss: 2.5483 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.64 mfu: 49.00% global_avg_ntp_loss: 0.6908 global_avg_top_loss: 1.8575 +[titan] 2025-09-10 19:33:43,597 - root - INFO - lr: 2.2188e-06 gnorm: 0.69 [2 days, 19:57:14< 5:05:07] +[titan] 2025-09-10 19:34:15,808 - root - INFO - step: 37220 loss: 2.4966 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.84 mfu: 49.02% global_avg_ntp_loss: 0.6582 global_avg_top_loss: 1.8384 +[titan] 2025-09-10 19:34:15,808 - root - INFO - lr: 2.2180e-06 gnorm: 0.71 [2 days, 19:57:46< 5:04:34] +[titan] 2025-09-10 19:34:47,941 - root - INFO - step: 37225 loss: 2.3660 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.6037 global_avg_top_loss: 1.7623 +[titan] 2025-09-10 19:34:47,941 - root - INFO - lr: 2.2172e-06 gnorm: 0.57 [2 days, 19:58:19< 5:04:01] +[titan] 2025-09-10 19:35:19,932 - root - INFO - step: 37230 loss: 2.2785 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.5523 global_avg_top_loss: 1.7261 +[titan] 
2025-09-10 19:35:19,932 - root - INFO - lr: 2.2164e-06 gnorm: 0.67 [2 days, 19:58:51< 5:03:28] +[titan] 2025-09-10 19:35:52,049 - root - INFO - step: 37235 loss: 2.3499 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.5836 global_avg_top_loss: 1.7663 +[titan] 2025-09-10 19:35:52,050 - root - INFO - lr: 2.2156e-06 gnorm: 0.75 [2 days, 19:59:23< 5:02:55] +[titan] 2025-09-10 19:36:24,262 - root - INFO - step: 37240 loss: 2.5123 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.82 mfu: 49.02% global_avg_ntp_loss: 0.6649 global_avg_top_loss: 1.8475 +[titan] 2025-09-10 19:36:24,262 - root - INFO - lr: 2.2149e-06 gnorm: 0.58 [2 days, 19:59:55< 5:02:22] +[titan] 2025-09-10 19:36:56,447 - root - INFO - step: 37245 loss: 2.3305 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.5763 global_avg_top_loss: 1.7541 +[titan] 2025-09-10 19:36:56,447 - root - INFO - lr: 2.2141e-06 gnorm: 0.62 [2 days, 20:00:27< 5:01:49] +[titan] 2025-09-10 19:37:22,160 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:37:28,512 - root - INFO - step: 37250 loss: 2.5024 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.6603 global_avg_top_loss: 1.8421 +[titan] 2025-09-10 19:37:28,512 - root - INFO - lr: 2.2133e-06 gnorm: 0.62 [2 days, 20:00:59< 5:01:16] +[titan] 2025-09-10 19:38:00,749 - root - INFO - step: 37255 loss: 2.9148 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.8918 global_avg_top_loss: 2.0230 +[titan] 2025-09-10 19:38:00,749 - root - INFO - lr: 2.2126e-06 gnorm: 0.63 [2 days, 20:01:31< 5:00:43] +[titan] 2025-09-10 19:38:32,987 - root - INFO - step: 37260 loss: 2.7273 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.42 mfu: 48.98% global_avg_ntp_loss: 0.7848 global_avg_top_loss: 1.9424 +[titan] 2025-09-10 19:38:32,988 - root - INFO - lr: 2.2118e-06 gnorm: 0.60 [2 days, 20:02:04< 5:00:11] +[titan] 2025-09-10 19:39:05,052 - root - INFO - step: 37265 loss: 2.5643 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.6888 global_avg_top_loss: 1.8755 +[titan] 2025-09-10 19:39:05,052 - root - INFO - lr: 2.2110e-06 gnorm: 0.62 [2 days, 20:02:36< 4:59:38] +[titan] 2025-09-10 19:39:37,051 - root - INFO - step: 37270 loss: 2.7608 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.7961 global_avg_top_loss: 1.9647 +[titan] 2025-09-10 19:39:37,051 - root - INFO - lr: 2.2102e-06 gnorm: 0.67 [2 days, 20:03:08< 4:59:05] +[titan] 2025-09-10 19:40:09,285 - root - INFO - step: 37275 loss: 2.5118 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.6616 global_avg_top_loss: 1.8501 +[titan] 2025-09-10 19:40:09,285 - root - INFO - lr: 2.2095e-06 gnorm: 0.66 [2 days, 20:03:40< 4:58:32] +[titan] 2025-09-10 19:40:41,462 - root - INFO - step: 37280 loss: 2.4844 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.35 mfu: 49.07% global_avg_ntp_loss: 0.6508 global_avg_top_loss: 1.8336 +[titan] 
2025-09-10 19:40:41,463 - root - INFO - lr: 2.2087e-06 gnorm: 0.58 [2 days, 20:04:12< 4:57:59] +[titan] 2025-09-10 19:41:13,433 - root - INFO - step: 37285 loss: 2.5240 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.6692 global_avg_top_loss: 1.8548 +[titan] 2025-09-10 19:41:13,433 - root - INFO - lr: 2.2080e-06 gnorm: 0.66 [2 days, 20:04:44< 4:57:26] +[titan] 2025-09-10 19:41:45,540 - root - INFO - step: 37290 loss: 2.4783 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.6513 global_avg_top_loss: 1.8269 +[titan] 2025-09-10 19:41:45,540 - root - INFO - lr: 2.2072e-06 gnorm: 0.57 [2 days, 20:05:16< 4:56:53] +[titan] 2025-09-10 19:42:17,583 - root - INFO - step: 37295 loss: 2.4637 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.6416 global_avg_top_loss: 1.8222 +[titan] 2025-09-10 19:42:17,583 - root - INFO - lr: 2.2064e-06 gnorm: 0.64 [2 days, 20:05:48< 4:56:20] +[titan] 2025-09-10 19:42:43,292 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:42:49,690 - root - INFO - step: 37300 loss: 2.5051 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.6632 global_avg_top_loss: 1.8419 +[titan] 2025-09-10 19:42:49,691 - root - INFO - lr: 2.2057e-06 gnorm: 0.76 [2 days, 20:06:20< 4:55:47] +[titan] 2025-09-10 19:43:22,001 - root - INFO - step: 37305 loss: 2.3414 memory: 122.03GiB(87.57%) tps: 10,142 tflops: 483.35 mfu: 48.87% global_avg_ntp_loss: 0.5901 global_avg_top_loss: 1.7514 +[titan] 2025-09-10 19:43:22,001 - root - INFO - lr: 2.2049e-06 gnorm: 0.59 [2 days, 20:06:53< 4:55:14] +[titan] 2025-09-10 19:43:54,308 - root - INFO - step: 37310 loss: 2.4070 memory: 122.03GiB(87.57%) tps: 10,143 tflops: 483.40 mfu: 48.88% global_avg_ntp_loss: 0.6113 global_avg_top_loss: 1.7956 +[titan] 2025-09-10 19:43:54,308 - root - INFO - lr: 2.2042e-06 gnorm: 0.66 [2 days, 20:07:25< 4:54:41] +[titan] 2025-09-10 19:44:26,453 - root - INFO - step: 37315 loss: 2.3650 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.5884 global_avg_top_loss: 1.7766 +[titan] 2025-09-10 19:44:26,454 - root - INFO - lr: 2.2034e-06 gnorm: 0.85 [2 days, 20:07:57< 4:54:08] +[titan] 2025-09-10 19:44:58,665 - root - INFO - step: 37320 loss: 2.3939 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.83 mfu: 49.02% global_avg_ntp_loss: 0.6030 global_avg_top_loss: 1.7909 +[titan] 2025-09-10 19:44:58,666 - root - INFO - lr: 2.2026e-06 gnorm: 0.59 [2 days, 20:08:29< 4:53:36] +[titan] 2025-09-10 19:45:30,749 - root - INFO - step: 37325 loss: 2.4942 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.6499 global_avg_top_loss: 1.8442 +[titan] 2025-09-10 19:45:30,749 - root - INFO - lr: 2.2019e-06 gnorm: 0.64 [2 days, 20:09:01< 4:53:03] +[titan] 2025-09-10 19:46:02,872 - root - INFO - step: 37330 loss: 2.5111 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.6610 global_avg_top_loss: 1.8501 +[titan] 
2025-09-10 19:46:02,872 - root - INFO - lr: 2.2011e-06 gnorm: 0.67 [2 days, 20:09:33< 4:52:30] +[titan] 2025-09-10 19:46:35,151 - root - INFO - step: 37335 loss: 2.8635 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.81 mfu: 48.92% global_avg_ntp_loss: 0.8713 global_avg_top_loss: 1.9922 +[titan] 2025-09-10 19:46:35,152 - root - INFO - lr: 2.2004e-06 gnorm: 0.71 [2 days, 20:10:06< 4:51:57] +[titan] 2025-09-10 19:47:07,183 - root - INFO - step: 37340 loss: 2.5555 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.6821 global_avg_top_loss: 1.8734 +[titan] 2025-09-10 19:47:07,184 - root - INFO - lr: 2.1996e-06 gnorm: 0.61 [2 days, 20:10:38< 4:51:24] +[titan] 2025-09-10 19:47:39,413 - root - INFO - step: 37345 loss: 2.5525 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.56 mfu: 48.99% global_avg_ntp_loss: 0.6866 global_avg_top_loss: 1.8658 +[titan] 2025-09-10 19:47:39,414 - root - INFO - lr: 2.1989e-06 gnorm: 0.61 [2 days, 20:11:10< 4:50:51] +[titan] 2025-09-10 19:48:05,074 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:48:11,554 - root - INFO - step: 37350 loss: 2.5293 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.6784 global_avg_top_loss: 1.8509 +[titan] 2025-09-10 19:48:11,555 - root - INFO - lr: 2.1981e-06 gnorm: 0.71 [2 days, 20:11:42< 4:50:18] +[titan] 2025-09-10 19:48:43,573 - root - INFO - step: 37355 loss: 2.4693 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6417 global_avg_top_loss: 1.8276 +[titan] 2025-09-10 19:48:43,574 - root - INFO - lr: 2.1974e-06 gnorm: 0.64 [2 days, 20:12:14< 4:49:45] +[titan] 2025-09-10 19:49:15,656 - root - INFO - step: 37360 loss: 2.4732 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.6411 global_avg_top_loss: 1.8320 +[titan] 2025-09-10 19:49:15,656 - root - INFO - lr: 2.1967e-06 gnorm: 0.63 [2 days, 20:12:46< 4:49:12] +[titan] 2025-09-10 19:49:47,832 - root - INFO - step: 37365 loss: 2.5321 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.37 mfu: 49.08% global_avg_ntp_loss: 0.6707 global_avg_top_loss: 1.8613 +[titan] 2025-09-10 19:49:47,832 - root - INFO - lr: 2.1959e-06 gnorm: 0.73 [2 days, 20:13:18< 4:48:39] +[titan] 2025-09-10 19:50:19,917 - root - INFO - step: 37370 loss: 2.5552 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.21% global_avg_ntp_loss: 0.6838 global_avg_top_loss: 1.8714 +[titan] 2025-09-10 19:50:19,918 - root - INFO - lr: 2.1952e-06 gnorm: 0.60 [2 days, 20:13:50< 4:48:06] +[titan] 2025-09-10 19:50:51,991 - root - INFO - step: 37375 loss: 2.4350 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.6297 global_avg_top_loss: 1.8054 +[titan] 2025-09-10 19:50:51,991 - root - INFO - lr: 2.1944e-06 gnorm: 0.70 [2 days, 20:14:23< 4:47:33] +[titan] 2025-09-10 19:50:58,622 - root - INFO - Dumping profiler traces at step 37376 +[titan] 2025-09-10 19:50:58,678 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-10 
19:51:24,221 - root - INFO - step: 37380 loss: 2.5429 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.56 mfu: 48.99% global_avg_ntp_loss: 0.6801 global_avg_top_loss: 1.8629 +[titan] 2025-09-10 19:51:24,221 - root - INFO - lr: 2.1937e-06 gnorm: 0.72 [2 days, 20:14:55< 4:47:01] +[titan] 2025-09-10 19:51:56,396 - root - INFO - step: 37385 loss: 2.3901 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.38 mfu: 49.08% global_avg_ntp_loss: 0.6034 global_avg_top_loss: 1.7868 +[titan] 2025-09-10 19:51:56,396 - root - INFO - lr: 2.1930e-06 gnorm: 0.57 [2 days, 20:15:27< 4:46:28] +[titan] 2025-09-10 19:52:28,377 - root - INFO - step: 37390 loss: 2.3837 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.38% global_avg_ntp_loss: 0.6014 global_avg_top_loss: 1.7822 +[titan] 2025-09-10 19:52:28,377 - root - INFO - lr: 2.1922e-06 gnorm: 0.66 [2 days, 20:15:59< 4:45:55] +[titan] 2025-09-10 19:53:00,352 - root - INFO - step: 37395 loss: 2.3728 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.38% global_avg_ntp_loss: 0.5941 global_avg_top_loss: 1.7786 +[titan] 2025-09-10 19:53:00,353 - root - INFO - lr: 2.1915e-06 gnorm: 0.89 [2 days, 20:16:31< 4:45:22] +[titan] 2025-09-10 19:53:25,877 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:53:32,408 - root - INFO - step: 37400 loss: 2.5206 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6691 global_avg_top_loss: 1.8516 +[titan] 2025-09-10 19:53:32,409 - root - INFO - lr: 2.1908e-06 gnorm: 0.58 [2 days, 20:17:03< 4:44:49] +[titan] 2025-09-10 19:54:04,577 - root - INFO - step: 37405 loss: 2.3526 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.48 mfu: 49.09% global_avg_ntp_loss: 0.5861 global_avg_top_loss: 1.7665 +[titan] 2025-09-10 19:54:04,577 - root - INFO - lr: 2.1900e-06 gnorm: 0.61 [2 days, 20:17:35< 4:44:16] +[titan] 2025-09-10 19:54:36,695 - root - INFO - step: 37410 loss: 2.4234 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.24 mfu: 49.16% global_avg_ntp_loss: 0.6173 global_avg_top_loss: 1.8061 +[titan] 2025-09-10 19:54:36,696 - root - INFO - lr: 2.1893e-06 gnorm: 0.67 [2 days, 20:18:07< 4:43:43] +[titan] 2025-09-10 19:55:09,103 - root - INFO - step: 37415 loss: 2.9246 memory: 122.03GiB(87.57%) tps: 10,111 tflops: 481.89 mfu: 48.73% global_avg_ntp_loss: 0.8992 global_avg_top_loss: 2.0255 +[titan] 2025-09-10 19:55:09,104 - root - INFO - lr: 2.1886e-06 gnorm: 0.65 [2 days, 20:18:40< 4:43:10] +[titan] 2025-09-10 19:55:41,473 - root - INFO - step: 37420 loss: 2.5340 memory: 122.03GiB(87.57%) tps: 10,123 tflops: 482.47 mfu: 48.78% global_avg_ntp_loss: 0.6735 global_avg_top_loss: 1.8605 +[titan] 2025-09-10 19:55:41,473 - root - INFO - lr: 2.1879e-06 gnorm: 0.63 [2 days, 20:19:12< 4:42:37] +[titan] 2025-09-10 19:56:13,783 - root - INFO - step: 37425 loss: 2.6458 memory: 122.03GiB(87.57%) tps: 10,142 tflops: 483.36 mfu: 48.87% global_avg_ntp_loss: 0.7257 global_avg_top_loss: 1.9201 +[titan] 2025-09-10 19:56:13,783 - root - INFO - lr: 2.1871e-06 gnorm: 0.69 [2 days, 20:19:44< 4:42:04] +[titan] 2025-09-10 19:56:45,846 - root - INFO - step: 37430 loss: 2.5837 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.6992 global_avg_top_loss: 1.8845 +[titan] 
2025-09-10 19:56:45,846 - root - INFO - lr: 2.1864e-06 gnorm: 0.64 [2 days, 20:20:16< 4:41:31] +[titan] 2025-09-10 19:57:18,070 - root - INFO - step: 37435 loss: 2.4946 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.64 mfu: 49.00% global_avg_ntp_loss: 0.6543 global_avg_top_loss: 1.8403 +[titan] 2025-09-10 19:57:18,070 - root - INFO - lr: 2.1857e-06 gnorm: 0.63 [2 days, 20:20:49< 4:40:58] +[titan] 2025-09-10 19:57:50,401 - root - INFO - step: 37440 loss: 2.3865 memory: 122.03GiB(87.57%) tps: 10,135 tflops: 483.03 mfu: 48.84% global_avg_ntp_loss: 0.6037 global_avg_top_loss: 1.7828 +[titan] 2025-09-10 19:57:50,401 - root - INFO - lr: 2.1850e-06 gnorm: 0.59 [2 days, 20:21:21< 4:40:26] +[titan] 2025-09-10 19:58:22,472 - root - INFO - step: 37445 loss: 2.4842 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.6468 global_avg_top_loss: 1.8374 +[titan] 2025-09-10 19:58:22,472 - root - INFO - lr: 2.1842e-06 gnorm: 0.70 [2 days, 20:21:53< 4:39:53] +[titan] 2025-09-10 19:58:48,140 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 19:58:54,650 - root - INFO - step: 37450 loss: 2.5407 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.33 mfu: 49.07% global_avg_ntp_loss: 0.6775 global_avg_top_loss: 1.8632 +[titan] 2025-09-10 19:58:54,650 - root - INFO - lr: 2.1835e-06 gnorm: 0.59 [2 days, 20:22:25< 4:39:20] +[titan] 2025-09-10 19:59:26,790 - root - INFO - step: 37455 loss: 2.5301 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.92 mfu: 49.13% global_avg_ntp_loss: 0.6712 global_avg_top_loss: 1.8589 +[titan] 2025-09-10 19:59:26,790 - root - INFO - lr: 2.1828e-06 gnorm: 0.68 [2 days, 20:22:57< 4:38:47] +[titan] 2025-09-10 19:59:58,836 - root - INFO - step: 37460 loss: 2.5726 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.6914 global_avg_top_loss: 1.8812 +[titan] 2025-09-10 19:59:58,837 - root - INFO - lr: 2.1821e-06 gnorm: 0.77 [2 days, 20:23:29< 4:38:14] +[titan] 2025-09-10 20:00:31,039 - root - INFO - step: 37465 loss: 2.3591 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.97 mfu: 49.04% global_avg_ntp_loss: 0.5918 global_avg_top_loss: 1.7673 +[titan] 2025-09-10 20:00:31,039 - root - INFO - lr: 2.1814e-06 gnorm: 0.56 [2 days, 20:24:02< 4:37:41] +[titan] 2025-09-10 20:01:03,138 - root - INFO - step: 37470 loss: 2.3133 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.5680 global_avg_top_loss: 1.7454 +[titan] 2025-09-10 20:01:03,138 - root - INFO - lr: 2.1807e-06 gnorm: 0.64 [2 days, 20:24:34< 4:37:08] +[titan] 2025-09-10 20:01:35,226 - root - INFO - step: 37475 loss: 2.3022 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.5580 global_avg_top_loss: 1.7442 +[titan] 2025-09-10 20:01:35,226 - root - INFO - lr: 2.1800e-06 gnorm: 0.76 [2 days, 20:25:06< 4:36:35] +[titan] 2025-09-10 20:02:07,246 - root - INFO - step: 37480 loss: 2.4289 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.6257 global_avg_top_loss: 1.8032 +[titan] 
2025-09-10 20:02:07,247 - root - INFO - lr: 2.1792e-06 gnorm: 0.58 [2 days, 20:25:38< 4:36:02] +[titan] 2025-09-10 20:02:39,349 - root - INFO - step: 37485 loss: 2.4171 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.6136 global_avg_top_loss: 1.8035 +[titan] 2025-09-10 20:02:39,349 - root - INFO - lr: 2.1785e-06 gnorm: 0.63 [2 days, 20:26:10< 4:35:29] +[titan] 2025-09-10 20:03:11,455 - root - INFO - step: 37490 loss: 2.5265 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.6674 global_avg_top_loss: 1.8591 +[titan] 2025-09-10 20:03:11,456 - root - INFO - lr: 2.1778e-06 gnorm: 0.67 [2 days, 20:26:42< 4:34:56] +[titan] 2025-09-10 20:03:43,568 - root - INFO - step: 37495 loss: 2.4428 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6298 global_avg_top_loss: 1.8130 +[titan] 2025-09-10 20:03:43,569 - root - INFO - lr: 2.1771e-06 gnorm: 0.62 [2 days, 20:27:14< 4:34:24] +[titan] 2025-09-10 20:04:09,185 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:04:15,750 - root - INFO - step: 37500 loss: 2.5633 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 0.6856 global_avg_top_loss: 1.8777 +[titan] 2025-09-10 20:04:15,750 - root - INFO - lr: 2.1764e-06 gnorm: 0.64 [2 days, 20:27:46< 4:33:51] +[titan] 2025-09-10 20:04:47,816 - root - INFO - step: 37505 loss: 2.6352 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7234 global_avg_top_loss: 1.9118 +[titan] 2025-09-10 20:04:47,816 - root - INFO - lr: 2.1757e-06 gnorm: 0.65 [2 days, 20:28:18< 4:33:18] +[titan] 2025-09-10 20:05:20,004 - root - INFO - step: 37510 loss: 2.5820 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.18 mfu: 49.06% global_avg_ntp_loss: 0.7020 global_avg_top_loss: 1.8800 +[titan] 2025-09-10 20:05:20,005 - root - INFO - lr: 2.1750e-06 gnorm: 0.71 [2 days, 20:28:51< 4:32:45] +[titan] 2025-09-10 20:05:52,084 - root - INFO - step: 37515 loss: 2.4661 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.6423 global_avg_top_loss: 1.8238 +[titan] 2025-09-10 20:05:52,084 - root - INFO - lr: 2.1743e-06 gnorm: 0.63 [2 days, 20:29:23< 4:32:12] +[titan] 2025-09-10 20:06:24,104 - root - INFO - step: 37520 loss: 2.4727 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.6464 global_avg_top_loss: 1.8263 +[titan] 2025-09-10 20:06:24,104 - root - INFO - lr: 2.1736e-06 gnorm: 0.59 [2 days, 20:29:55< 4:31:39] +[titan] 2025-09-10 20:06:56,267 - root - INFO - step: 37525 loss: 2.5053 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.6568 global_avg_top_loss: 1.8485 +[titan] 2025-09-10 20:06:56,267 - root - INFO - lr: 2.1729e-06 gnorm: 0.70 [2 days, 20:30:27< 4:31:06] +[titan] 2025-09-10 20:07:28,472 - root - INFO - step: 37530 loss: 2.5772 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.6963 global_avg_top_loss: 1.8809 +[titan] 
2025-09-10 20:07:28,472 - root - INFO - lr: 2.1722e-06 gnorm: 0.59 [2 days, 20:30:59< 4:30:33] +[titan] 2025-09-10 20:08:00,869 - root - INFO - step: 37535 loss: 2.4986 memory: 122.03GiB(87.57%) tps: 10,115 tflops: 482.05 mfu: 48.74% global_avg_ntp_loss: 0.6567 global_avg_top_loss: 1.8419 +[titan] 2025-09-10 20:08:00,870 - root - INFO - lr: 2.1715e-06 gnorm: 0.71 [2 days, 20:31:31< 4:30:00] +[titan] 2025-09-10 20:08:32,952 - root - INFO - step: 37540 loss: 2.5346 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.6740 global_avg_top_loss: 1.8606 +[titan] 2025-09-10 20:08:32,953 - root - INFO - lr: 2.1708e-06 gnorm: 0.74 [2 days, 20:32:03< 4:29:27] +[titan] 2025-09-10 20:09:05,315 - root - INFO - step: 37545 loss: 2.3214 memory: 122.03GiB(87.57%) tps: 10,126 tflops: 482.58 mfu: 48.79% global_avg_ntp_loss: 0.5768 global_avg_top_loss: 1.7446 +[titan] 2025-09-10 20:09:05,315 - root - INFO - lr: 2.1701e-06 gnorm: 0.56 [2 days, 20:32:36< 4:28:54] +[titan] 2025-09-10 20:09:30,871 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:09:37,237 - root - INFO - step: 37550 loss: 2.3710 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.5939 global_avg_top_loss: 1.7771 +[titan] 2025-09-10 20:09:37,237 - root - INFO - lr: 2.1695e-06 gnorm: 0.67 [2 days, 20:33:08< 4:28:22] +[titan] 2025-09-10 20:10:09,451 - root - INFO - step: 37555 loss: 2.3685 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.80 mfu: 49.02% global_avg_ntp_loss: 0.5931 global_avg_top_loss: 1.7754 +[titan] 2025-09-10 20:10:09,451 - root - INFO - lr: 2.1688e-06 gnorm: 0.77 [2 days, 20:33:40< 4:27:49] +[titan] 2025-09-10 20:10:41,566 - root - INFO - step: 37560 loss: 2.4486 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.29 mfu: 49.17% global_avg_ntp_loss: 0.6422 global_avg_top_loss: 1.8065 +[titan] 2025-09-10 20:10:41,566 - root - INFO - lr: 2.1681e-06 gnorm: 0.59 [2 days, 20:34:12< 4:27:16] +[titan] 2025-09-10 20:11:13,688 - root - INFO - step: 37565 loss: 2.4014 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.6096 global_avg_top_loss: 1.7918 +[titan] 2025-09-10 20:11:13,688 - root - INFO - lr: 2.1674e-06 gnorm: 0.63 [2 days, 20:34:44< 4:26:43] +[titan] 2025-09-10 20:11:45,718 - root - INFO - step: 37570 loss: 2.4556 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.6375 global_avg_top_loss: 1.8181 +[titan] 2025-09-10 20:11:45,718 - root - INFO - lr: 2.1667e-06 gnorm: 0.62 [2 days, 20:35:16< 4:26:10] +[titan] 2025-09-10 20:12:17,854 - root - INFO - step: 37575 loss: 2.3525 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.5895 global_avg_top_loss: 1.7630 +[titan] 2025-09-10 20:12:17,854 - root - INFO - lr: 2.1660e-06 gnorm: 0.66 [2 days, 20:35:48< 4:25:37] +[titan] 2025-09-10 20:12:49,903 - root - INFO - step: 37580 loss: 2.5102 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.30 mfu: 49.27% global_avg_ntp_loss: 0.6608 global_avg_top_loss: 1.8494 +[titan] 
2025-09-10 20:12:49,903 - root - INFO - lr: 2.1653e-06 gnorm: 0.65 [2 days, 20:36:20< 4:25:04] +[titan] 2025-09-10 20:13:22,124 - root - INFO - step: 37585 loss: 2.6003 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.69 mfu: 49.01% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8963 +[titan] 2025-09-10 20:13:22,124 - root - INFO - lr: 2.1647e-06 gnorm: 0.65 [2 days, 20:36:53< 4:24:31] +[titan] 2025-09-10 20:13:54,165 - root - INFO - step: 37590 loss: 2.5207 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.6755 global_avg_top_loss: 1.8452 +[titan] 2025-09-10 20:13:54,165 - root - INFO - lr: 2.1640e-06 gnorm: 0.66 [2 days, 20:37:25< 4:23:58] +[titan] 2025-09-10 20:14:26,270 - root - INFO - step: 37595 loss: 2.5328 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.18% global_avg_ntp_loss: 0.6722 global_avg_top_loss: 1.8606 +[titan] 2025-09-10 20:14:26,270 - root - INFO - lr: 2.1633e-06 gnorm: 0.64 [2 days, 20:37:57< 4:23:25] +[titan] 2025-09-10 20:14:51,937 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:14:58,296 - root - INFO - step: 37600 loss: 2.4865 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.6480 global_avg_top_loss: 1.8385 +[titan] 2025-09-10 20:14:58,296 - root - INFO - lr: 2.1626e-06 gnorm: 0.59 [2 days, 20:38:29< 4:22:52] +[titan] 2025-09-10 20:15:30,495 - root - INFO - step: 37605 loss: 2.4462 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.6330 global_avg_top_loss: 1.8132 +[titan] 2025-09-10 20:15:30,495 - root - INFO - lr: 2.1620e-06 gnorm: 0.71 [2 days, 20:39:01< 4:22:20] +[titan] 2025-09-10 20:16:02,590 - root - INFO - step: 37610 loss: 2.6372 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7203 global_avg_top_loss: 1.9169 +[titan] 2025-09-10 20:16:02,590 - root - INFO - lr: 2.1613e-06 gnorm: 0.62 [2 days, 20:39:33< 4:21:47] +[titan] 2025-09-10 20:16:34,791 - root - INFO - step: 37615 loss: 2.5438 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.99 mfu: 49.04% global_avg_ntp_loss: 0.6770 global_avg_top_loss: 1.8668 +[titan] 2025-09-10 20:16:34,791 - root - INFO - lr: 2.1606e-06 gnorm: 0.73 [2 days, 20:40:05< 4:21:14] +[titan] 2025-09-10 20:17:06,873 - root - INFO - step: 37620 loss: 2.6021 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7035 global_avg_top_loss: 1.8986 +[titan] 2025-09-10 20:17:06,873 - root - INFO - lr: 2.1599e-06 gnorm: 0.82 [2 days, 20:40:37< 4:20:41] +[titan] 2025-09-10 20:17:38,857 - root - INFO - step: 37625 loss: 2.3048 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.5754 global_avg_top_loss: 1.7294 +[titan] 2025-09-10 20:17:38,857 - root - INFO - lr: 2.1593e-06 gnorm: 0.57 [2 days, 20:41:09< 4:20:08] +[titan] 2025-09-10 20:18:11,026 - root - INFO - step: 37630 loss: 2.3402 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.5800 global_avg_top_loss: 1.7602 +[titan] 
2025-09-10 20:18:11,026 - root - INFO - lr: 2.1586e-06 gnorm: 0.70 [2 days, 20:41:42< 4:19:35] +[titan] 2025-09-10 20:18:43,123 - root - INFO - step: 37635 loss: 2.3052 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.5590 global_avg_top_loss: 1.7463 +[titan] 2025-09-10 20:18:43,124 - root - INFO - lr: 2.1579e-06 gnorm: 0.74 [2 days, 20:42:14< 4:19:02] +[titan] 2025-09-10 20:19:15,207 - root - INFO - step: 37640 loss: 2.3864 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.6024 global_avg_top_loss: 1.7840 +[titan] 2025-09-10 20:19:15,207 - root - INFO - lr: 2.1573e-06 gnorm: 0.61 [2 days, 20:42:46< 4:18:29] +[titan] 2025-09-10 20:19:47,444 - root - INFO - step: 37645 loss: 2.3908 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.44 mfu: 48.98% global_avg_ntp_loss: 0.6059 global_avg_top_loss: 1.7848 +[titan] 2025-09-10 20:19:47,445 - root - INFO - lr: 2.1566e-06 gnorm: 0.61 [2 days, 20:43:18< 4:17:56] +[titan] 2025-09-10 20:20:13,055 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:20:19,491 - root - INFO - step: 37650 loss: 2.4464 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.6363 global_avg_top_loss: 1.8101 +[titan] 2025-09-10 20:20:19,491 - root - INFO - lr: 2.1559e-06 gnorm: 0.65 [2 days, 20:43:50< 4:17:23] +[titan] 2025-09-10 20:20:51,564 - root - INFO - step: 37655 loss: 2.4136 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.6156 global_avg_top_loss: 1.7981 +[titan] 2025-09-10 20:20:51,565 - root - INFO - lr: 2.1553e-06 gnorm: 0.63 [2 days, 20:44:22< 4:16:50] +[titan] 2025-09-10 20:21:23,573 - root - INFO - step: 37660 loss: 2.4910 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.6525 global_avg_top_loss: 1.8384 +[titan] 2025-09-10 20:21:23,574 - root - INFO - lr: 2.1546e-06 gnorm: 0.61 [2 days, 20:44:54< 4:16:18] +[titan] 2025-09-10 20:21:55,578 - root - INFO - step: 37665 loss: 2.5458 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.6823 global_avg_top_loss: 1.8634 +[titan] 2025-09-10 20:21:55,579 - root - INFO - lr: 2.1540e-06 gnorm: 0.70 [2 days, 20:45:26< 4:15:45] +[titan] 2025-09-10 20:22:27,543 - root - INFO - step: 37670 loss: 2.6375 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7243 global_avg_top_loss: 1.9132 +[titan] 2025-09-10 20:22:27,543 - root - INFO - lr: 2.1533e-06 gnorm: 0.97 [2 days, 20:45:58< 4:15:12] +[titan] 2025-09-10 20:22:59,813 - root - INFO - step: 37675 loss: 2.4679 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.96 mfu: 48.93% global_avg_ntp_loss: 0.6386 global_avg_top_loss: 1.8293 +[titan] 2025-09-10 20:22:59,813 - root - INFO - lr: 2.1527e-06 gnorm: 0.67 [2 days, 20:46:30< 4:14:39] +[titan] 2025-09-10 20:23:31,956 - root - INFO - step: 37680 loss: 2.4225 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.85 mfu: 49.13% global_avg_ntp_loss: 0.6195 global_avg_top_loss: 1.8030 +[titan] 
2025-09-10 20:23:31,957 - root - INFO - lr: 2.1520e-06 gnorm: 0.63 [2 days, 20:47:02< 4:14:06] +[titan] 2025-09-10 20:24:03,957 - root - INFO - step: 37685 loss: 2.4596 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.6358 global_avg_top_loss: 1.8238 +[titan] 2025-09-10 20:24:03,957 - root - INFO - lr: 2.1513e-06 gnorm: 0.75 [2 days, 20:47:34< 4:13:33] +[titan] 2025-09-10 20:24:36,031 - root - INFO - step: 37690 loss: 2.5943 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7020 global_avg_top_loss: 1.8923 +[titan] 2025-09-10 20:24:36,032 - root - INFO - lr: 2.1507e-06 gnorm: 0.59 [2 days, 20:48:07< 4:13:00] +[titan] 2025-09-10 20:25:08,154 - root - INFO - step: 37695 loss: 2.4815 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 0.6485 global_avg_top_loss: 1.8330 +[titan] 2025-09-10 20:25:08,154 - root - INFO - lr: 2.1500e-06 gnorm: 0.75 [2 days, 20:48:39< 4:12:27] +[titan] 2025-09-10 20:25:33,849 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:25:40,362 - root - INFO - step: 37700 loss: 2.5745 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.88 mfu: 49.03% global_avg_ntp_loss: 0.6920 global_avg_top_loss: 1.8824 +[titan] 2025-09-10 20:25:40,362 - root - INFO - lr: 2.1494e-06 gnorm: 0.75 [2 days, 20:49:11< 4:11:54] +[titan] 2025-09-10 20:26:12,411 - root - INFO - step: 37705 loss: 2.3561 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.5931 global_avg_top_loss: 1.7631 +[titan] 2025-09-10 20:26:12,411 - root - INFO - lr: 2.1488e-06 gnorm: 0.57 [2 days, 20:49:43< 4:11:21] +[titan] 2025-09-10 20:26:44,454 - root - INFO - step: 37710 loss: 2.4569 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.6292 global_avg_top_loss: 1.8277 +[titan] 2025-09-10 20:26:44,455 - root - INFO - lr: 2.1481e-06 gnorm: 0.64 [2 days, 20:50:15< 4:10:48] +[titan] 2025-09-10 20:27:16,687 - root - INFO - step: 37715 loss: 2.2849 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.52 mfu: 48.99% global_avg_ntp_loss: 0.5533 global_avg_top_loss: 1.7316 +[titan] 2025-09-10 20:27:16,687 - root - INFO - lr: 2.1475e-06 gnorm: 0.77 [2 days, 20:50:47< 4:10:16] +[titan] 2025-09-10 20:27:48,938 - root - INFO - step: 37720 loss: 2.8788 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.24 mfu: 48.96% global_avg_ntp_loss: 0.8821 global_avg_top_loss: 1.9968 +[titan] 2025-09-10 20:27:48,938 - root - INFO - lr: 2.1468e-06 gnorm: 0.57 [2 days, 20:51:19< 4:09:43] +[titan] 2025-09-10 20:28:20,835 - root - INFO - step: 37725 loss: 2.3623 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.5912 global_avg_top_loss: 1.7711 +[titan] 2025-09-10 20:28:20,835 - root - INFO - lr: 2.1462e-06 gnorm: 0.63 [2 days, 20:51:51< 4:09:10] +[titan] 2025-09-10 20:28:52,730 - root - INFO - step: 37730 loss: 2.5277 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.6658 global_avg_top_loss: 1.8619 +[titan] 
2025-09-10 20:28:52,730 - root - INFO - lr: 2.1455e-06 gnorm: 0.66 [2 days, 20:52:23< 4:08:37] +[titan] 2025-09-10 20:29:24,776 - root - INFO - step: 37735 loss: 2.3537 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.5912 global_avg_top_loss: 1.7625 +[titan] 2025-09-10 20:29:24,776 - root - INFO - lr: 2.1449e-06 gnorm: 0.62 [2 days, 20:52:55< 4:08:04] +[titan] 2025-09-10 20:29:57,015 - root - INFO - step: 37740 loss: 2.5527 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.41 mfu: 48.98% global_avg_ntp_loss: 0.6943 global_avg_top_loss: 1.8584 +[titan] 2025-09-10 20:29:57,015 - root - INFO - lr: 2.1443e-06 gnorm: 0.61 [2 days, 20:53:28< 4:07:31] +[titan] 2025-09-10 20:30:29,173 - root - INFO - step: 37745 loss: 2.4849 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.6521 global_avg_top_loss: 1.8328 +[titan] 2025-09-10 20:30:29,173 - root - INFO - lr: 2.1436e-06 gnorm: 0.65 [2 days, 20:54:00< 4:06:58] +[titan] 2025-09-10 20:30:54,960 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:31:01,390 - root - INFO - step: 37750 loss: 3.0359 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 0.9517 global_avg_top_loss: 2.0842 +[titan] 2025-09-10 20:31:01,390 - root - INFO - lr: 2.1430e-06 gnorm: 0.75 [2 days, 20:54:32< 4:06:25] +[titan] 2025-09-10 20:31:33,404 - root - INFO - step: 37755 loss: 2.5395 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.33% global_avg_ntp_loss: 0.6726 global_avg_top_loss: 1.8669 +[titan] 2025-09-10 20:31:33,404 - root - INFO - lr: 2.1424e-06 gnorm: 0.64 [2 days, 20:55:04< 4:05:52] +[titan] 2025-09-10 20:32:05,686 - root - INFO - step: 37760 loss: 2.6151 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.77 mfu: 48.92% global_avg_ntp_loss: 0.7118 global_avg_top_loss: 1.9033 +[titan] 2025-09-10 20:32:05,686 - root - INFO - lr: 2.1417e-06 gnorm: 0.63 [2 days, 20:55:36< 4:05:19] +[titan] 2025-09-10 20:32:37,927 - root - INFO - step: 37765 loss: 2.5436 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.39 mfu: 48.98% global_avg_ntp_loss: 0.6728 global_avg_top_loss: 1.8708 +[titan] 2025-09-10 20:32:37,927 - root - INFO - lr: 2.1411e-06 gnorm: 0.67 [2 days, 20:56:08< 4:04:47] +[titan] 2025-09-10 20:33:10,065 - root - INFO - step: 37770 loss: 3.0774 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.94 mfu: 49.13% global_avg_ntp_loss: 0.9820 global_avg_top_loss: 2.0954 +[titan] 2025-09-10 20:33:10,066 - root - INFO - lr: 2.1405e-06 gnorm: 0.57 [2 days, 20:56:41< 4:04:14] +[titan] 2025-09-10 20:33:42,218 - root - INFO - step: 37775 loss: 2.4685 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.72 mfu: 49.11% global_avg_ntp_loss: 0.6434 global_avg_top_loss: 1.8252 +[titan] 2025-09-10 20:33:42,218 - root - INFO - lr: 2.1398e-06 gnorm: 0.71 [2 days, 20:57:13< 4:03:41] +[titan] 2025-09-10 20:34:14,216 - root - INFO - step: 37780 loss: 2.5617 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.6884 global_avg_top_loss: 1.8733 +[titan] 
2025-09-10 20:34:14,216 - root - INFO - lr: 2.1392e-06 gnorm: 0.74 [2 days, 20:57:45< 4:03:08] +[titan] 2025-09-10 20:34:46,241 - root - INFO - step: 37785 loss: 2.3907 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.6076 global_avg_top_loss: 1.7831 +[titan] 2025-09-10 20:34:46,241 - root - INFO - lr: 2.1386e-06 gnorm: 0.58 [2 days, 20:58:17< 4:02:35] +[titan] 2025-09-10 20:35:18,445 - root - INFO - step: 37790 loss: 2.3268 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.5758 global_avg_top_loss: 1.7510 +[titan] 2025-09-10 20:35:18,446 - root - INFO - lr: 2.1380e-06 gnorm: 0.74 [2 days, 20:58:49< 4:02:02] +[titan] 2025-09-10 20:35:50,609 - root - INFO - step: 37795 loss: 2.2894 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.09% global_avg_ntp_loss: 0.5550 global_avg_top_loss: 1.7344 +[titan] 2025-09-10 20:35:50,610 - root - INFO - lr: 2.1373e-06 gnorm: 0.78 [2 days, 20:59:21< 4:01:29] +[titan] 2025-09-10 20:36:16,321 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:36:22,809 - root - INFO - step: 37800 loss: 2.4934 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.6531 global_avg_top_loss: 1.8404 +[titan] 2025-09-10 20:36:22,809 - root - INFO - lr: 2.1367e-06 gnorm: 0.61 [2 days, 20:59:53< 4:00:56] +[titan] 2025-09-10 20:36:54,892 - root - INFO - step: 37805 loss: 2.3705 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.5969 global_avg_top_loss: 1.7735 +[titan] 2025-09-10 20:36:54,892 - root - INFO - lr: 2.1361e-06 gnorm: 0.67 [2 days, 21:00:25< 4:00:23] +[titan] 2025-09-10 20:37:26,908 - root - INFO - step: 37810 loss: 2.4935 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.6497 global_avg_top_loss: 1.8437 +[titan] 2025-09-10 20:37:26,909 - root - INFO - lr: 2.1355e-06 gnorm: 0.65 [2 days, 21:00:57< 3:59:50] +[titan] 2025-09-10 20:37:59,021 - root - INFO - step: 37815 loss: 2.4521 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6314 global_avg_top_loss: 1.8208 +[titan] 2025-09-10 20:37:59,022 - root - INFO - lr: 2.1349e-06 gnorm: 0.64 [2 days, 21:01:30< 3:59:18] +[titan] 2025-09-10 20:38:31,072 - root - INFO - step: 37820 loss: 2.5244 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.6655 global_avg_top_loss: 1.8589 +[titan] 2025-09-10 20:38:31,072 - root - INFO - lr: 2.1343e-06 gnorm: 0.68 [2 days, 21:02:02< 3:58:45] +[titan] 2025-09-10 20:39:03,130 - root - INFO - step: 37825 loss: 2.5944 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.16 mfu: 49.26% global_avg_ntp_loss: 0.6994 global_avg_top_loss: 1.8950 +[titan] 2025-09-10 20:39:03,130 - root - INFO - lr: 2.1336e-06 gnorm: 0.65 [2 days, 21:02:34< 3:58:12] +[titan] 2025-09-10 20:39:35,222 - root - INFO - step: 37830 loss: 2.5924 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7094 global_avg_top_loss: 1.8830 +[titan] 
2025-09-10 20:39:35,222 - root - INFO - lr: 2.1330e-06 gnorm: 0.68 [2 days, 21:03:06< 3:57:39] +[titan] 2025-09-10 20:40:07,391 - root - INFO - step: 37835 loss: 2.5696 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.6911 global_avg_top_loss: 1.8785 +[titan] 2025-09-10 20:40:07,391 - root - INFO - lr: 2.1324e-06 gnorm: 0.70 [2 days, 21:03:38< 3:57:06] +[titan] 2025-09-10 20:40:39,487 - root - INFO - step: 37840 loss: 2.4590 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.58 mfu: 49.20% global_avg_ntp_loss: 0.6397 global_avg_top_loss: 1.8193 +[titan] 2025-09-10 20:40:39,487 - root - INFO - lr: 2.1318e-06 gnorm: 0.61 [2 days, 21:04:10< 3:56:33] +[titan] 2025-09-10 20:41:11,542 - root - INFO - step: 37845 loss: 2.4678 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.6416 global_avg_top_loss: 1.8262 +[titan] 2025-09-10 20:41:11,542 - root - INFO - lr: 2.1312e-06 gnorm: 0.70 [2 days, 21:04:42< 3:56:00] +[titan] 2025-09-10 20:41:37,340 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:41:43,766 - root - INFO - step: 37850 loss: 2.5156 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.64 mfu: 49.00% global_avg_ntp_loss: 0.6623 global_avg_top_loss: 1.8533 +[titan] 2025-09-10 20:41:43,767 - root - INFO - lr: 2.1306e-06 gnorm: 0.60 [2 days, 21:05:14< 3:55:27] +[titan] 2025-09-10 20:42:15,772 - root - INFO - step: 37855 loss: 2.5097 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.6639 global_avg_top_loss: 1.8458 +[titan] 2025-09-10 20:42:15,772 - root - INFO - lr: 2.1300e-06 gnorm: 0.69 [2 days, 21:05:46< 3:54:54] +[titan] 2025-09-10 20:42:47,875 - root - INFO - step: 37860 loss: 2.5487 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.6814 global_avg_top_loss: 1.8673 +[titan] 2025-09-10 20:42:47,875 - root - INFO - lr: 2.1294e-06 gnorm: 0.81 [2 days, 21:06:18< 3:54:21] +[titan] 2025-09-10 20:43:19,917 - root - INFO - step: 37865 loss: 2.3682 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.5948 global_avg_top_loss: 1.7734 +[titan] 2025-09-10 20:43:19,917 - root - INFO - lr: 2.1288e-06 gnorm: 0.58 [2 days, 21:06:50< 3:53:49] +[titan] 2025-09-10 20:43:51,964 - root - INFO - step: 37870 loss: 2.3396 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.5807 global_avg_top_loss: 1.7589 +[titan] 2025-09-10 20:43:51,964 - root - INFO - lr: 2.1282e-06 gnorm: 0.71 [2 days, 21:07:22< 3:53:16] +[titan] 2025-09-10 20:44:24,006 - root - INFO - step: 37875 loss: 2.3438 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.5797 global_avg_top_loss: 1.7641 +[titan] 2025-09-10 20:44:24,006 - root - INFO - lr: 2.1276e-06 gnorm: 0.93 [2 days, 21:07:55< 3:52:43] +[titan] 2025-09-10 20:44:56,074 - root - INFO - step: 37880 loss: 2.3728 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.5957 global_avg_top_loss: 1.7772 +[titan] 
2025-09-10 20:44:56,074 - root - INFO - lr: 2.1270e-06 gnorm: 0.62 [2 days, 21:08:27< 3:52:10] +[titan] 2025-09-10 20:45:27,954 - root - INFO - step: 37885 loss: 2.4314 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.6254 global_avg_top_loss: 1.8060 +[titan] 2025-09-10 20:45:27,954 - root - INFO - lr: 2.1264e-06 gnorm: 0.64 [2 days, 21:08:58< 3:51:37] +[titan] 2025-09-10 20:45:47,470 - root - INFO - Dumping profiler traces at step 37888 +[titan] 2025-09-10 20:45:47,543 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 20:46:00,398 - root - INFO - step: 37890 loss: 2.5158 memory: 122.03GiB(87.57%) tps: 10,100 tflops: 481.36 mfu: 48.67% global_avg_ntp_loss: 0.6645 global_avg_top_loss: 1.8513 +[titan] 2025-09-10 20:46:00,398 - root - INFO - lr: 2.1258e-06 gnorm: 0.69 [2 days, 21:09:31< 3:51:04] +[titan] 2025-09-10 20:46:32,323 - root - INFO - step: 37895 loss: 2.3548 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.5883 global_avg_top_loss: 1.7665 +[titan] 2025-09-10 20:46:32,323 - root - INFO - lr: 2.1252e-06 gnorm: 0.63 [2 days, 21:10:03< 3:50:31] +[titan] 2025-09-10 20:46:57,843 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:47:04,273 - root - INFO - step: 37900 loss: 2.6504 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7233 global_avg_top_loss: 1.9271 +[titan] 2025-09-10 20:47:04,273 - root - INFO - lr: 2.1246e-06 gnorm: 0.66 [2 days, 21:10:35< 3:49:58] +[titan] 2025-09-10 20:47:36,358 - root - INFO - step: 37905 loss: 2.5982 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.22% global_avg_ntp_loss: 0.7036 global_avg_top_loss: 1.8946 +[titan] 2025-09-10 20:47:36,358 - root - INFO - lr: 2.1240e-06 gnorm: 0.64 [2 days, 21:11:07< 3:49:25] +[titan] 2025-09-10 20:48:08,233 - root - INFO - step: 37910 loss: 2.9830 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.9279 global_avg_top_loss: 2.0550 +[titan] 2025-09-10 20:48:08,233 - root - INFO - lr: 2.1234e-06 gnorm: 0.65 [2 days, 21:11:39< 3:48:52] +[titan] 2025-09-10 20:48:40,143 - root - INFO - step: 37915 loss: 2.4864 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.6484 global_avg_top_loss: 1.8379 +[titan] 2025-09-10 20:48:40,143 - root - INFO - lr: 2.1228e-06 gnorm: 0.66 [2 days, 21:12:11< 3:48:20] +[titan] 2025-09-10 20:49:12,157 - root - INFO - step: 37920 loss: 2.5383 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.6741 global_avg_top_loss: 1.8642 +[titan] 2025-09-10 20:49:12,158 - root - INFO - lr: 2.1222e-06 gnorm: 0.61 [2 days, 21:12:43< 3:47:47] +[titan] 2025-09-10 20:49:44,158 - root - INFO - step: 37925 loss: 2.5294 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.6705 global_avg_top_loss: 1.8589 +[titan] 2025-09-10 20:49:44,158 - root - INFO - lr: 2.1217e-06 gnorm: 0.72 [2 days, 21:13:15< 3:47:14] +[titan] 2025-09-10 20:50:16,006 - root - INFO - step: 37930 loss: 2.5282 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.6723 global_avg_top_loss: 1.8560 +[titan] 
2025-09-10 20:50:16,006 - root - INFO - lr: 2.1211e-06 gnorm: 0.58 [2 days, 21:13:47< 3:46:41] +[titan] 2025-09-10 20:50:48,109 - root - INFO - step: 37935 loss: 2.5366 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.6769 global_avg_top_loss: 1.8597 +[titan] 2025-09-10 20:50:48,109 - root - INFO - lr: 2.1205e-06 gnorm: 0.80 [2 days, 21:14:19< 3:46:08] +[titan] 2025-09-10 20:51:20,162 - root - INFO - step: 37940 loss: 2.5370 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.6764 global_avg_top_loss: 1.8606 +[titan] 2025-09-10 20:51:20,163 - root - INFO - lr: 2.1199e-06 gnorm: 0.72 [2 days, 21:14:51< 3:45:35] +[titan] 2025-09-10 20:51:52,153 - root - INFO - step: 37945 loss: 2.4236 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.6198 global_avg_top_loss: 1.8038 +[titan] 2025-09-10 20:51:52,153 - root - INFO - lr: 2.1193e-06 gnorm: 0.59 [2 days, 21:15:23< 3:45:02] +[titan] 2025-09-10 20:52:17,699 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:52:24,035 - root - INFO - step: 37950 loss: 2.2624 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.85 mfu: 49.53% global_avg_ntp_loss: 0.5424 global_avg_top_loss: 1.7200 +[titan] 2025-09-10 20:52:24,035 - root - INFO - lr: 2.1188e-06 gnorm: 0.67 [2 days, 21:15:55< 3:44:29] +[titan] 2025-09-10 20:52:56,105 - root - INFO - step: 37955 loss: 2.3707 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.5940 global_avg_top_loss: 1.7767 +[titan] 2025-09-10 20:52:56,106 - root - INFO - lr: 2.1182e-06 gnorm: 0.76 [2 days, 21:16:27< 3:43:56] +[titan] 2025-09-10 20:53:28,225 - root - INFO - step: 37960 loss: 2.5264 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.21 mfu: 49.16% global_avg_ntp_loss: 0.6686 global_avg_top_loss: 1.8577 +[titan] 2025-09-10 20:53:28,226 - root - INFO - lr: 2.1176e-06 gnorm: 0.59 [2 days, 21:16:59< 3:43:23] +[titan] 2025-09-10 20:54:00,274 - root - INFO - step: 37965 loss: 2.4272 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.30 mfu: 49.27% global_avg_ntp_loss: 0.6236 global_avg_top_loss: 1.8036 +[titan] 2025-09-10 20:54:00,274 - root - INFO - lr: 2.1170e-06 gnorm: 0.65 [2 days, 21:17:31< 3:42:51] +[titan] 2025-09-10 20:54:32,244 - root - INFO - step: 37970 loss: 2.5158 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.6619 global_avg_top_loss: 1.8539 +[titan] 2025-09-10 20:54:32,244 - root - INFO - lr: 2.1165e-06 gnorm: 0.65 [2 days, 21:18:03< 3:42:18] +[titan] 2025-09-10 20:55:04,314 - root - INFO - step: 37975 loss: 2.3332 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.5807 global_avg_top_loss: 1.7525 +[titan] 2025-09-10 20:55:04,314 - root - INFO - lr: 2.1159e-06 gnorm: 0.62 [2 days, 21:18:35< 3:41:45] +[titan] 2025-09-10 20:55:36,485 - root - INFO - step: 37980 loss: 2.5431 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.44 mfu: 49.08% global_avg_ntp_loss: 0.6780 global_avg_top_loss: 1.8651 +[titan] 
2025-09-10 20:55:36,485 - root - INFO - lr: 2.1153e-06 gnorm: 0.73 [2 days, 21:19:07< 3:41:12] +[titan] 2025-09-10 20:56:08,575 - root - INFO - step: 37985 loss: 2.5639 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.6897 global_avg_top_loss: 1.8743 +[titan] 2025-09-10 20:56:08,575 - root - INFO - lr: 2.1147e-06 gnorm: 0.63 [2 days, 21:19:39< 3:40:39] +[titan] 2025-09-10 20:56:40,574 - root - INFO - step: 37990 loss: 3.1801 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 1.0295 global_avg_top_loss: 2.1506 +[titan] 2025-09-10 20:56:40,575 - root - INFO - lr: 2.1142e-06 gnorm: 1.03 [2 days, 21:20:11< 3:40:06] +[titan] 2025-09-10 20:57:12,584 - root - INFO - step: 37995 loss: 2.4968 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.6512 global_avg_top_loss: 1.8457 +[titan] 2025-09-10 20:57:12,584 - root - INFO - lr: 2.1136e-06 gnorm: 0.65 [2 days, 21:20:43< 3:39:33] +[titan] 2025-09-10 20:57:38,085 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 20:57:44,637 - root - INFO - step: 38000 loss: 2.4605 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.27% global_avg_ntp_loss: 0.6385 global_avg_top_loss: 1.8220 +[titan] 2025-09-10 20:57:44,637 - root - INFO - lr: 2.1130e-06 gnorm: 0.64 [2 days, 21:21:15< 3:39:00] +[titan] 2025-09-10 20:58:16,565 - root - INFO - step: 38005 loss: 2.4689 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.6422 global_avg_top_loss: 1.8267 +[titan] 2025-09-10 20:58:16,565 - root - INFO - lr: 2.1125e-06 gnorm: 0.71 [2 days, 21:21:47< 3:38:27] +[titan] 2025-09-10 20:58:48,626 - root - INFO - step: 38010 loss: 2.5064 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.6594 global_avg_top_loss: 1.8471 +[titan] 2025-09-10 20:58:48,626 - root - INFO - lr: 2.1119e-06 gnorm: 0.60 [2 days, 21:22:19< 3:37:55] +[titan] 2025-09-10 20:59:20,593 - root - INFO - step: 38015 loss: 2.5758 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.6891 global_avg_top_loss: 1.8866 +[titan] 2025-09-10 20:59:20,593 - root - INFO - lr: 2.1114e-06 gnorm: 0.76 [2 days, 21:22:51< 3:37:22] +[titan] 2025-09-10 20:59:52,666 - root - INFO - step: 38020 loss: 2.4924 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.6565 global_avg_top_loss: 1.8359 +[titan] 2025-09-10 20:59:52,666 - root - INFO - lr: 2.1108e-06 gnorm: 0.73 [2 days, 21:23:23< 3:36:49] +[titan] 2025-09-10 21:00:24,579 - root - INFO - step: 38025 loss: 2.3355 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.5862 global_avg_top_loss: 1.7493 +[titan] 2025-09-10 21:00:24,580 - root - INFO - lr: 2.1102e-06 gnorm: 0.60 [2 days, 21:23:55< 3:36:16] +[titan] 2025-09-10 21:00:56,742 - root - INFO - step: 38030 loss: 2.3052 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.58 mfu: 49.10% global_avg_ntp_loss: 0.5628 global_avg_top_loss: 1.7424 +[titan] 
2025-09-10 21:00:56,742 - root - INFO - lr: 2.1097e-06 gnorm: 0.71 [2 days, 21:24:27< 3:35:43] +[titan] 2025-09-10 21:01:28,756 - root - INFO - step: 38035 loss: 2.3457 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.5808 global_avg_top_loss: 1.7649 +[titan] 2025-09-10 21:01:28,756 - root - INFO - lr: 2.1091e-06 gnorm: 0.83 [2 days, 21:24:59< 3:35:10] +[titan] 2025-09-10 21:02:00,693 - root - INFO - step: 38040 loss: 2.4666 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.6373 global_avg_top_loss: 1.8293 +[titan] 2025-09-10 21:02:00,693 - root - INFO - lr: 2.1086e-06 gnorm: 0.60 [2 days, 21:25:31< 3:34:37] +[titan] 2025-09-10 21:02:32,698 - root - INFO - step: 38045 loss: 2.3205 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.5706 global_avg_top_loss: 1.7499 +[titan] 2025-09-10 21:02:32,699 - root - INFO - lr: 2.1080e-06 gnorm: 0.64 [2 days, 21:26:03< 3:34:04] +[titan] 2025-09-10 21:02:58,253 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:03:04,677 - root - INFO - step: 38050 loss: 2.3037 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.5682 global_avg_top_loss: 1.7355 +[titan] 2025-09-10 21:03:04,677 - root - INFO - lr: 2.1075e-06 gnorm: 0.64 [2 days, 21:26:35< 3:33:31] +[titan] 2025-09-10 21:03:36,733 - root - INFO - step: 38055 loss: 2.4225 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6208 global_avg_top_loss: 1.8017 +[titan] 2025-09-10 21:03:36,734 - root - INFO - lr: 2.1069e-06 gnorm: 0.64 [2 days, 21:27:07< 3:32:58] +[titan] 2025-09-10 21:04:08,813 - root - INFO - step: 38060 loss: 2.5239 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.6678 global_avg_top_loss: 1.8560 +[titan] 2025-09-10 21:04:08,814 - root - INFO - lr: 2.1064e-06 gnorm: 0.62 [2 days, 21:27:39< 3:32:26] +[titan] 2025-09-10 21:04:40,842 - root - INFO - step: 38065 loss: 2.5346 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.6757 global_avg_top_loss: 1.8589 +[titan] 2025-09-10 21:04:40,842 - root - INFO - lr: 2.1058e-06 gnorm: 0.66 [2 days, 21:28:11< 3:31:53] +[titan] 2025-09-10 21:05:12,922 - root - INFO - step: 38070 loss: 3.0577 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.9667 global_avg_top_loss: 2.0910 +[titan] 2025-09-10 21:05:12,922 - root - INFO - lr: 2.1053e-06 gnorm: 0.75 [2 days, 21:28:43< 3:31:20] +[titan] 2025-09-10 21:05:45,072 - root - INFO - step: 38075 loss: 2.4785 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.76 mfu: 49.12% global_avg_ntp_loss: 0.6463 global_avg_top_loss: 1.8322 +[titan] 2025-09-10 21:05:45,072 - root - INFO - lr: 2.1047e-06 gnorm: 0.66 [2 days, 21:29:16< 3:30:47] +[titan] 2025-09-10 21:06:17,025 - root - INFO - step: 38080 loss: 2.4247 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.6247 global_avg_top_loss: 1.8000 +[titan] 
2025-09-10 21:06:17,025 - root - INFO - lr: 2.1042e-06 gnorm: 0.65 [2 days, 21:29:48< 3:30:14] +[titan] 2025-09-10 21:06:49,057 - root - INFO - step: 38085 loss: 2.3956 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.6077 global_avg_top_loss: 1.7879 +[titan] 2025-09-10 21:06:49,057 - root - INFO - lr: 2.1037e-06 gnorm: 0.75 [2 days, 21:30:20< 3:29:41] +[titan] 2025-09-10 21:07:21,086 - root - INFO - step: 38090 loss: 2.5596 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.6855 global_avg_top_loss: 1.8741 +[titan] 2025-09-10 21:07:21,086 - root - INFO - lr: 2.1031e-06 gnorm: 0.63 [2 days, 21:30:52< 3:29:08] +[titan] 2025-09-10 21:07:53,110 - root - INFO - step: 38095 loss: 2.5274 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.6702 global_avg_top_loss: 1.8571 +[titan] 2025-09-10 21:07:53,111 - root - INFO - lr: 2.1026e-06 gnorm: 0.76 [2 days, 21:31:24< 3:28:35] +[titan] 2025-09-10 21:08:18,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:08:25,161 - root - INFO - step: 38100 loss: 2.5385 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.6768 global_avg_top_loss: 1.8617 +[titan] 2025-09-10 21:08:25,162 - root - INFO - lr: 2.1020e-06 gnorm: 0.80 [2 days, 21:31:56< 3:28:02] +[titan] 2025-09-10 21:08:57,201 - root - INFO - step: 38105 loss: 2.3765 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.6028 global_avg_top_loss: 1.7737 +[titan] 2025-09-10 21:08:57,201 - root - INFO - lr: 2.1015e-06 gnorm: 0.57 [2 days, 21:32:28< 3:27:30] +[titan] 2025-09-10 21:09:29,047 - root - INFO - step: 38110 loss: 2.3325 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.5790 global_avg_top_loss: 1.7535 +[titan] 2025-09-10 21:09:29,048 - root - INFO - lr: 2.1010e-06 gnorm: 0.74 [2 days, 21:33:00< 3:26:57] +[titan] 2025-09-10 21:10:01,420 - root - INFO - step: 38115 loss: 2.2043 memory: 122.03GiB(87.57%) tps: 10,122 tflops: 482.42 mfu: 48.78% global_avg_ntp_loss: 0.5154 global_avg_top_loss: 1.6889 +[titan] 2025-09-10 21:10:01,420 - root - INFO - lr: 2.1004e-06 gnorm: 0.73 [2 days, 21:33:32< 3:26:24] +[titan] 2025-09-10 21:10:33,402 - root - INFO - step: 38120 loss: 2.4528 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 0.6309 global_avg_top_loss: 1.8219 +[titan] 2025-09-10 21:10:33,403 - root - INFO - lr: 2.0999e-06 gnorm: 0.69 [2 days, 21:34:04< 3:25:51] +[titan] 2025-09-10 21:11:05,356 - root - INFO - step: 38125 loss: 2.3763 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.5958 global_avg_top_loss: 1.7805 +[titan] 2025-09-10 21:11:05,356 - root - INFO - lr: 2.0994e-06 gnorm: 0.69 [2 days, 21:34:36< 3:25:18] +[titan] 2025-09-10 21:11:37,287 - root - INFO - step: 38130 loss: 2.3063 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.5690 global_avg_top_loss: 1.7373 +[titan] 
2025-09-10 21:11:37,288 - root - INFO - lr: 2.0989e-06 gnorm: 0.59 [2 days, 21:35:08< 3:24:45] +[titan] 2025-09-10 21:12:09,155 - root - INFO - step: 38135 loss: 2.4380 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.6305 global_avg_top_loss: 1.8075 +[titan] 2025-09-10 21:12:09,156 - root - INFO - lr: 2.0983e-06 gnorm: 0.61 [2 days, 21:35:40< 3:24:12] +[titan] 2025-09-10 21:12:41,184 - root - INFO - step: 38140 loss: 2.4642 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.6406 global_avg_top_loss: 1.8236 +[titan] 2025-09-10 21:12:41,184 - root - INFO - lr: 2.0978e-06 gnorm: 0.64 [2 days, 21:36:12< 3:23:39] +[titan] 2025-09-10 21:13:13,040 - root - INFO - step: 38145 loss: 2.5467 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.6805 global_avg_top_loss: 1.8662 +[titan] 2025-09-10 21:13:13,040 - root - INFO - lr: 2.0973e-06 gnorm: 0.64 [2 days, 21:36:44< 3:23:06] +[titan] 2025-09-10 21:13:38,815 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:13:45,304 - root - INFO - step: 38150 loss: 2.5944 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.05 mfu: 48.94% global_avg_ntp_loss: 0.6975 global_avg_top_loss: 1.8969 +[titan] 2025-09-10 21:13:45,304 - root - INFO - lr: 2.0968e-06 gnorm: 0.72 [2 days, 21:37:16< 3:22:34] +[titan] 2025-09-10 21:14:17,188 - root - INFO - step: 38155 loss: 2.4192 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.53% global_avg_ntp_loss: 0.6173 global_avg_top_loss: 1.8019 +[titan] 2025-09-10 21:14:17,188 - root - INFO - lr: 2.0962e-06 gnorm: 0.63 [2 days, 21:37:48< 3:22:01] +[titan] 2025-09-10 21:14:49,073 - root - INFO - step: 38160 loss: 2.4354 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.6239 global_avg_top_loss: 1.8115 +[titan] 2025-09-10 21:14:49,073 - root - INFO - lr: 2.0957e-06 gnorm: 0.62 [2 days, 21:38:20< 3:21:28] +[titan] 2025-09-10 21:15:21,430 - root - INFO - step: 38165 loss: 2.4345 memory: 122.03GiB(87.57%) tps: 10,127 tflops: 482.66 mfu: 48.80% global_avg_ntp_loss: 0.6230 global_avg_top_loss: 1.8115 +[titan] 2025-09-10 21:15:21,430 - root - INFO - lr: 2.0952e-06 gnorm: 0.75 [2 days, 21:38:52< 3:20:55] +[titan] 2025-09-10 21:15:53,399 - root - INFO - step: 38170 loss: 2.5437 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.6779 global_avg_top_loss: 1.8657 +[titan] 2025-09-10 21:15:53,399 - root - INFO - lr: 2.0947e-06 gnorm: 0.63 [2 days, 21:39:24< 3:20:22] +[titan] 2025-09-10 21:16:25,339 - root - INFO - step: 38175 loss: 2.5746 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.6940 global_avg_top_loss: 1.8806 +[titan] 2025-09-10 21:16:25,339 - root - INFO - lr: 2.0942e-06 gnorm: 0.84 [2 days, 21:39:56< 3:19:49] +[titan] 2025-09-10 21:16:57,493 - root - INFO - step: 38180 loss: 2.4837 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.69 mfu: 49.11% global_avg_ntp_loss: 0.6551 global_avg_top_loss: 1.8286 +[titan] 
2025-09-10 21:16:57,494 - root - INFO - lr: 2.0936e-06 gnorm: 0.77 [2 days, 21:40:28< 3:19:16] +[titan] 2025-09-10 21:17:29,452 - root - INFO - step: 38185 loss: 2.3595 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.5924 global_avg_top_loss: 1.7671 +[titan] 2025-09-10 21:17:29,453 - root - INFO - lr: 2.0931e-06 gnorm: 0.62 [2 days, 21:41:00< 3:18:43] +[titan] 2025-09-10 21:18:01,356 - root - INFO - step: 38190 loss: 2.3010 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.5670 global_avg_top_loss: 1.7340 +[titan] 2025-09-10 21:18:01,356 - root - INFO - lr: 2.0926e-06 gnorm: 0.67 [2 days, 21:41:32< 3:18:10] +[titan] 2025-09-10 21:18:33,458 - root - INFO - step: 38195 loss: 2.3216 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.5705 global_avg_top_loss: 1.7511 +[titan] 2025-09-10 21:18:33,459 - root - INFO - lr: 2.0921e-06 gnorm: 0.88 [2 days, 21:42:04< 3:17:38] +[titan] 2025-09-10 21:18:59,069 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:19:05,431 - root - INFO - step: 38200 loss: 2.5379 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.46 mfu: 49.39% global_avg_ntp_loss: 0.6669 global_avg_top_loss: 1.8710 +[titan] 2025-09-10 21:19:05,431 - root - INFO - lr: 2.0916e-06 gnorm: 0.64 [2 days, 21:42:36< 3:17:05] +[titan] 2025-09-10 21:19:37,532 - root - INFO - step: 38205 loss: 2.3543 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.5864 global_avg_top_loss: 1.7680 +[titan] 2025-09-10 21:19:37,532 - root - INFO - lr: 2.0911e-06 gnorm: 0.63 [2 days, 21:43:08< 3:16:32] +[titan] 2025-09-10 21:20:09,480 - root - INFO - step: 38210 loss: 2.4121 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.6177 global_avg_top_loss: 1.7944 +[titan] 2025-09-10 21:20:09,480 - root - INFO - lr: 2.0906e-06 gnorm: 0.63 [2 days, 21:43:40< 3:15:59] +[titan] 2025-09-10 21:20:41,360 - root - INFO - step: 38215 loss: 2.4025 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.6105 global_avg_top_loss: 1.7920 +[titan] 2025-09-10 21:20:41,361 - root - INFO - lr: 2.0901e-06 gnorm: 0.65 [2 days, 21:44:12< 3:15:26] +[titan] 2025-09-10 21:21:13,365 - root - INFO - step: 38220 loss: 2.5149 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.6632 global_avg_top_loss: 1.8518 +[titan] 2025-09-10 21:21:13,366 - root - INFO - lr: 2.0896e-06 gnorm: 0.64 [2 days, 21:44:44< 3:14:53] +[titan] 2025-09-10 21:21:45,399 - root - INFO - step: 38225 loss: 2.6152 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.53 mfu: 49.30% global_avg_ntp_loss: 0.7108 global_avg_top_loss: 1.9044 +[titan] 2025-09-10 21:21:45,399 - root - INFO - lr: 2.0891e-06 gnorm: 0.67 [2 days, 21:45:16< 3:14:20] +[titan] 2025-09-10 21:22:17,455 - root - INFO - step: 38230 loss: 2.6202 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.7248 global_avg_top_loss: 1.8954 +[titan] 
2025-09-10 21:22:17,455 - root - INFO - lr: 2.0886e-06 gnorm: 0.73 [2 days, 21:45:48< 3:13:47] +[titan] 2025-09-10 21:22:49,599 - root - INFO - step: 38235 loss: 2.4404 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.6242 global_avg_top_loss: 1.8162 +[titan] 2025-09-10 21:22:49,600 - root - INFO - lr: 2.0881e-06 gnorm: 0.71 [2 days, 21:46:20< 3:13:14] +[titan] 2025-09-10 21:23:21,744 - root - INFO - step: 38240 loss: 2.4922 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.6486 global_avg_top_loss: 1.8436 +[titan] 2025-09-10 21:23:21,744 - root - INFO - lr: 2.0876e-06 gnorm: 0.65 [2 days, 21:46:52< 3:12:42] +[titan] 2025-09-10 21:23:53,680 - root - INFO - step: 38245 loss: 2.5161 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.6704 global_avg_top_loss: 1.8457 +[titan] 2025-09-10 21:23:53,681 - root - INFO - lr: 2.0871e-06 gnorm: 0.69 [2 days, 21:47:24< 3:12:09] +[titan] 2025-09-10 21:24:19,364 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:24:25,828 - root - INFO - step: 38250 loss: 2.5414 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.6768 global_avg_top_loss: 1.8646 +[titan] 2025-09-10 21:24:25,829 - root - INFO - lr: 2.0866e-06 gnorm: 0.62 [2 days, 21:47:56< 3:11:36] +[titan] 2025-09-10 21:24:57,946 - root - INFO - step: 38255 loss: 2.5108 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.6605 global_avg_top_loss: 1.8503 +[titan] 2025-09-10 21:24:57,947 - root - INFO - lr: 2.0861e-06 gnorm: 0.84 [2 days, 21:48:28< 3:11:03] +[titan] 2025-09-10 21:25:29,956 - root - INFO - step: 38260 loss: 2.5093 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.6623 global_avg_top_loss: 1.8470 +[titan] 2025-09-10 21:25:29,957 - root - INFO - lr: 2.0856e-06 gnorm: 0.80 [2 days, 21:49:00< 3:10:30] +[titan] 2025-09-10 21:26:02,022 - root - INFO - step: 38265 loss: 2.3550 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.5890 global_avg_top_loss: 1.7660 +[titan] 2025-09-10 21:26:02,023 - root - INFO - lr: 2.0851e-06 gnorm: 0.60 [2 days, 21:49:32< 3:09:57] +[titan] 2025-09-10 21:26:34,047 - root - INFO - step: 38270 loss: 2.3259 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.5739 global_avg_top_loss: 1.7520 +[titan] 2025-09-10 21:26:34,047 - root - INFO - lr: 2.0846e-06 gnorm: 0.71 [2 days, 21:50:05< 3:09:24] +[titan] 2025-09-10 21:27:06,106 - root - INFO - step: 38275 loss: 2.2983 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.5595 global_avg_top_loss: 1.7388 +[titan] 2025-09-10 21:27:06,106 - root - INFO - lr: 2.0841e-06 gnorm: 0.85 [2 days, 21:50:37< 3:08:51] +[titan] 2025-09-10 21:27:38,114 - root - INFO - step: 38280 loss: 2.4626 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.6400 global_avg_top_loss: 1.8226 +[titan] 
2025-09-10 21:27:38,114 - root - INFO - lr: 2.0837e-06 gnorm: 0.64 [2 days, 21:51:09< 3:08:19] +[titan] 2025-09-10 21:28:10,273 - root - INFO - step: 38285 loss: 2.3397 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.5816 global_avg_top_loss: 1.7582 +[titan] 2025-09-10 21:28:10,273 - root - INFO - lr: 2.0832e-06 gnorm: 0.66 [2 days, 21:51:41< 3:07:46] +[titan] 2025-09-10 21:28:42,293 - root - INFO - step: 38290 loss: 2.3839 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.6034 global_avg_top_loss: 1.7805 +[titan] 2025-09-10 21:28:42,293 - root - INFO - lr: 2.0827e-06 gnorm: 0.63 [2 days, 21:52:13< 3:07:13] +[titan] 2025-09-10 21:29:14,260 - root - INFO - step: 38295 loss: 2.4004 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.6094 global_avg_top_loss: 1.7910 +[titan] 2025-09-10 21:29:14,260 - root - INFO - lr: 2.0822e-06 gnorm: 0.67 [2 days, 21:52:45< 3:06:40] +[titan] 2025-09-10 21:29:39,851 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:29:46,243 - root - INFO - step: 38300 loss: 2.4885 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.6517 global_avg_top_loss: 1.8368 +[titan] 2025-09-10 21:29:46,244 - root - INFO - lr: 2.0817e-06 gnorm: 0.67 [2 days, 21:53:17< 3:06:07] +[titan] 2025-09-10 21:30:18,366 - root - INFO - step: 38305 loss: 2.5829 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.6956 global_avg_top_loss: 1.8873 +[titan] 2025-09-10 21:30:18,366 - root - INFO - lr: 2.0812e-06 gnorm: 0.68 [2 days, 21:53:49< 3:05:34] +[titan] 2025-09-10 21:30:50,271 - root - INFO - step: 38310 loss: 3.5687 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 1.2494 global_avg_top_loss: 2.3193 +[titan] 2025-09-10 21:30:50,272 - root - INFO - lr: 2.0808e-06 gnorm: 0.73 [2 days, 21:54:21< 3:05:01] +[titan] 2025-09-10 21:31:22,208 - root - INFO - step: 38315 loss: 2.4777 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.6445 global_avg_top_loss: 1.8332 +[titan] 2025-09-10 21:31:22,208 - root - INFO - lr: 2.0803e-06 gnorm: 0.68 [2 days, 21:54:53< 3:04:28] +[titan] 2025-09-10 21:31:54,268 - root - INFO - step: 38320 loss: 2.3941 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.6064 global_avg_top_loss: 1.7877 +[titan] 2025-09-10 21:31:54,268 - root - INFO - lr: 2.0798e-06 gnorm: 0.61 [2 days, 21:55:25< 3:03:55] +[titan] 2025-09-10 21:32:26,272 - root - INFO - step: 38325 loss: 2.4379 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.6258 global_avg_top_loss: 1.8121 +[titan] 2025-09-10 21:32:26,272 - root - INFO - lr: 2.0793e-06 gnorm: 0.63 [2 days, 21:55:57< 3:03:23] +[titan] 2025-09-10 21:32:58,342 - root - INFO - step: 38330 loss: 2.5009 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.6580 global_avg_top_loss: 1.8430 +[titan] 
2025-09-10 21:32:58,342 - root - INFO - lr: 2.0789e-06 gnorm: 0.58 [2 days, 21:56:29< 3:02:50] +[titan] 2025-09-10 21:33:30,383 - root - INFO - step: 38335 loss: 2.4786 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.6499 global_avg_top_loss: 1.8287 +[titan] 2025-09-10 21:33:30,383 - root - INFO - lr: 2.0784e-06 gnorm: 0.85 [2 days, 21:57:01< 3:02:17] +[titan] 2025-09-10 21:34:02,393 - root - INFO - step: 38340 loss: 2.5325 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.6730 global_avg_top_loss: 1.8595 +[titan] 2025-09-10 21:34:02,393 - root - INFO - lr: 2.0779e-06 gnorm: 0.82 [2 days, 21:57:33< 3:01:44] +[titan] 2025-09-10 21:34:34,431 - root - INFO - step: 38345 loss: 2.3684 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.5947 global_avg_top_loss: 1.7737 +[titan] 2025-09-10 21:34:34,431 - root - INFO - lr: 2.0775e-06 gnorm: 0.63 [2 days, 21:58:05< 3:01:11] +[titan] 2025-09-10 21:35:00,232 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:35:06,598 - root - INFO - step: 38350 loss: 2.3397 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.5795 global_avg_top_loss: 1.7602 +[titan] 2025-09-10 21:35:06,599 - root - INFO - lr: 2.0770e-06 gnorm: 0.71 [2 days, 21:58:37< 3:00:38] +[titan] 2025-09-10 21:35:38,714 - root - INFO - step: 38355 loss: 2.3226 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.28 mfu: 49.17% global_avg_ntp_loss: 0.5692 global_avg_top_loss: 1.7534 +[titan] 2025-09-10 21:35:38,714 - root - INFO - lr: 2.0765e-06 gnorm: 0.84 [2 days, 21:59:09< 3:00:05] +[titan] 2025-09-10 21:36:10,535 - root - INFO - step: 38360 loss: 2.9329 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.9058 global_avg_top_loss: 2.0271 +[titan] 2025-09-10 21:36:10,536 - root - INFO - lr: 2.0761e-06 gnorm: 0.59 [2 days, 21:59:41< 2:59:32] +[titan] 2025-09-10 21:36:42,799 - root - INFO - step: 38365 loss: 2.4157 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.05 mfu: 48.94% global_avg_ntp_loss: 0.6160 global_avg_top_loss: 1.7996 +[titan] 2025-09-10 21:36:42,799 - root - INFO - lr: 2.0756e-06 gnorm: 0.70 [2 days, 22:00:13< 2:59:00] +[titan] 2025-09-10 21:37:14,672 - root - INFO - step: 38370 loss: 2.3474 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.5882 global_avg_top_loss: 1.7592 +[titan] 2025-09-10 21:37:14,672 - root - INFO - lr: 2.0751e-06 gnorm: 0.61 [2 days, 22:00:45< 2:58:27] +[titan] 2025-09-10 21:37:46,485 - root - INFO - step: 38375 loss: 2.4015 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.6113 global_avg_top_loss: 1.7902 +[titan] 2025-09-10 21:37:46,485 - root - INFO - lr: 2.0747e-06 gnorm: 0.67 [2 days, 22:01:17< 2:57:54] +[titan] 2025-09-10 21:38:18,561 - root - INFO - step: 38380 loss: 2.5629 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.6863 global_avg_top_loss: 1.8766 +[titan] 
2025-09-10 21:38:18,562 - root - INFO - lr: 2.0742e-06 gnorm: 0.64 [2 days, 22:01:49< 2:57:21] +[titan] 2025-09-10 21:38:50,689 - root - INFO - step: 38385 loss: 2.6099 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.10 mfu: 49.15% global_avg_ntp_loss: 0.7084 global_avg_top_loss: 1.9015 +[titan] 2025-09-10 21:38:50,689 - root - INFO - lr: 2.0738e-06 gnorm: 0.70 [2 days, 22:02:21< 2:56:48] +[titan] 2025-09-10 21:39:22,774 - root - INFO - step: 38390 loss: 3.1237 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.9950 global_avg_top_loss: 2.1287 +[titan] 2025-09-10 21:39:22,774 - root - INFO - lr: 2.0733e-06 gnorm: 0.66 [2 days, 22:02:53< 2:56:15] +[titan] 2025-09-10 21:39:54,668 - root - INFO - step: 38395 loss: 2.5167 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.6594 global_avg_top_loss: 1.8573 +[titan] 2025-09-10 21:39:54,669 - root - INFO - lr: 2.0729e-06 gnorm: 0.68 [2 days, 22:03:25< 2:55:42] +[titan] 2025-09-10 21:40:20,392 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:40:26,822 - root - INFO - step: 38400 loss: 2.4433 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.6281 global_avg_top_loss: 1.8152 +[titan] 2025-09-10 21:40:26,822 - root - INFO - lr: 2.0724e-06 gnorm: 0.63 [2 days, 22:03:57< 2:55:09] +[titan] 2025-09-10 21:40:27,119 - root - INFO - Dumping profiler traces at step 38400 +[titan] 2025-09-10 21:40:27,194 - root - INFO - Finished dumping profiler traces in 0.08 seconds +[titan] 2025-09-10 21:40:59,070 - root - INFO - step: 38405 loss: 2.5647 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.28 mfu: 48.97% global_avg_ntp_loss: 0.6812 global_avg_top_loss: 1.8835 +[titan] 2025-09-10 21:40:59,070 - root - INFO - lr: 2.0720e-06 gnorm: 0.73 [2 days, 22:04:30< 2:54:37] +[titan] 2025-09-10 21:41:31,229 - root - INFO - step: 38410 loss: 2.5903 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.6944 global_avg_top_loss: 1.8959 +[titan] 2025-09-10 21:41:31,230 - root - INFO - lr: 2.0715e-06 gnorm: 0.64 [2 days, 22:05:02< 2:54:04] +[titan] 2025-09-10 21:42:03,316 - root - INFO - step: 38415 loss: 2.5185 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.6693 global_avg_top_loss: 1.8493 +[titan] 2025-09-10 21:42:03,316 - root - INFO - lr: 2.0711e-06 gnorm: 0.77 [2 days, 22:05:34< 2:53:31] +[titan] 2025-09-10 21:42:35,219 - root - INFO - step: 38420 loss: 2.5643 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.6896 global_avg_top_loss: 1.8747 +[titan] 2025-09-10 21:42:35,219 - root - INFO - lr: 2.0706e-06 gnorm: 0.82 [2 days, 22:06:06< 2:52:58] +[titan] 2025-09-10 21:43:07,270 - root - INFO - step: 38425 loss: 2.3886 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.6064 global_avg_top_loss: 1.7822 +[titan] 2025-09-10 21:43:07,270 - root - INFO - lr: 2.0702e-06 gnorm: 0.63 [2 days, 22:06:38< 2:52:25] +[titan] 2025-09-10 
21:43:39,184 - root - INFO - step: 38430 loss: 2.4002 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.6137 global_avg_top_loss: 1.7865 +[titan] 2025-09-10 21:43:39,184 - root - INFO - lr: 2.0697e-06 gnorm: 0.65 [2 days, 22:07:10< 2:51:52] +[titan] 2025-09-10 21:44:11,254 - root - INFO - step: 38435 loss: 2.3826 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.5999 global_avg_top_loss: 1.7828 +[titan] 2025-09-10 21:44:11,254 - root - INFO - lr: 2.0693e-06 gnorm: 0.99 [2 days, 22:07:42< 2:51:19] +[titan] 2025-09-10 21:44:43,292 - root - INFO - step: 38440 loss: 2.3654 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.5993 global_avg_top_loss: 1.7662 +[titan] 2025-09-10 21:44:43,293 - root - INFO - lr: 2.0688e-06 gnorm: 0.59 [2 days, 22:08:14< 2:50:46] +[titan] 2025-09-10 21:45:15,304 - root - INFO - step: 38445 loss: 2.4309 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.6195 global_avg_top_loss: 1.8114 +[titan] 2025-09-10 21:45:15,305 - root - INFO - lr: 2.0684e-06 gnorm: 0.69 [2 days, 22:08:46< 2:50:14] +[titan] 2025-09-10 21:45:40,935 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:45:47,300 - root - INFO - step: 38450 loss: 2.4733 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.6438 global_avg_top_loss: 1.8295 +[titan] 2025-09-10 21:45:47,301 - root - INFO - lr: 2.0680e-06 gnorm: 0.67 [2 days, 22:09:18< 2:49:41] +[titan] 2025-09-10 21:46:19,311 - root - INFO - step: 38455 loss: 2.3433 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.5923 global_avg_top_loss: 1.7510 +[titan] 2025-09-10 21:46:19,311 - root - INFO - lr: 2.0675e-06 gnorm: 0.63 [2 days, 22:09:50< 2:49:08] +[titan] 2025-09-10 21:46:51,341 - root - INFO - step: 38460 loss: 2.5030 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.6564 global_avg_top_loss: 1.8466 +[titan] 2025-09-10 21:46:51,341 - root - INFO - lr: 2.0671e-06 gnorm: 0.68 [2 days, 22:10:22< 2:48:35] +[titan] 2025-09-10 21:47:23,671 - root - INFO - step: 38465 loss: 2.5533 memory: 122.03GiB(87.57%) tps: 10,136 tflops: 483.05 mfu: 48.84% global_avg_ntp_loss: 0.6793 global_avg_top_loss: 1.8740 +[titan] 2025-09-10 21:47:23,672 - root - INFO - lr: 2.0666e-06 gnorm: 0.70 [2 days, 22:10:54< 2:48:02] +[titan] 2025-09-10 21:47:55,627 - root - INFO - step: 38470 loss: 3.0974 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.9852 global_avg_top_loss: 2.1123 +[titan] 2025-09-10 21:47:55,627 - root - INFO - lr: 2.0662e-06 gnorm: 0.71 [2 days, 22:11:26< 2:47:29] +[titan] 2025-09-10 21:48:27,506 - root - INFO - step: 38475 loss: 2.4808 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.6465 global_avg_top_loss: 1.8344 +[titan] 2025-09-10 21:48:27,507 - root - INFO - lr: 2.0658e-06 gnorm: 0.66 [2 days, 22:11:58< 2:46:56] +[titan] 2025-09-10 21:48:59,518 - root - INFO - step: 38480 loss: 2.3676 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.5932 global_avg_top_loss: 1.7743 +[titan] 
2025-09-10 21:48:59,518 - root - INFO - lr: 2.0654e-06 gnorm: 0.60 [2 days, 22:12:30< 2:46:23] +[titan] 2025-09-10 21:49:31,508 - root - INFO - step: 38485 loss: 2.6177 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7243 global_avg_top_loss: 1.8934 +[titan] 2025-09-10 21:49:31,508 - root - INFO - lr: 2.0649e-06 gnorm: 0.82 [2 days, 22:13:02< 2:45:51] +[titan] 2025-09-10 21:50:03,670 - root - INFO - step: 38490 loss: 2.5068 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.57 mfu: 49.10% global_avg_ntp_loss: 0.6580 global_avg_top_loss: 1.8488 +[titan] 2025-09-10 21:50:03,670 - root - INFO - lr: 2.0645e-06 gnorm: 0.63 [2 days, 22:13:34< 2:45:18] +[titan] 2025-09-10 21:50:35,659 - root - INFO - step: 38495 loss: 2.5210 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.6635 global_avg_top_loss: 1.8575 +[titan] 2025-09-10 21:50:35,659 - root - INFO - lr: 2.0641e-06 gnorm: 0.83 [2 days, 22:14:06< 2:44:45] +[titan] 2025-09-10 21:51:01,305 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:51:07,723 - root - INFO - step: 38500 loss: 2.5638 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.6894 global_avg_top_loss: 1.8745 +[titan] 2025-09-10 21:51:07,724 - root - INFO - lr: 2.0636e-06 gnorm: 0.75 [2 days, 22:14:38< 2:44:12] +[titan] 2025-09-10 21:51:39,718 - root - INFO - step: 38505 loss: 2.3290 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.5796 global_avg_top_loss: 1.7494 +[titan] 2025-09-10 21:51:39,718 - root - INFO - lr: 2.0632e-06 gnorm: 0.62 [2 days, 22:15:10< 2:43:39] +[titan] 2025-09-10 21:52:11,698 - root - INFO - step: 38510 loss: 2.3743 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.5943 global_avg_top_loss: 1.7800 +[titan] 2025-09-10 21:52:11,699 - root - INFO - lr: 2.0628e-06 gnorm: 0.70 [2 days, 22:15:42< 2:43:06] +[titan] 2025-09-10 21:52:43,793 - root - INFO - step: 38515 loss: 2.3395 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.5773 global_avg_top_loss: 1.7622 +[titan] 2025-09-10 21:52:43,793 - root - INFO - lr: 2.0624e-06 gnorm: 0.89 [2 days, 22:16:14< 2:42:33] +[titan] 2025-09-10 21:53:15,728 - root - INFO - step: 38520 loss: 2.5236 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.6688 global_avg_top_loss: 1.8548 +[titan] 2025-09-10 21:53:15,728 - root - INFO - lr: 2.0620e-06 gnorm: 0.63 [2 days, 22:16:46< 2:42:00] +[titan] 2025-09-10 21:53:47,775 - root - INFO - step: 38525 loss: 2.3767 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.5985 global_avg_top_loss: 1.7782 +[titan] 2025-09-10 21:53:47,776 - root - INFO - lr: 2.0615e-06 gnorm: 0.74 [2 days, 22:17:18< 2:41:28] +[titan] 2025-09-10 21:54:19,756 - root - INFO - step: 38530 loss: 2.4806 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.6459 global_avg_top_loss: 1.8347 +[titan] 
2025-09-10 21:54:19,756 - root - INFO - lr: 2.0611e-06 gnorm: 0.63 [2 days, 22:17:50< 2:40:55] +[titan] 2025-09-10 21:54:51,993 - root - INFO - step: 38535 loss: 2.4157 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.6119 global_avg_top_loss: 1.8037 +[titan] 2025-09-10 21:54:51,993 - root - INFO - lr: 2.0607e-06 gnorm: 0.67 [2 days, 22:18:22< 2:40:22] +[titan] 2025-09-10 21:55:24,141 - root - INFO - step: 38540 loss: 2.5067 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.6602 global_avg_top_loss: 1.8465 +[titan] 2025-09-10 21:55:24,141 - root - INFO - lr: 2.0603e-06 gnorm: 0.67 [2 days, 22:18:55< 2:39:49] +[titan] 2025-09-10 21:55:56,197 - root - INFO - step: 38545 loss: 2.5435 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.6784 global_avg_top_loss: 1.8651 +[titan] 2025-09-10 21:55:56,197 - root - INFO - lr: 2.0599e-06 gnorm: 0.65 [2 days, 22:19:27< 2:39:16] +[titan] 2025-09-10 21:56:21,814 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 21:56:28,137 - root - INFO - step: 38550 loss: 2.6236 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7186 global_avg_top_loss: 1.9050 +[titan] 2025-09-10 21:56:28,137 - root - INFO - lr: 2.0595e-06 gnorm: 0.72 [2 days, 22:19:59< 2:38:43] +[titan] 2025-09-10 21:57:00,265 - root - INFO - step: 38555 loss: 2.4820 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.09 mfu: 49.15% global_avg_ntp_loss: 0.6444 global_avg_top_loss: 1.8376 +[titan] 2025-09-10 21:57:00,265 - root - INFO - lr: 2.0591e-06 gnorm: 0.67 [2 days, 22:20:31< 2:38:10] +[titan] 2025-09-10 21:57:32,126 - root - INFO - step: 38560 loss: 2.4759 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.6386 global_avg_top_loss: 1.8373 +[titan] 2025-09-10 21:57:32,126 - root - INFO - lr: 2.0587e-06 gnorm: 0.65 [2 days, 22:21:03< 2:37:37] +[titan] 2025-09-10 21:58:03,990 - root - INFO - step: 38565 loss: 2.5183 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.6661 global_avg_top_loss: 1.8523 +[titan] 2025-09-10 21:58:03,990 - root - INFO - lr: 2.0583e-06 gnorm: 0.80 [2 days, 22:21:34< 2:37:05] +[titan] 2025-09-10 21:58:35,893 - root - INFO - step: 38570 loss: 2.5561 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.6824 global_avg_top_loss: 1.8737 +[titan] 2025-09-10 21:58:35,894 - root - INFO - lr: 2.0579e-06 gnorm: 0.64 [2 days, 22:22:06< 2:36:32] +[titan] 2025-09-10 21:59:07,916 - root - INFO - step: 38575 loss: 2.5466 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.6799 global_avg_top_loss: 1.8667 +[titan] 2025-09-10 21:59:07,916 - root - INFO - lr: 2.0574e-06 gnorm: 0.75 [2 days, 22:22:38< 2:35:59] +[titan] 2025-09-10 21:59:40,057 - root - INFO - step: 38580 loss: 2.5611 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.6888 global_avg_top_loss: 1.8724 +[titan] 
2025-09-10 21:59:40,057 - root - INFO - lr: 2.0570e-06 gnorm: 0.78 [2 days, 22:23:10< 2:35:26] +[titan] 2025-09-10 22:00:12,136 - root - INFO - step: 38585 loss: 2.2767 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.5574 global_avg_top_loss: 1.7193 +[titan] 2025-09-10 22:00:12,136 - root - INFO - lr: 2.0566e-06 gnorm: 0.60 [2 days, 22:23:43< 2:34:53] +[titan] 2025-09-10 22:00:44,021 - root - INFO - step: 38590 loss: 2.2913 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 0.5579 global_avg_top_loss: 1.7334 +[titan] 2025-09-10 22:00:44,021 - root - INFO - lr: 2.0562e-06 gnorm: 0.67 [2 days, 22:24:14< 2:34:20] +[titan] 2025-09-10 22:01:16,057 - root - INFO - step: 38595 loss: 2.3054 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.5612 global_avg_top_loss: 1.7441 +[titan] 2025-09-10 22:01:16,057 - root - INFO - lr: 2.0558e-06 gnorm: 0.78 [2 days, 22:24:46< 2:33:47] +[titan] 2025-09-10 22:01:41,565 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:01:48,036 - root - INFO - step: 38600 loss: 2.5205 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.6611 global_avg_top_loss: 1.8594 +[titan] 2025-09-10 22:01:48,036 - root - INFO - lr: 2.0555e-06 gnorm: 0.63 [2 days, 22:25:18< 2:33:14] +[titan] 2025-09-10 22:02:20,113 - root - INFO - step: 38605 loss: 2.4162 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.6164 global_avg_top_loss: 1.7998 +[titan] 2025-09-10 22:02:20,113 - root - INFO - lr: 2.0551e-06 gnorm: 0.74 [2 days, 22:25:51< 2:32:42] +[titan] 2025-09-10 22:02:52,032 - root - INFO - step: 38610 loss: 2.4538 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.6366 global_avg_top_loss: 1.8172 +[titan] 2025-09-10 22:02:52,032 - root - INFO - lr: 2.0547e-06 gnorm: 0.66 [2 days, 22:26:22< 2:32:09] +[titan] 2025-09-10 22:03:23,938 - root - INFO - step: 38615 loss: 2.3603 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.5894 global_avg_top_loss: 1.7709 +[titan] 2025-09-10 22:03:23,938 - root - INFO - lr: 2.0543e-06 gnorm: 0.64 [2 days, 22:26:54< 2:31:36] +[titan] 2025-09-10 22:03:55,865 - root - INFO - step: 38620 loss: 2.5662 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.6883 global_avg_top_loss: 1.8779 +[titan] 2025-09-10 22:03:55,865 - root - INFO - lr: 2.0539e-06 gnorm: 0.62 [2 days, 22:27:26< 2:31:03] +[titan] 2025-09-10 22:04:27,856 - root - INFO - step: 38625 loss: 2.6355 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7217 global_avg_top_loss: 1.9138 +[titan] 2025-09-10 22:04:27,857 - root - INFO - lr: 2.0535e-06 gnorm: 0.68 [2 days, 22:27:58< 2:30:30] +[titan] 2025-09-10 22:04:59,986 - root - INFO - step: 38630 loss: 2.6392 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.7246 global_avg_top_loss: 1.9146 +[titan] 
2025-09-10 22:04:59,986 - root - INFO - lr: 2.0531e-06 gnorm: 0.71 [2 days, 22:28:30< 2:29:57] +[titan] 2025-09-10 22:05:31,981 - root - INFO - step: 38635 loss: 2.4988 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.6539 global_avg_top_loss: 1.8449 +[titan] 2025-09-10 22:05:31,981 - root - INFO - lr: 2.0527e-06 gnorm: 0.67 [2 days, 22:29:02< 2:29:24] +[titan] 2025-09-10 22:06:03,913 - root - INFO - step: 38640 loss: 2.4628 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.6352 global_avg_top_loss: 1.8276 +[titan] 2025-09-10 22:06:03,914 - root - INFO - lr: 2.0523e-06 gnorm: 0.65 [2 days, 22:29:34< 2:28:52] +[titan] 2025-09-10 22:06:35,943 - root - INFO - step: 38645 loss: 2.4661 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.6380 global_avg_top_loss: 1.8281 +[titan] 2025-09-10 22:06:35,943 - root - INFO - lr: 2.0519e-06 gnorm: 0.66 [2 days, 22:30:06< 2:28:19] +[titan] 2025-09-10 22:07:01,518 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:07:07,981 - root - INFO - step: 38650 loss: 2.5383 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.6743 global_avg_top_loss: 1.8640 +[titan] 2025-09-10 22:07:07,981 - root - INFO - lr: 2.0516e-06 gnorm: 0.66 [2 days, 22:30:38< 2:27:46] +[titan] 2025-09-10 22:07:39,953 - root - INFO - step: 38655 loss: 2.3687 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.46 mfu: 49.39% global_avg_ntp_loss: 0.5975 global_avg_top_loss: 1.7713 +[titan] 2025-09-10 22:07:39,953 - root - INFO - lr: 2.0512e-06 gnorm: 0.75 [2 days, 22:31:10< 2:27:13] +[titan] 2025-09-10 22:08:11,985 - root - INFO - step: 38660 loss: 2.4659 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.6461 global_avg_top_loss: 1.8197 +[titan] 2025-09-10 22:08:11,985 - root - INFO - lr: 2.0508e-06 gnorm: 0.82 [2 days, 22:31:42< 2:26:40] +[titan] 2025-09-10 22:08:44,122 - root - INFO - step: 38665 loss: 2.3246 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.5769 global_avg_top_loss: 1.7477 +[titan] 2025-09-10 22:08:44,122 - root - INFO - lr: 2.0504e-06 gnorm: 0.59 [2 days, 22:32:15< 2:26:07] +[titan] 2025-09-10 22:09:16,152 - root - INFO - step: 38670 loss: 2.2387 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.5318 global_avg_top_loss: 1.7069 +[titan] 2025-09-10 22:09:16,153 - root - INFO - lr: 2.0500e-06 gnorm: 0.68 [2 days, 22:32:47< 2:25:34] +[titan] 2025-09-10 22:09:48,462 - root - INFO - step: 38675 loss: 2.3041 memory: 122.03GiB(87.57%) tps: 10,142 tflops: 483.37 mfu: 48.87% global_avg_ntp_loss: 0.5611 global_avg_top_loss: 1.7431 +[titan] 2025-09-10 22:09:48,462 - root - INFO - lr: 2.0497e-06 gnorm: 0.79 [2 days, 22:33:19< 2:25:01] +[titan] 2025-09-10 22:10:20,410 - root - INFO - step: 38680 loss: 2.4759 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.6439 global_avg_top_loss: 1.8320 +[titan] 
2025-09-10 22:10:20,411 - root - INFO - lr: 2.0493e-06 gnorm: 0.62 [2 days, 22:33:51< 2:24:29] +[titan] 2025-09-10 22:10:52,322 - root - INFO - step: 38685 loss: 2.3984 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.6048 global_avg_top_loss: 1.7936 +[titan] 2025-09-10 22:10:52,323 - root - INFO - lr: 2.0489e-06 gnorm: 0.67 [2 days, 22:34:23< 2:23:56] +[titan] 2025-09-10 22:11:24,184 - root - INFO - step: 38690 loss: 2.5174 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.6685 global_avg_top_loss: 1.8489 +[titan] 2025-09-10 22:11:24,184 - root - INFO - lr: 2.0486e-06 gnorm: 0.63 [2 days, 22:34:55< 2:23:23] +[titan] 2025-09-10 22:11:56,107 - root - INFO - step: 38695 loss: 2.4432 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 0.6233 global_avg_top_loss: 1.8199 +[titan] 2025-09-10 22:11:56,107 - root - INFO - lr: 2.0482e-06 gnorm: 0.68 [2 days, 22:35:27< 2:22:50] +[titan] 2025-09-10 22:12:21,808 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:12:28,232 - root - INFO - step: 38700 loss: 2.5562 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.14 mfu: 49.15% global_avg_ntp_loss: 0.6771 global_avg_top_loss: 1.8791 +[titan] 2025-09-10 22:12:28,232 - root - INFO - lr: 2.0478e-06 gnorm: 0.69 [2 days, 22:35:59< 2:22:17] +[titan] 2025-09-10 22:13:00,210 - root - INFO - step: 38705 loss: 2.5816 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.6945 global_avg_top_loss: 1.8871 +[titan] 2025-09-10 22:13:00,210 - root - INFO - lr: 2.0475e-06 gnorm: 0.65 [2 days, 22:36:31< 2:21:44] +[titan] 2025-09-10 22:13:32,103 - root - INFO - step: 38710 loss: 2.5536 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.6915 global_avg_top_loss: 1.8621 +[titan] 2025-09-10 22:13:32,103 - root - INFO - lr: 2.0471e-06 gnorm: 0.73 [2 days, 22:37:03< 2:21:11] +[titan] 2025-09-10 22:14:04,101 - root - INFO - step: 38715 loss: 2.3988 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.6067 global_avg_top_loss: 1.7921 +[titan] 2025-09-10 22:14:04,101 - root - INFO - lr: 2.0467e-06 gnorm: 0.70 [2 days, 22:37:35< 2:20:39] +[titan] 2025-09-10 22:14:36,119 - root - INFO - step: 38720 loss: 2.4884 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.6481 global_avg_top_loss: 1.8403 +[titan] 2025-09-10 22:14:36,119 - root - INFO - lr: 2.0464e-06 gnorm: 0.70 [2 days, 22:38:07< 2:20:06] +[titan] 2025-09-10 22:15:08,137 - root - INFO - step: 38725 loss: 2.4233 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.6181 global_avg_top_loss: 1.8053 +[titan] 2025-09-10 22:15:08,137 - root - INFO - lr: 2.0460e-06 gnorm: 0.72 [2 days, 22:38:39< 2:19:33] +[titan] 2025-09-10 22:15:40,184 - root - INFO - step: 38730 loss: 2.6218 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7093 global_avg_top_loss: 1.9124 +[titan] 
2025-09-10 22:15:40,184 - root - INFO - lr: 2.0456e-06 gnorm: 0.64 [2 days, 22:39:11< 2:19:00] +[titan] 2025-09-10 22:16:12,294 - root - INFO - step: 38735 loss: 2.4628 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.6389 global_avg_top_loss: 1.8239 +[titan] 2025-09-10 22:16:12,294 - root - INFO - lr: 2.0453e-06 gnorm: 0.68 [2 days, 22:39:43< 2:18:27] +[titan] 2025-09-10 22:16:44,418 - root - INFO - step: 38740 loss: 2.5336 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.6718 global_avg_top_loss: 1.8618 +[titan] 2025-09-10 22:16:44,418 - root - INFO - lr: 2.0449e-06 gnorm: 0.85 [2 days, 22:40:15< 2:17:54] +[titan] 2025-09-10 22:17:16,358 - root - INFO - step: 38745 loss: 2.2518 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.5459 global_avg_top_loss: 1.7059 +[titan] 2025-09-10 22:17:16,359 - root - INFO - lr: 2.0446e-06 gnorm: 0.60 [2 days, 22:40:47< 2:17:21] +[titan] 2025-09-10 22:17:41,993 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:17:48,427 - root - INFO - step: 38750 loss: 2.3749 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.5950 global_avg_top_loss: 1.7798 +[titan] 2025-09-10 22:17:48,427 - root - INFO - lr: 2.0442e-06 gnorm: 0.69 [2 days, 22:41:19< 2:16:49] +[titan] 2025-09-10 22:18:20,413 - root - INFO - step: 38755 loss: 2.3276 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.5719 global_avg_top_loss: 1.7557 +[titan] 2025-09-10 22:18:20,413 - root - INFO - lr: 2.0439e-06 gnorm: 0.77 [2 days, 22:41:51< 2:16:16] +[titan] 2025-09-10 22:18:52,404 - root - INFO - step: 38760 loss: 2.4025 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.6136 global_avg_top_loss: 1.7889 +[titan] 2025-09-10 22:18:52,404 - root - INFO - lr: 2.0435e-06 gnorm: 0.63 [2 days, 22:42:23< 2:15:43] +[titan] 2025-09-10 22:19:24,328 - root - INFO - step: 38765 loss: 2.3653 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 0.5940 global_avg_top_loss: 1.7713 +[titan] 2025-09-10 22:19:24,328 - root - INFO - lr: 2.0432e-06 gnorm: 0.70 [2 days, 22:42:55< 2:15:10] +[titan] 2025-09-10 22:19:56,544 - root - INFO - step: 38770 loss: 2.4705 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.76 mfu: 49.01% global_avg_ntp_loss: 0.6429 global_avg_top_loss: 1.8275 +[titan] 2025-09-10 22:19:56,545 - root - INFO - lr: 2.0428e-06 gnorm: 0.66 [2 days, 22:43:27< 2:14:37] +[titan] 2025-09-10 22:20:28,675 - root - INFO - step: 38775 loss: 2.8720 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.8758 global_avg_top_loss: 1.9962 +[titan] 2025-09-10 22:20:28,676 - root - INFO - lr: 2.0425e-06 gnorm: 0.66 [2 days, 22:43:59< 2:14:04] +[titan] 2025-09-10 22:21:00,562 - root - INFO - step: 38780 loss: 2.4974 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6686 global_avg_top_loss: 1.8288 +[titan] 
2025-09-10 22:21:00,562 - root - INFO - lr: 2.0421e-06 gnorm: 0.68 [2 days, 22:44:31< 2:13:31] +[titan] 2025-09-10 22:21:32,681 - root - INFO - step: 38785 loss: 2.5450 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.6794 global_avg_top_loss: 1.8656 +[titan] 2025-09-10 22:21:32,682 - root - INFO - lr: 2.0418e-06 gnorm: 0.64 [2 days, 22:45:03< 2:12:58] +[titan] 2025-09-10 22:22:04,853 - root - INFO - step: 38790 loss: 2.7775 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.44 mfu: 49.08% global_avg_ntp_loss: 0.8099 global_avg_top_loss: 1.9676 +[titan] 2025-09-10 22:22:04,853 - root - INFO - lr: 2.0414e-06 gnorm: 0.73 [2 days, 22:45:35< 2:12:26] +[titan] 2025-09-10 22:22:37,035 - root - INFO - step: 38795 loss: 2.4216 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.6184 global_avg_top_loss: 1.8032 +[titan] 2025-09-10 22:22:37,035 - root - INFO - lr: 2.0411e-06 gnorm: 0.66 [2 days, 22:46:07< 2:11:53] +[titan] 2025-09-10 22:23:02,721 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:23:09,202 - root - INFO - step: 38800 loss: 2.7016 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.7696 global_avg_top_loss: 1.9320 +[titan] 2025-09-10 22:23:09,202 - root - INFO - lr: 2.0408e-06 gnorm: 0.65 [2 days, 22:46:40< 2:11:20] +[titan] 2025-09-10 22:23:41,389 - root - INFO - step: 38805 loss: 2.4321 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.20 mfu: 49.06% global_avg_ntp_loss: 0.6210 global_avg_top_loss: 1.8111 +[titan] 2025-09-10 22:23:41,389 - root - INFO - lr: 2.0404e-06 gnorm: 0.75 [2 days, 22:47:12< 2:10:47] +[titan] 2025-09-10 22:24:13,514 - root - INFO - step: 38810 loss: 2.5465 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.6772 global_avg_top_loss: 1.8692 +[titan] 2025-09-10 22:24:13,514 - root - INFO - lr: 2.0401e-06 gnorm: 0.68 [2 days, 22:47:44< 2:10:14] +[titan] 2025-09-10 22:24:45,667 - root - INFO - step: 38815 loss: 2.5835 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.6938 global_avg_top_loss: 1.8897 +[titan] 2025-09-10 22:24:45,667 - root - INFO - lr: 2.0397e-06 gnorm: 0.74 [2 days, 22:48:16< 2:09:41] +[titan] 2025-09-10 22:25:17,692 - root - INFO - step: 38820 loss: 2.5530 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.6844 global_avg_top_loss: 1.8686 +[titan] 2025-09-10 22:25:17,693 - root - INFO - lr: 2.0394e-06 gnorm: 0.84 [2 days, 22:48:48< 2:09:08] +[titan] 2025-09-10 22:25:49,709 - root - INFO - step: 38825 loss: 2.3498 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.5904 global_avg_top_loss: 1.7594 +[titan] 2025-09-10 22:25:49,709 - root - INFO - lr: 2.0391e-06 gnorm: 0.64 [2 days, 22:49:20< 2:08:36] +[titan] 2025-09-10 22:26:21,905 - root - INFO - step: 38830 loss: 2.7680 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.07 mfu: 49.05% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 1.9616 +[titan] 
2025-09-10 22:26:21,905 - root - INFO - lr: 2.0387e-06 gnorm: 0.72 [2 days, 22:49:52< 2:08:03] +[titan] 2025-09-10 22:26:53,998 - root - INFO - step: 38835 loss: 2.3059 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.62 mfu: 49.20% global_avg_ntp_loss: 0.5619 global_avg_top_loss: 1.7441 +[titan] 2025-09-10 22:26:53,998 - root - INFO - lr: 2.0384e-06 gnorm: 0.95 [2 days, 22:50:24< 2:07:30] +[titan] 2025-09-10 22:27:26,177 - root - INFO - step: 38840 loss: 2.3389 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.33 mfu: 49.07% global_avg_ntp_loss: 0.5831 global_avg_top_loss: 1.7558 +[titan] 2025-09-10 22:27:26,177 - root - INFO - lr: 2.0381e-06 gnorm: 0.62 [2 days, 22:50:57< 2:06:57] +[titan] 2025-09-10 22:27:58,166 - root - INFO - step: 38845 loss: 2.3612 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.5868 global_avg_top_loss: 1.7744 +[titan] 2025-09-10 22:27:58,166 - root - INFO - lr: 2.0378e-06 gnorm: 0.69 [2 days, 22:51:29< 2:06:24] +[titan] 2025-09-10 22:28:23,887 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:28:30,421 - root - INFO - step: 38850 loss: 2.4289 memory: 122.03GiB(87.57%) tps: 10,159 tflops: 484.18 mfu: 48.96% global_avg_ntp_loss: 0.6236 global_avg_top_loss: 1.8053 +[titan] 2025-09-10 22:28:30,421 - root - INFO - lr: 2.0374e-06 gnorm: 0.67 [2 days, 22:52:01< 2:05:51] +[titan] 2025-09-10 22:29:02,384 - root - INFO - step: 38855 loss: 2.9997 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.9416 global_avg_top_loss: 2.0581 +[titan] 2025-09-10 22:29:02,384 - root - INFO - lr: 2.0371e-06 gnorm: 0.70 [2 days, 22:52:33< 2:05:18] +[titan] 2025-09-10 22:29:34,511 - root - INFO - step: 38860 loss: 2.5349 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.10 mfu: 49.15% global_avg_ntp_loss: 0.6688 global_avg_top_loss: 1.8662 +[titan] 2025-09-10 22:29:34,512 - root - INFO - lr: 2.0368e-06 gnorm: 0.67 [2 days, 22:53:05< 2:04:46] +[titan] 2025-09-10 22:30:06,581 - root - INFO - step: 38865 loss: 2.6113 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.7084 global_avg_top_loss: 1.9028 +[titan] 2025-09-10 22:30:06,581 - root - INFO - lr: 2.0365e-06 gnorm: 0.69 [2 days, 22:53:37< 2:04:13] +[titan] 2025-09-10 22:30:38,680 - root - INFO - step: 38870 loss: 2.5606 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.6900 global_avg_top_loss: 1.8705 +[titan] 2025-09-10 22:30:38,680 - root - INFO - lr: 2.0361e-06 gnorm: 0.73 [2 days, 22:54:09< 2:03:40] +[titan] 2025-09-10 22:31:10,647 - root - INFO - step: 38875 loss: 2.4190 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.6138 global_avg_top_loss: 1.8052 +[titan] 2025-09-10 22:31:10,647 - root - INFO - lr: 2.0358e-06 gnorm: 0.70 [2 days, 22:54:41< 2:03:07] +[titan] 2025-09-10 22:31:42,640 - root - INFO - step: 38880 loss: 2.3625 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.5929 global_avg_top_loss: 1.7696 +[titan] 
2025-09-10 22:31:42,641 - root - INFO - lr: 2.0355e-06 gnorm: 0.67 [2 days, 22:55:13< 2:02:34] +[titan] 2025-09-10 22:32:14,677 - root - INFO - step: 38885 loss: 2.4331 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.6224 global_avg_top_loss: 1.8107 +[titan] 2025-09-10 22:32:14,678 - root - INFO - lr: 2.0352e-06 gnorm: 0.86 [2 days, 22:55:45< 2:02:01] +[titan] 2025-09-10 22:32:46,725 - root - INFO - step: 38890 loss: 2.5673 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.6917 global_avg_top_loss: 1.8756 +[titan] 2025-09-10 22:32:46,726 - root - INFO - lr: 2.0349e-06 gnorm: 0.61 [2 days, 22:56:17< 2:01:28] +[titan] 2025-09-10 22:33:18,574 - root - INFO - step: 38895 loss: 2.4829 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.6486 global_avg_top_loss: 1.8343 +[titan] 2025-09-10 22:33:18,574 - root - INFO - lr: 2.0346e-06 gnorm: 0.87 [2 days, 22:56:49< 2:00:56] +[titan] 2025-09-10 22:33:44,289 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:33:50,873 - root - INFO - step: 38900 loss: 2.5309 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.52 mfu: 48.89% global_avg_ntp_loss: 0.6761 global_avg_top_loss: 1.8548 +[titan] 2025-09-10 22:33:50,873 - root - INFO - lr: 2.0342e-06 gnorm: 0.77 [2 days, 22:57:21< 2:00:23] +[titan] 2025-09-10 22:34:23,193 - root - INFO - step: 38905 loss: 2.3409 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.20 mfu: 48.86% global_avg_ntp_loss: 0.5808 global_avg_top_loss: 1.7601 +[titan] 2025-09-10 22:34:23,194 - root - INFO - lr: 2.0339e-06 gnorm: 0.64 [2 days, 22:57:54< 1:59:50] +[titan] 2025-09-10 22:34:55,477 - root - INFO - step: 38910 loss: 2.2678 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.74 mfu: 48.91% global_avg_ntp_loss: 0.5485 global_avg_top_loss: 1.7193 +[titan] 2025-09-10 22:34:55,478 - root - INFO - lr: 2.0336e-06 gnorm: 0.73 [2 days, 22:58:26< 1:59:17] +[titan] 2025-09-10 22:35:08,477 - root - INFO - Dumping profiler traces at step 38912 +[titan] 2025-09-10 22:35:08,531 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-10 22:35:27,660 - root - INFO - step: 38915 loss: 2.2953 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.5608 global_avg_top_loss: 1.7345 +[titan] 2025-09-10 22:35:27,660 - root - INFO - lr: 2.0333e-06 gnorm: 0.83 [2 days, 22:58:58< 1:58:44] +[titan] 2025-09-10 22:35:59,845 - root - INFO - step: 38920 loss: 2.4197 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.6180 global_avg_top_loss: 1.8017 +[titan] 2025-09-10 22:35:59,845 - root - INFO - lr: 2.0330e-06 gnorm: 0.62 [2 days, 22:59:30< 1:58:11] +[titan] 2025-09-10 22:36:31,819 - root - INFO - step: 38925 loss: 2.4418 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.6308 global_avg_top_loss: 1.8110 +[titan] 2025-09-10 22:36:31,819 - root - INFO - lr: 2.0327e-06 gnorm: 0.72 [2 days, 23:00:02< 1:57:39] +[titan] 2025-09-10 
22:37:03,888 - root - INFO - step: 38930 loss: 2.5169 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.6620 global_avg_top_loss: 1.8549 +[titan] 2025-09-10 22:37:03,889 - root - INFO - lr: 2.0324e-06 gnorm: 0.72 [2 days, 23:00:34< 1:57:06] +[titan] 2025-09-10 22:37:35,883 - root - INFO - step: 38935 loss: 2.8410 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.8609 global_avg_top_loss: 1.9800 +[titan] 2025-09-10 22:37:35,883 - root - INFO - lr: 2.0321e-06 gnorm: 0.68 [2 days, 23:01:06< 1:56:33] +[titan] 2025-09-10 22:38:07,937 - root - INFO - step: 38940 loss: 2.5118 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.6593 global_avg_top_loss: 1.8525 +[titan] 2025-09-10 22:38:07,937 - root - INFO - lr: 2.0318e-06 gnorm: 0.70 [2 days, 23:01:38< 1:56:00] +[titan] 2025-09-10 22:38:39,958 - root - INFO - step: 38945 loss: 2.5851 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.6955 global_avg_top_loss: 1.8896 +[titan] 2025-09-10 22:38:39,958 - root - INFO - lr: 2.0315e-06 gnorm: 0.74 [2 days, 23:02:10< 1:55:27] +[titan] 2025-09-10 22:39:05,489 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:39:11,931 - root - INFO - step: 38950 loss: 2.6418 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7258 global_avg_top_loss: 1.9160 +[titan] 2025-09-10 22:39:11,931 - root - INFO - lr: 2.0312e-06 gnorm: 0.73 [2 days, 23:02:42< 1:54:54] +[titan] 2025-09-10 22:39:43,939 - root - INFO - step: 38955 loss: 2.5466 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.6744 global_avg_top_loss: 1.8721 +[titan] 2025-09-10 22:39:43,939 - root - INFO - lr: 2.0309e-06 gnorm: 0.69 [2 days, 23:03:14< 1:54:21] +[titan] 2025-09-10 22:40:15,985 - root - INFO - step: 38960 loss: 2.4165 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.6181 global_avg_top_loss: 1.7984 +[titan] 2025-09-10 22:40:15,985 - root - INFO - lr: 2.0306e-06 gnorm: 0.66 [2 days, 23:03:46< 1:53:49] +[titan] 2025-09-10 22:40:48,069 - root - INFO - step: 38965 loss: 2.4333 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.6249 global_avg_top_loss: 1.8083 +[titan] 2025-09-10 22:40:48,069 - root - INFO - lr: 2.0303e-06 gnorm: 0.75 [2 days, 23:04:18< 1:53:16] +[titan] 2025-09-10 22:41:20,131 - root - INFO - step: 38970 loss: 2.5077 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.6603 global_avg_top_loss: 1.8474 +[titan] 2025-09-10 22:41:20,131 - root - INFO - lr: 2.0300e-06 gnorm: 0.66 [2 days, 23:04:51< 1:52:43] +[titan] 2025-09-10 22:41:52,095 - root - INFO - step: 38975 loss: 2.5333 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.6738 global_avg_top_loss: 1.8595 +[titan] 2025-09-10 22:41:52,095 - root - INFO - lr: 2.0297e-06 gnorm: 0.79 [2 days, 23:05:22< 1:52:10] +[titan] 2025-09-10 22:42:23,971 - root - INFO - step: 38980 loss: 2.4990 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.6612 global_avg_top_loss: 1.8379 +[titan] 
2025-09-10 22:42:23,971 - root - INFO - lr: 2.0294e-06 gnorm: 0.77 [2 days, 23:05:54< 1:51:37] +[titan] 2025-09-10 22:42:56,155 - root - INFO - step: 38985 loss: 2.3474 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.5842 global_avg_top_loss: 1.7632 +[titan] 2025-09-10 22:42:56,156 - root - INFO - lr: 2.0292e-06 gnorm: 0.61 [2 days, 23:06:27< 1:51:04] +[titan] 2025-09-10 22:43:28,243 - root - INFO - step: 38990 loss: 2.3246 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.5727 global_avg_top_loss: 1.7519 +[titan] 2025-09-10 22:43:28,243 - root - INFO - lr: 2.0289e-06 gnorm: 0.72 [2 days, 23:06:59< 1:50:31] +[titan] 2025-09-10 22:44:00,178 - root - INFO - step: 38995 loss: 2.2641 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.5419 global_avg_top_loss: 1.7222 +[titan] 2025-09-10 22:44:00,178 - root - INFO - lr: 2.0286e-06 gnorm: 0.78 [2 days, 23:07:31< 1:49:59] +[titan] 2025-09-10 22:44:26,032 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:44:32,474 - root - INFO - step: 39000 loss: 2.9582 memory: 122.03GiB(87.57%) tps: 10,146 tflops: 483.56 mfu: 48.89% global_avg_ntp_loss: 0.9186 global_avg_top_loss: 2.0396 +[titan] 2025-09-10 22:44:32,474 - root - INFO - lr: 2.0283e-06 gnorm: 0.61 [2 days, 23:08:03< 1:49:26] +[titan] 2025-09-10 22:45:04,470 - root - INFO - step: 39005 loss: 2.3613 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.5928 global_avg_top_loss: 1.7685 +[titan] 2025-09-10 22:45:04,471 - root - INFO - lr: 2.0280e-06 gnorm: 0.67 [2 days, 23:08:35< 1:48:53] +[titan] 2025-09-10 22:45:36,583 - root - INFO - step: 39010 loss: 2.4381 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6296 global_avg_top_loss: 1.8084 +[titan] 2025-09-10 22:45:36,584 - root - INFO - lr: 2.0277e-06 gnorm: 0.67 [2 days, 23:09:07< 1:48:20] +[titan] 2025-09-10 22:46:08,927 - root - INFO - step: 39015 loss: 2.7870 memory: 122.03GiB(87.57%) tps: 10,131 tflops: 482.85 mfu: 48.82% global_avg_ntp_loss: 0.8336 global_avg_top_loss: 1.9534 +[titan] 2025-09-10 22:46:08,927 - root - INFO - lr: 2.0275e-06 gnorm: 0.63 [2 days, 23:09:39< 1:47:47] +[titan] 2025-09-10 22:46:41,109 - root - INFO - step: 39020 loss: 2.5496 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 0.6809 global_avg_top_loss: 1.8686 +[titan] 2025-09-10 22:46:41,109 - root - INFO - lr: 2.0272e-06 gnorm: 0.68 [2 days, 23:10:11< 1:47:14] +[titan] 2025-09-10 22:47:13,327 - root - INFO - step: 39025 loss: 2.5332 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.74 mfu: 49.01% global_avg_ntp_loss: 0.6768 global_avg_top_loss: 1.8564 +[titan] 2025-09-10 22:47:13,327 - root - INFO - lr: 2.0269e-06 gnorm: 0.68 [2 days, 23:10:44< 1:46:42] +[titan] 2025-09-10 22:47:45,459 - root - INFO - step: 39030 loss: 2.6017 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.04 mfu: 49.14% global_avg_ntp_loss: 0.7053 global_avg_top_loss: 1.8964 +[titan] 
2025-09-10 22:47:45,459 - root - INFO - lr: 2.0266e-06 gnorm: 0.71 [2 days, 23:11:16< 1:46:09] +[titan] 2025-09-10 22:48:17,603 - root - INFO - step: 39035 loss: 2.4397 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.6264 global_avg_top_loss: 1.8133 +[titan] 2025-09-10 22:48:17,604 - root - INFO - lr: 2.0264e-06 gnorm: 0.65 [2 days, 23:11:48< 1:45:36] +[titan] 2025-09-10 22:48:49,612 - root - INFO - step: 39040 loss: 2.4428 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.6290 global_avg_top_loss: 1.8137 +[titan] 2025-09-10 22:48:49,612 - root - INFO - lr: 2.0261e-06 gnorm: 0.63 [2 days, 23:12:20< 1:45:03] +[titan] 2025-09-10 22:49:21,667 - root - INFO - step: 39045 loss: 2.4635 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.6357 global_avg_top_loss: 1.8278 +[titan] 2025-09-10 22:49:21,667 - root - INFO - lr: 2.0258e-06 gnorm: 0.83 [2 days, 23:12:52< 1:44:30] +[titan] 2025-09-10 22:49:47,171 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:49:53,599 - root - INFO - step: 39050 loss: 2.9578 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.9147 global_avg_top_loss: 2.0432 +[titan] 2025-09-10 22:49:53,600 - root - INFO - lr: 2.0255e-06 gnorm: 0.62 [2 days, 23:13:24< 1:43:57] +[titan] 2025-09-10 22:50:25,590 - root - INFO - step: 39055 loss: 2.5187 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.6662 global_avg_top_loss: 1.8524 +[titan] 2025-09-10 22:50:25,590 - root - INFO - lr: 2.0253e-06 gnorm: 0.92 [2 days, 23:13:56< 1:43:24] +[titan] 2025-09-10 22:50:57,613 - root - INFO - step: 39060 loss: 2.5828 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.6942 global_avg_top_loss: 1.8886 +[titan] 2025-09-10 22:50:57,614 - root - INFO - lr: 2.0250e-06 gnorm: 0.76 [2 days, 23:14:28< 1:42:52] +[titan] 2025-09-10 22:51:29,532 - root - INFO - step: 39065 loss: 2.8399 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.8637 global_avg_top_loss: 1.9762 +[titan] 2025-09-10 22:51:29,533 - root - INFO - lr: 2.0247e-06 gnorm: 0.63 [2 days, 23:15:00< 1:42:19] +[titan] 2025-09-10 22:52:01,537 - root - INFO - step: 39070 loss: 2.3375 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.5816 global_avg_top_loss: 1.7559 +[titan] 2025-09-10 22:52:01,538 - root - INFO - lr: 2.0245e-06 gnorm: 0.71 [2 days, 23:15:32< 1:41:46] +[titan] 2025-09-10 22:52:33,566 - root - INFO - step: 39075 loss: 2.3561 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.5836 global_avg_top_loss: 1.7724 +[titan] 2025-09-10 22:52:33,566 - root - INFO - lr: 2.0242e-06 gnorm: 0.92 [2 days, 23:16:04< 1:41:13] +[titan] 2025-09-10 22:53:05,568 - root - INFO - step: 39080 loss: 2.4890 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.6491 global_avg_top_loss: 1.8399 +[titan] 
2025-09-10 22:53:05,568 - root - INFO - lr: 2.0240e-06 gnorm: 0.65 [2 days, 23:16:36< 1:40:40] +[titan] 2025-09-10 22:53:37,529 - root - INFO - step: 39085 loss: 2.8619 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.8769 global_avg_top_loss: 1.9850 +[titan] 2025-09-10 22:53:37,529 - root - INFO - lr: 2.0237e-06 gnorm: 0.71 [2 days, 23:17:08< 1:40:07] +[titan] 2025-09-10 22:54:09,571 - root - INFO - step: 39090 loss: 2.4790 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.6515 global_avg_top_loss: 1.8275 +[titan] 2025-09-10 22:54:09,572 - root - INFO - lr: 2.0234e-06 gnorm: 0.68 [2 days, 23:17:40< 1:39:34] +[titan] 2025-09-10 22:54:41,722 - root - INFO - step: 39095 loss: 2.6395 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9041 +[titan] 2025-09-10 22:54:41,722 - root - INFO - lr: 2.0232e-06 gnorm: 0.71 [2 days, 23:18:12< 1:39:02] +[titan] 2025-09-10 22:55:07,255 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 22:55:13,681 - root - INFO - step: 39100 loss: 2.3840 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.5997 global_avg_top_loss: 1.7843 +[titan] 2025-09-10 22:55:13,681 - root - INFO - lr: 2.0229e-06 gnorm: 0.66 [2 days, 23:18:44< 1:38:29] +[titan] 2025-09-10 22:55:45,746 - root - INFO - step: 39105 loss: 2.5590 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.6852 global_avg_top_loss: 1.8738 +[titan] 2025-09-10 22:55:45,746 - root - INFO - lr: 2.0227e-06 gnorm: 0.67 [2 days, 23:19:16< 1:37:56] +[titan] 2025-09-10 22:56:17,668 - root - INFO - step: 39110 loss: 2.6628 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7409 global_avg_top_loss: 1.9219 +[titan] 2025-09-10 22:56:17,668 - root - INFO - lr: 2.0224e-06 gnorm: 0.70 [2 days, 23:19:48< 1:37:23] +[titan] 2025-09-10 22:56:49,713 - root - INFO - step: 39115 loss: 2.4336 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.6215 global_avg_top_loss: 1.8121 +[titan] 2025-09-10 22:56:49,713 - root - INFO - lr: 2.0222e-06 gnorm: 0.70 [2 days, 23:20:20< 1:36:50] +[titan] 2025-09-10 22:57:21,482 - root - INFO - step: 39120 loss: 2.4705 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.59 mfu: 49.71% global_avg_ntp_loss: 0.6379 global_avg_top_loss: 1.8326 +[titan] 2025-09-10 22:57:21,482 - root - INFO - lr: 2.0219e-06 gnorm: 0.64 [2 days, 23:20:52< 1:36:17] +[titan] 2025-09-10 22:57:53,581 - root - INFO - step: 39125 loss: 2.5256 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.6625 global_avg_top_loss: 1.8630 +[titan] 2025-09-10 22:57:53,581 - root - INFO - lr: 2.0217e-06 gnorm: 0.77 [2 days, 23:21:24< 1:35:45] +[titan] 2025-09-10 22:58:25,438 - root - INFO - step: 39130 loss: 2.5531 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.6783 global_avg_top_loss: 1.8748 +[titan] 
2025-09-10 22:58:25,439 - root - INFO - lr: 2.0214e-06 gnorm: 0.62 [2 days, 23:21:56< 1:35:12] +[titan] 2025-09-10 22:58:57,603 - root - INFO - step: 39135 loss: 2.4889 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.54 mfu: 49.09% global_avg_ntp_loss: 0.6521 global_avg_top_loss: 1.8368 +[titan] 2025-09-10 22:58:57,603 - root - INFO - lr: 2.0212e-06 gnorm: 0.83 [2 days, 23:22:28< 1:34:39] +[titan] 2025-09-10 22:59:29,554 - root - INFO - step: 39140 loss: 2.5210 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.6666 global_avg_top_loss: 1.8543 +[titan] 2025-09-10 22:59:29,555 - root - INFO - lr: 2.0209e-06 gnorm: 0.73 [2 days, 23:23:00< 1:34:06] +[titan] 2025-09-10 23:00:01,544 - root - INFO - step: 39145 loss: 2.8463 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.8632 global_avg_top_loss: 1.9831 +[titan] 2025-09-10 23:00:01,545 - root - INFO - lr: 2.0207e-06 gnorm: 0.63 [2 days, 23:23:32< 1:33:33] +[titan] 2025-09-10 23:00:26,969 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:00:33,455 - root - INFO - step: 39150 loss: 2.3188 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.48% global_avg_ntp_loss: 0.5713 global_avg_top_loss: 1.7475 +[titan] 2025-09-10 23:00:33,455 - root - INFO - lr: 2.0205e-06 gnorm: 0.77 [2 days, 23:24:04< 1:33:00] +[titan] 2025-09-10 23:01:05,352 - root - INFO - step: 39155 loss: 2.3221 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.5693 global_avg_top_loss: 1.7528 +[titan] 2025-09-10 23:01:05,353 - root - INFO - lr: 2.0202e-06 gnorm: 0.85 [2 days, 23:24:36< 1:32:27] +[titan] 2025-09-10 23:01:37,293 - root - INFO - step: 39160 loss: 2.4549 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.6336 global_avg_top_loss: 1.8213 +[titan] 2025-09-10 23:01:37,293 - root - INFO - lr: 2.0200e-06 gnorm: 0.63 [2 days, 23:25:08< 1:31:55] +[titan] 2025-09-10 23:02:09,300 - root - INFO - step: 39165 loss: 2.3422 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.5873 global_avg_top_loss: 1.7549 +[titan] 2025-09-10 23:02:09,300 - root - INFO - lr: 2.0197e-06 gnorm: 0.68 [2 days, 23:25:40< 1:31:22] +[titan] 2025-09-10 23:02:41,370 - root - INFO - step: 39170 loss: 2.4839 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.6507 global_avg_top_loss: 1.8332 +[titan] 2025-09-10 23:02:41,370 - root - INFO - lr: 2.0195e-06 gnorm: 0.72 [2 days, 23:26:12< 1:30:49] +[titan] 2025-09-10 23:03:13,346 - root - INFO - step: 39175 loss: 2.3225 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.5743 global_avg_top_loss: 1.7481 +[titan] 2025-09-10 23:03:13,347 - root - INFO - lr: 2.0193e-06 gnorm: 0.64 [2 days, 23:26:44< 1:30:16] +[titan] 2025-09-10 23:03:45,194 - root - INFO - step: 39180 loss: 2.5432 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.6743 global_avg_top_loss: 1.8688 +[titan] 
2025-09-10 23:03:45,195 - root - INFO - lr: 2.0190e-06 gnorm: 0.68 [2 days, 23:27:16< 1:29:43] +[titan] 2025-09-10 23:04:17,237 - root - INFO - step: 39185 loss: 2.7813 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.8171 global_avg_top_loss: 1.9641 +[titan] 2025-09-10 23:04:17,237 - root - INFO - lr: 2.0188e-06 gnorm: 0.68 [2 days, 23:27:48< 1:29:10] +[titan] 2025-09-10 23:04:49,397 - root - INFO - step: 39190 loss: 2.6038 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.61 mfu: 49.10% global_avg_ntp_loss: 0.7081 global_avg_top_loss: 1.8956 +[titan] 2025-09-10 23:04:49,397 - root - INFO - lr: 2.0186e-06 gnorm: 0.74 [2 days, 23:28:20< 1:28:38] +[titan] 2025-09-10 23:05:21,560 - root - INFO - step: 39195 loss: 2.4734 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.6389 global_avg_top_loss: 1.8346 +[titan] 2025-09-10 23:05:21,560 - root - INFO - lr: 2.0183e-06 gnorm: 0.72 [2 days, 23:28:52< 1:28:05] +[titan] 2025-09-10 23:05:47,061 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:05:53,466 - root - INFO - step: 39200 loss: 2.5382 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.6773 global_avg_top_loss: 1.8609 +[titan] 2025-09-10 23:05:53,466 - root - INFO - lr: 2.0181e-06 gnorm: 0.64 [2 days, 23:29:24< 1:27:32] +[titan] 2025-09-10 23:06:25,680 - root - INFO - step: 39205 loss: 2.4249 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.80 mfu: 49.02% global_avg_ntp_loss: 0.6160 global_avg_top_loss: 1.8088 +[titan] 2025-09-10 23:06:25,680 - root - INFO - lr: 2.0179e-06 gnorm: 0.81 [2 days, 23:29:56< 1:26:59] +[titan] 2025-09-10 23:06:57,903 - root - INFO - step: 39210 loss: 2.5296 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.65 mfu: 49.00% global_avg_ntp_loss: 0.6665 global_avg_top_loss: 1.8630 +[titan] 2025-09-10 23:06:57,904 - root - INFO - lr: 2.0177e-06 gnorm: 0.65 [2 days, 23:30:28< 1:26:26] +[titan] 2025-09-10 23:07:29,868 - root - INFO - step: 39215 loss: 2.6111 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7051 global_avg_top_loss: 1.9060 +[titan] 2025-09-10 23:07:29,868 - root - INFO - lr: 2.0174e-06 gnorm: 0.94 [2 days, 23:31:00< 1:25:53] +[titan] 2025-09-10 23:08:02,022 - root - INFO - step: 39220 loss: 2.5283 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.69 mfu: 49.11% global_avg_ntp_loss: 0.6733 global_avg_top_loss: 1.8550 +[titan] 2025-09-10 23:08:02,023 - root - INFO - lr: 2.0172e-06 gnorm: 0.75 [2 days, 23:31:32< 1:25:20] +[titan] 2025-09-10 23:08:33,931 - root - INFO - step: 39225 loss: 2.8114 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.8491 global_avg_top_loss: 1.9623 +[titan] 2025-09-10 23:08:33,932 - root - INFO - lr: 2.0170e-06 gnorm: 0.62 [2 days, 23:32:04< 1:24:48] +[titan] 2025-09-10 23:09:06,046 - root - INFO - step: 39230 loss: 2.2924 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.29 mfu: 49.17% global_avg_ntp_loss: 0.5557 global_avg_top_loss: 1.7367 +[titan] 
2025-09-10 23:09:06,047 - root - INFO - lr: 2.0168e-06 gnorm: 0.73 [2 days, 23:32:36< 1:24:15] +[titan] 2025-09-10 23:09:38,299 - root - INFO - step: 39235 loss: 2.3186 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.21 mfu: 48.96% global_avg_ntp_loss: 0.5691 global_avg_top_loss: 1.7495 +[titan] 2025-09-10 23:09:38,299 - root - INFO - lr: 2.0166e-06 gnorm: 1.03 [2 days, 23:33:09< 1:23:42] +[titan] 2025-09-10 23:10:10,369 - root - INFO - step: 39240 loss: 2.4539 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.6332 global_avg_top_loss: 1.8207 +[titan] 2025-09-10 23:10:10,369 - root - INFO - lr: 2.0164e-06 gnorm: 0.65 [2 days, 23:33:41< 1:23:09] +[titan] 2025-09-10 23:10:42,488 - root - INFO - step: 39245 loss: 2.5108 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.6699 global_avg_top_loss: 1.8409 +[titan] 2025-09-10 23:10:42,488 - root - INFO - lr: 2.0161e-06 gnorm: 0.70 [2 days, 23:34:13< 1:22:36] +[titan] 2025-09-10 23:11:08,151 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:11:14,590 - root - INFO - step: 39250 loss: 2.4623 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.6413 global_avg_top_loss: 1.8210 +[titan] 2025-09-10 23:11:14,591 - root - INFO - lr: 2.0159e-06 gnorm: 0.66 [2 days, 23:34:45< 1:22:03] +[titan] 2025-09-10 23:11:46,947 - root - INFO - step: 39255 loss: 2.4431 memory: 122.03GiB(87.57%) tps: 10,127 tflops: 482.65 mfu: 48.80% global_avg_ntp_loss: 0.6277 global_avg_top_loss: 1.8154 +[titan] 2025-09-10 23:11:46,948 - root - INFO - lr: 2.0157e-06 gnorm: 0.66 [2 days, 23:35:17< 1:21:31] +[titan] 2025-09-10 23:12:18,964 - root - INFO - step: 39260 loss: 2.4805 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.79 mfu: 49.32% global_avg_ntp_loss: 0.6477 global_avg_top_loss: 1.8328 +[titan] 2025-09-10 23:12:18,964 - root - INFO - lr: 2.0155e-06 gnorm: 0.69 [2 days, 23:35:49< 1:20:58] +[titan] 2025-09-10 23:12:51,149 - root - INFO - step: 39265 loss: 2.5917 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.23 mfu: 49.06% global_avg_ntp_loss: 0.7015 global_avg_top_loss: 1.8902 +[titan] 2025-09-10 23:12:51,149 - root - INFO - lr: 2.0153e-06 gnorm: 0.67 [2 days, 23:36:22< 1:20:25] +[titan] 2025-09-10 23:13:23,061 - root - INFO - step: 39270 loss: 2.5459 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.6786 global_avg_top_loss: 1.8673 +[titan] 2025-09-10 23:13:23,061 - root - INFO - lr: 2.0151e-06 gnorm: 0.71 [2 days, 23:36:53< 1:19:52] +[titan] 2025-09-10 23:13:55,495 - root - INFO - step: 39275 loss: 2.3792 memory: 122.03GiB(87.57%) tps: 10,103 tflops: 481.51 mfu: 48.69% global_avg_ntp_loss: 0.5959 global_avg_top_loss: 1.7832 +[titan] 2025-09-10 23:13:55,495 - root - INFO - lr: 2.0149e-06 gnorm: 0.70 [2 days, 23:37:26< 1:19:19] +[titan] 2025-09-10 23:14:27,502 - root - INFO - step: 39280 loss: 2.5124 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.6642 global_avg_top_loss: 1.8481 +[titan] 
2025-09-10 23:14:27,502 - root - INFO - lr: 2.0147e-06 gnorm: 0.66 [2 days, 23:37:58< 1:18:46] +[titan] 2025-09-10 23:14:59,348 - root - INFO - step: 39285 loss: 2.4684 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.6402 global_avg_top_loss: 1.8282 +[titan] 2025-09-10 23:14:59,349 - root - INFO - lr: 2.0145e-06 gnorm: 0.94 [2 days, 23:38:30< 1:18:14] +[titan] 2025-09-10 23:15:31,327 - root - INFO - step: 39290 loss: 2.5233 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.6708 global_avg_top_loss: 1.8525 +[titan] 2025-09-10 23:15:31,327 - root - INFO - lr: 2.0143e-06 gnorm: 0.64 [2 days, 23:39:02< 1:17:41] +[titan] 2025-09-10 23:16:03,379 - root - INFO - step: 39295 loss: 2.4105 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.6184 global_avg_top_loss: 1.7921 +[titan] 2025-09-10 23:16:03,379 - root - INFO - lr: 2.0141e-06 gnorm: 0.82 [2 days, 23:39:34< 1:17:08] +[titan] 2025-09-10 23:16:28,846 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:16:35,283 - root - INFO - step: 39300 loss: 2.5427 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.6812 global_avg_top_loss: 1.8615 +[titan] 2025-09-10 23:16:35,283 - root - INFO - lr: 2.0139e-06 gnorm: 0.85 [2 days, 23:40:06< 1:16:35] +[titan] 2025-09-10 23:17:07,231 - root - INFO - step: 39305 loss: 2.6823 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9041 +[titan] 2025-09-10 23:17:07,232 - root - INFO - lr: 2.0137e-06 gnorm: 0.64 [2 days, 23:40:38< 1:16:02] +[titan] 2025-09-10 23:17:39,291 - root - INFO - step: 39310 loss: 2.3344 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.5746 global_avg_top_loss: 1.7598 +[titan] 2025-09-10 23:17:39,292 - root - INFO - lr: 2.0135e-06 gnorm: 0.67 [2 days, 23:41:10< 1:15:29] +[titan] 2025-09-10 23:18:11,513 - root - INFO - step: 39315 loss: 2.3451 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.5784 global_avg_top_loss: 1.7667 +[titan] 2025-09-10 23:18:11,513 - root - INFO - lr: 2.0133e-06 gnorm: 0.90 [2 days, 23:41:42< 1:14:57] +[titan] 2025-09-10 23:18:43,530 - root - INFO - step: 39320 loss: 2.4589 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.6361 global_avg_top_loss: 1.8228 +[titan] 2025-09-10 23:18:43,531 - root - INFO - lr: 2.0131e-06 gnorm: 0.66 [2 days, 23:42:14< 1:14:24] +[titan] 2025-09-10 23:19:15,693 - root - INFO - step: 39325 loss: 2.4063 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.6068 global_avg_top_loss: 1.7995 +[titan] 2025-09-10 23:19:15,694 - root - INFO - lr: 2.0129e-06 gnorm: 0.68 [2 days, 23:42:46< 1:13:51] +[titan] 2025-09-10 23:19:47,910 - root - INFO - step: 39330 loss: 2.5627 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.76 mfu: 49.01% global_avg_ntp_loss: 0.6922 global_avg_top_loss: 1.8704 +[titan] 
2025-09-10 23:19:47,910 - root - INFO - lr: 2.0127e-06 gnorm: 0.68 [2 days, 23:43:18< 1:13:18] +[titan] 2025-09-10 23:20:19,942 - root - INFO - step: 39335 loss: 2.3741 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.5989 global_avg_top_loss: 1.7751 +[titan] 2025-09-10 23:20:19,943 - root - INFO - lr: 2.0125e-06 gnorm: 0.66 [2 days, 23:43:50< 1:12:45] +[titan] 2025-09-10 23:20:51,925 - root - INFO - step: 39340 loss: 2.5002 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.6566 global_avg_top_loss: 1.8436 +[titan] 2025-09-10 23:20:51,925 - root - INFO - lr: 2.0123e-06 gnorm: 0.72 [2 days, 23:44:22< 1:12:12] +[titan] 2025-09-10 23:21:24,048 - root - INFO - step: 39345 loss: 2.5235 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.6694 global_avg_top_loss: 1.8541 +[titan] 2025-09-10 23:21:24,049 - root - INFO - lr: 2.0121e-06 gnorm: 0.65 [2 days, 23:44:54< 1:11:39] +[titan] 2025-09-10 23:21:49,667 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:21:56,031 - root - INFO - step: 39350 loss: 3.3527 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 1.1427 global_avg_top_loss: 2.2101 +[titan] 2025-09-10 23:21:56,031 - root - INFO - lr: 2.0120e-06 gnorm: 0.72 [2 days, 23:45:26< 1:11:07] +[titan] 2025-09-10 23:22:28,122 - root - INFO - step: 39355 loss: 2.4856 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.6459 global_avg_top_loss: 1.8397 +[titan] 2025-09-10 23:22:28,122 - root - INFO - lr: 2.0118e-06 gnorm: 0.71 [2 days, 23:45:58< 1:10:34] +[titan] 2025-09-10 23:22:59,939 - root - INFO - step: 39360 loss: 2.5232 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.6659 global_avg_top_loss: 1.8573 +[titan] 2025-09-10 23:22:59,939 - root - INFO - lr: 2.0116e-06 gnorm: 0.66 [2 days, 23:46:30< 1:10:01] +[titan] 2025-09-10 23:23:31,816 - root - INFO - step: 39365 loss: 2.4562 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.6359 global_avg_top_loss: 1.8203 +[titan] 2025-09-10 23:23:31,816 - root - INFO - lr: 2.0114e-06 gnorm: 0.77 [2 days, 23:47:02< 1:09:28] +[titan] 2025-09-10 23:24:03,780 - root - INFO - step: 39370 loss: 2.5513 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.6830 global_avg_top_loss: 1.8682 +[titan] 2025-09-10 23:24:03,781 - root - INFO - lr: 2.0112e-06 gnorm: 0.66 [2 days, 23:47:34< 1:08:55] +[titan] 2025-09-10 23:24:35,779 - root - INFO - step: 39375 loss: 2.5167 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.6643 global_avg_top_loss: 1.8524 +[titan] 2025-09-10 23:24:35,780 - root - INFO - lr: 2.0111e-06 gnorm: 0.83 [2 days, 23:48:06< 1:08:22] +[titan] 2025-09-10 23:25:07,769 - root - INFO - step: 39380 loss: 2.5705 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.6894 global_avg_top_loss: 1.8810 +[titan] 
2025-09-10 23:25:07,769 - root - INFO - lr: 2.0109e-06 gnorm: 0.79 [2 days, 23:48:38< 1:07:50] +[titan] 2025-09-10 23:25:39,671 - root - INFO - step: 39385 loss: 2.3179 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.5751 global_avg_top_loss: 1.7428 +[titan] 2025-09-10 23:25:39,672 - root - INFO - lr: 2.0107e-06 gnorm: 0.59 [2 days, 23:49:10< 1:07:17] +[titan] 2025-09-10 23:26:11,804 - root - INFO - step: 39390 loss: 2.2836 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.5513 global_avg_top_loss: 1.7323 +[titan] 2025-09-10 23:26:11,805 - root - INFO - lr: 2.0105e-06 gnorm: 0.69 [2 days, 23:49:42< 1:06:44] +[titan] 2025-09-10 23:26:44,074 - root - INFO - step: 39395 loss: 2.2288 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.96 mfu: 48.93% global_avg_ntp_loss: 0.5247 global_avg_top_loss: 1.7041 +[titan] 2025-09-10 23:26:44,074 - root - INFO - lr: 2.0104e-06 gnorm: 0.82 [2 days, 23:50:14< 1:06:11] +[titan] 2025-09-10 23:27:09,653 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:27:16,065 - root - INFO - step: 39400 loss: 2.4572 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.6368 global_avg_top_loss: 1.8204 +[titan] 2025-09-10 23:27:16,065 - root - INFO - lr: 2.0102e-06 gnorm: 0.66 [2 days, 23:50:46< 1:05:38] +[titan] 2025-09-10 23:27:47,973 - root - INFO - step: 39405 loss: 2.2995 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.5646 global_avg_top_loss: 1.7350 +[titan] 2025-09-10 23:27:47,973 - root - INFO - lr: 2.0100e-06 gnorm: 0.68 [2 days, 23:51:18< 1:05:05] +[titan] 2025-09-10 23:28:20,018 - root - INFO - step: 39410 loss: 2.4730 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.6469 global_avg_top_loss: 1.8261 +[titan] 2025-09-10 23:28:20,018 - root - INFO - lr: 2.0099e-06 gnorm: 0.72 [2 days, 23:51:50< 1:04:33] +[titan] 2025-09-10 23:28:51,925 - root - INFO - step: 39415 loss: 2.8001 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.8430 global_avg_top_loss: 1.9572 +[titan] 2025-09-10 23:28:51,926 - root - INFO - lr: 2.0097e-06 gnorm: 0.65 [2 days, 23:52:22< 1:04:00] +[titan] 2025-09-10 23:29:24,106 - root - INFO - step: 39420 loss: 2.5129 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.29 mfu: 49.07% global_avg_ntp_loss: 0.6578 global_avg_top_loss: 1.8551 +[titan] 2025-09-10 23:29:24,107 - root - INFO - lr: 2.0095e-06 gnorm: 0.71 [2 days, 23:52:54< 1:03:27] +[titan] 2025-09-10 23:29:50,011 - root - INFO - Dumping profiler traces at step 39424 +[titan] 2025-09-10 23:29:50,083 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-10 23:29:56,411 - root - INFO - step: 39425 loss: 2.5912 memory: 122.03GiB(87.57%) tps: 10,143 tflops: 483.43 mfu: 48.88% global_avg_ntp_loss: 0.6961 global_avg_top_loss: 1.8951 +[titan] 2025-09-10 23:29:56,412 - root - INFO - lr: 2.0094e-06 gnorm: 0.70 [2 days, 23:53:27< 1:02:54] +[titan] 2025-09-10 
23:30:28,413 - root - INFO - step: 39430 loss: 2.6213 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7149 global_avg_top_loss: 1.9064 +[titan] 2025-09-10 23:30:28,413 - root - INFO - lr: 2.0092e-06 gnorm: 0.73 [2 days, 23:53:59< 1:02:21] +[titan] 2025-09-10 23:31:00,387 - root - INFO - step: 39435 loss: 2.4499 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.6303 global_avg_top_loss: 1.8196 +[titan] 2025-09-10 23:31:00,387 - root - INFO - lr: 2.0090e-06 gnorm: 0.71 [2 days, 23:54:31< 1:01:48] +[titan] 2025-09-10 23:31:32,606 - root - INFO - step: 39440 loss: 2.4541 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.72 mfu: 49.01% global_avg_ntp_loss: 0.6342 global_avg_top_loss: 1.8199 +[titan] 2025-09-10 23:31:32,606 - root - INFO - lr: 2.0089e-06 gnorm: 0.64 [2 days, 23:55:03< 1:01:16] +[titan] 2025-09-10 23:32:04,559 - root - INFO - step: 39445 loss: 2.4535 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.6341 global_avg_top_loss: 1.8194 +[titan] 2025-09-10 23:32:04,559 - root - INFO - lr: 2.0087e-06 gnorm: 0.80 [2 days, 23:55:35< 1:00:43] +[titan] 2025-09-10 23:32:30,182 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:32:36,587 - root - INFO - step: 39450 loss: 2.5357 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.6696 global_avg_top_loss: 1.8661 +[titan] 2025-09-10 23:32:36,587 - root - INFO - lr: 2.0086e-06 gnorm: 0.66 [2 days, 23:56:07< 1:00:10] +[titan] 2025-09-10 23:33:08,596 - root - INFO - step: 39455 loss: 2.5364 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.6766 global_avg_top_loss: 1.8597 +[titan] 2025-09-10 23:33:08,596 - root - INFO - lr: 2.0084e-06 gnorm: 0.87 [2 days, 23:56:39< 0:59:37] +[titan] 2025-09-10 23:33:40,624 - root - INFO - step: 39460 loss: 2.9518 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.9097 global_avg_top_loss: 2.0422 +[titan] 2025-09-10 23:33:40,624 - root - INFO - lr: 2.0083e-06 gnorm: 0.75 [2 days, 23:57:11< 0:59:04] +[titan] 2025-09-10 23:34:12,560 - root - INFO - step: 39465 loss: 2.3072 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.5664 global_avg_top_loss: 1.7408 +[titan] 2025-09-10 23:34:12,560 - root - INFO - lr: 2.0081e-06 gnorm: 0.64 [2 days, 23:57:43< 0:58:31] +[titan] 2025-09-10 23:34:44,463 - root - INFO - step: 39470 loss: 2.3048 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.5636 global_avg_top_loss: 1.7412 +[titan] 2025-09-10 23:34:44,463 - root - INFO - lr: 2.0080e-06 gnorm: 0.73 [2 days, 23:58:15< 0:57:59] +[titan] 2025-09-10 23:35:16,396 - root - INFO - step: 39475 loss: 2.1847 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.5078 global_avg_top_loss: 1.6768 +[titan] 2025-09-10 23:35:16,396 - root - INFO - lr: 2.0078e-06 gnorm: 0.86 [2 days, 23:58:47< 0:57:26] +[titan] 2025-09-10 23:35:48,488 - root - INFO - step: 39480 loss: 2.4921 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.6513 global_avg_top_loss: 1.8408 +[titan] 
2025-09-10 23:35:48,488 - root - INFO - lr: 2.0077e-06 gnorm: 0.65 [2 days, 23:59:19< 0:56:53] +[titan] 2025-09-10 23:36:20,692 - root - INFO - step: 39485 loss: 2.6699 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.95 mfu: 49.03% global_avg_ntp_loss: 0.7630 global_avg_top_loss: 1.9068 +[titan] 2025-09-10 23:36:20,692 - root - INFO - lr: 2.0075e-06 gnorm: 0.74 [2 days, 23:59:51< 0:56:20] +[titan] 2025-09-10 23:36:52,888 - root - INFO - step: 39490 loss: 2.4060 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.06 mfu: 49.05% global_avg_ntp_loss: 0.6148 global_avg_top_loss: 1.7911 +[titan] 2025-09-10 23:36:52,889 - root - INFO - lr: 2.0074e-06 gnorm: 0.64 [3 days, 0:00:23< 0:55:47] +[titan] 2025-09-10 23:37:24,901 - root - INFO - step: 39495 loss: 2.8665 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.8748 global_avg_top_loss: 1.9917 +[titan] 2025-09-10 23:37:24,902 - root - INFO - lr: 2.0072e-06 gnorm: 0.65 [3 days, 0:00:55< 0:55:14] +[titan] 2025-09-10 23:37:50,428 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:37:56,801 - root - INFO - step: 39500 loss: 2.5234 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 0.6662 global_avg_top_loss: 1.8572 +[titan] 2025-09-10 23:37:56,802 - root - INFO - lr: 2.0071e-06 gnorm: 0.68 [3 days, 0:01:27< 0:54:42] +[titan] 2025-09-10 23:38:28,875 - root - INFO - step: 39505 loss: 2.6171 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7090 global_avg_top_loss: 1.9081 +[titan] 2025-09-10 23:38:28,875 - root - INFO - lr: 2.0069e-06 gnorm: 0.72 [3 days, 0:01:59< 0:54:09] +[titan] 2025-09-10 23:39:00,812 - root - INFO - step: 39510 loss: 2.5856 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.7021 global_avg_top_loss: 1.8835 +[titan] 2025-09-10 23:39:00,813 - root - INFO - lr: 2.0068e-06 gnorm: 0.73 [3 days, 0:02:31< 0:53:36] +[titan] 2025-09-10 23:39:32,764 - root - INFO - step: 39515 loss: 2.4350 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.6221 global_avg_top_loss: 1.8130 +[titan] 2025-09-10 23:39:32,764 - root - INFO - lr: 2.0067e-06 gnorm: 0.70 [3 days, 0:03:03< 0:53:03] +[titan] 2025-09-10 23:40:04,807 - root - INFO - step: 39520 loss: 2.4719 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.6384 global_avg_top_loss: 1.8335 +[titan] 2025-09-10 23:40:04,808 - root - INFO - lr: 2.0065e-06 gnorm: 0.72 [3 days, 0:03:35< 0:52:30] +[titan] 2025-09-10 23:40:36,878 - root - INFO - step: 39525 loss: 2.4810 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.6491 global_avg_top_loss: 1.8319 +[titan] 2025-09-10 23:40:36,878 - root - INFO - lr: 2.0064e-06 gnorm: 0.77 [3 days, 0:04:07< 0:51:57] +[titan] 2025-09-10 23:41:08,844 - root - INFO - step: 39530 loss: 2.5088 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.6604 global_avg_top_loss: 1.8484 +[titan] 
2025-09-10 23:41:08,844 - root - INFO - lr: 2.0063e-06 gnorm: 0.64 [3 days, 0:04:39< 0:51:25] +[titan] 2025-09-10 23:41:40,973 - root - INFO - step: 39535 loss: 2.5393 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.08 mfu: 49.15% global_avg_ntp_loss: 0.6742 global_avg_top_loss: 1.8651 +[titan] 2025-09-10 23:41:40,973 - root - INFO - lr: 2.0061e-06 gnorm: 0.83 [3 days, 0:05:11< 0:50:52] +[titan] 2025-09-10 23:42:12,980 - root - INFO - step: 39540 loss: 2.5563 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.94 mfu: 49.34% global_avg_ntp_loss: 0.6853 global_avg_top_loss: 1.8710 +[titan] 2025-09-10 23:42:12,980 - root - INFO - lr: 2.0060e-06 gnorm: 0.79 [3 days, 0:05:43< 0:50:19] +[titan] 2025-09-10 23:42:44,940 - root - INFO - step: 39545 loss: 2.3141 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.5704 global_avg_top_loss: 1.7437 +[titan] 2025-09-10 23:42:44,941 - root - INFO - lr: 2.0059e-06 gnorm: 0.62 [3 days, 0:06:15< 0:49:46] +[titan] 2025-09-10 23:43:10,619 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:43:17,027 - root - INFO - step: 39550 loss: 2.2991 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.5583 global_avg_top_loss: 1.7408 +[titan] 2025-09-10 23:43:17,027 - root - INFO - lr: 2.0057e-06 gnorm: 0.70 [3 days, 0:06:47< 0:49:13] +[titan] 2025-09-10 23:43:48,760 - root - INFO - step: 39555 loss: 2.3019 memory: 122.03GiB(87.57%) tps: 10,326 tflops: 492.15 mfu: 49.76% global_avg_ntp_loss: 0.5589 global_avg_top_loss: 1.7429 +[titan] 2025-09-10 23:43:48,760 - root - INFO - lr: 2.0056e-06 gnorm: 0.91 [3 days, 0:07:19< 0:48:40] +[titan] 2025-09-10 23:44:20,820 - root - INFO - step: 39560 loss: 2.4537 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.6316 global_avg_top_loss: 1.8221 +[titan] 2025-09-10 23:44:20,820 - root - INFO - lr: 2.0055e-06 gnorm: 0.68 [3 days, 0:07:51< 0:48:08] +[titan] 2025-09-10 23:44:52,833 - root - INFO - step: 39565 loss: 2.3758 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.5944 global_avg_top_loss: 1.7814 +[titan] 2025-09-10 23:44:52,833 - root - INFO - lr: 2.0054e-06 gnorm: 0.69 [3 days, 0:08:23< 0:47:35] +[titan] 2025-09-10 23:45:24,773 - root - INFO - step: 39570 loss: 2.4560 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.6342 global_avg_top_loss: 1.8218 +[titan] 2025-09-10 23:45:24,773 - root - INFO - lr: 2.0052e-06 gnorm: 0.76 [3 days, 0:08:55< 0:47:02] +[titan] 2025-09-10 23:45:56,744 - root - INFO - step: 39575 loss: 2.8276 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.8558 global_avg_top_loss: 1.9718 +[titan] 2025-09-10 23:45:56,744 - root - INFO - lr: 2.0051e-06 gnorm: 0.63 [3 days, 0:09:27< 0:46:29] +[titan] 2025-09-10 23:46:28,471 - root - INFO - step: 39580 loss: 2.4954 memory: 122.03GiB(87.57%) tps: 10,328 tflops: 492.24 mfu: 49.77% global_avg_ntp_loss: 0.6496 global_avg_top_loss: 1.8458 +[titan] 
2025-09-10 23:46:28,471 - root - INFO - lr: 2.0050e-06 gnorm: 0.70 [3 days, 0:09:59< 0:45:56] +[titan] 2025-09-10 23:47:00,388 - root - INFO - step: 39585 loss: 2.4959 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.48% global_avg_ntp_loss: 0.6544 global_avg_top_loss: 1.8415 +[titan] 2025-09-10 23:47:00,388 - root - INFO - lr: 2.0049e-06 gnorm: 0.67 [3 days, 0:10:31< 0:45:24] +[titan] 2025-09-10 23:47:32,249 - root - INFO - step: 39590 loss: 2.5624 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.6862 global_avg_top_loss: 1.8762 +[titan] 2025-09-10 23:47:32,249 - root - INFO - lr: 2.0048e-06 gnorm: 0.75 [3 days, 0:11:03< 0:44:51] +[titan] 2025-09-10 23:48:04,083 - root - INFO - step: 39595 loss: 2.4783 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.6413 global_avg_top_loss: 1.8370 +[titan] 2025-09-10 23:48:04,084 - root - INFO - lr: 2.0046e-06 gnorm: 0.75 [3 days, 0:11:34< 0:44:18] +[titan] 2025-09-10 23:48:29,598 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:48:35,970 - root - INFO - step: 39600 loss: 2.3963 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 0.6060 global_avg_top_loss: 1.7904 +[titan] 2025-09-10 23:48:35,970 - root - INFO - lr: 2.0045e-06 gnorm: 0.67 [3 days, 0:12:06< 0:43:45] +[titan] 2025-09-10 23:49:08,103 - root - INFO - step: 39605 loss: 2.4805 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.6447 global_avg_top_loss: 1.8358 +[titan] 2025-09-10 23:49:08,103 - root - INFO - lr: 2.0044e-06 gnorm: 0.83 [3 days, 0:12:38< 0:43:12] +[titan] 2025-09-10 23:49:40,306 - root - INFO - step: 39610 loss: 2.6495 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.96 mfu: 49.04% global_avg_ntp_loss: 0.7361 global_avg_top_loss: 1.9134 +[titan] 2025-09-10 23:49:40,306 - root - INFO - lr: 2.0043e-06 gnorm: 0.61 [3 days, 0:13:11< 0:42:39] +[titan] 2025-09-10 23:50:12,332 - root - INFO - step: 39615 loss: 2.4280 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.6232 global_avg_top_loss: 1.8048 +[titan] 2025-09-10 23:50:12,332 - root - INFO - lr: 2.0042e-06 gnorm: 0.80 [3 days, 0:13:43< 0:42:07] +[titan] 2025-09-10 23:50:44,273 - root - INFO - step: 39620 loss: 2.4863 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.6534 global_avg_top_loss: 1.8328 +[titan] 2025-09-10 23:50:44,273 - root - INFO - lr: 2.0041e-06 gnorm: 0.74 [3 days, 0:14:15< 0:41:34] +[titan] 2025-09-10 23:51:16,121 - root - INFO - step: 39625 loss: 2.3562 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.5978 global_avg_top_loss: 1.7584 +[titan] 2025-09-10 23:51:16,122 - root - INFO - lr: 2.0040e-06 gnorm: 0.63 [3 days, 0:14:46< 0:41:01] +[titan] 2025-09-10 23:51:48,159 - root - INFO - step: 39630 loss: 2.3111 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.5650 global_avg_top_loss: 1.7461 +[titan] 
2025-09-10 23:51:48,160 - root - INFO - lr: 2.0039e-06 gnorm: 0.77 [3 days, 0:15:18< 0:40:28] +[titan] 2025-09-10 23:52:20,132 - root - INFO - step: 39635 loss: 2.2557 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.46 mfu: 49.39% global_avg_ntp_loss: 0.5392 global_avg_top_loss: 1.7164 +[titan] 2025-09-10 23:52:20,132 - root - INFO - lr: 2.0038e-06 gnorm: 0.80 [3 days, 0:15:50< 0:39:55] +[titan] 2025-09-10 23:52:52,053 - root - INFO - step: 39640 loss: 2.8924 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.8863 global_avg_top_loss: 2.0061 +[titan] 2025-09-10 23:52:52,054 - root - INFO - lr: 2.0037e-06 gnorm: 0.65 [3 days, 0:16:22< 0:39:22] +[titan] 2025-09-10 23:53:23,949 - root - INFO - step: 39645 loss: 2.3108 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.5688 global_avg_top_loss: 1.7419 +[titan] 2025-09-10 23:53:23,949 - root - INFO - lr: 2.0036e-06 gnorm: 0.66 [3 days, 0:16:54< 0:38:50] +[titan] 2025-09-10 23:53:49,362 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:53:55,693 - root - INFO - step: 39650 loss: 2.5180 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.97 mfu: 49.74% global_avg_ntp_loss: 0.6629 global_avg_top_loss: 1.8550 +[titan] 2025-09-10 23:53:55,694 - root - INFO - lr: 2.0035e-06 gnorm: 0.72 [3 days, 0:17:26< 0:38:17] +[titan] 2025-09-10 23:54:27,584 - root - INFO - step: 39655 loss: 2.7779 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.8359 global_avg_top_loss: 1.9420 +[titan] 2025-09-10 23:54:27,584 - root - INFO - lr: 2.0034e-06 gnorm: 0.65 [3 days, 0:17:58< 0:37:44] +[titan] 2025-09-10 23:54:59,493 - root - INFO - step: 39660 loss: 2.4884 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.6536 global_avg_top_loss: 1.8348 +[titan] 2025-09-10 23:54:59,493 - root - INFO - lr: 2.0033e-06 gnorm: 0.66 [3 days, 0:18:30< 0:37:11] +[titan] 2025-09-10 23:55:31,556 - root - INFO - step: 39665 loss: 2.5556 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.07 mfu: 49.25% global_avg_ntp_loss: 0.6853 global_avg_top_loss: 1.8703 +[titan] 2025-09-10 23:55:31,557 - root - INFO - lr: 2.0032e-06 gnorm: 0.67 [3 days, 0:19:02< 0:36:38] +[titan] 2025-09-10 23:56:03,278 - root - INFO - step: 39670 loss: 2.5772 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.32 mfu: 49.78% global_avg_ntp_loss: 0.6955 global_avg_top_loss: 1.8817 +[titan] 2025-09-10 23:56:03,279 - root - INFO - lr: 2.0031e-06 gnorm: 0.75 [3 days, 0:19:34< 0:36:05] +[titan] 2025-09-10 23:56:35,361 - root - INFO - step: 39675 loss: 2.4575 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.6364 global_avg_top_loss: 1.8211 +[titan] 2025-09-10 23:56:35,361 - root - INFO - lr: 2.0030e-06 gnorm: 0.78 [3 days, 0:20:06< 0:35:33] +[titan] 2025-09-10 23:57:07,117 - root - INFO - step: 39680 loss: 2.4507 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.79 mfu: 49.73% global_avg_ntp_loss: 0.6333 global_avg_top_loss: 1.8175 +[titan] 
2025-09-10 23:57:07,117 - root - INFO - lr: 2.0029e-06 gnorm: 0.64 [3 days, 0:20:37< 0:35:00] +[titan] 2025-09-10 23:57:39,041 - root - INFO - step: 39685 loss: 2.4389 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.6246 global_avg_top_loss: 1.8143 +[titan] 2025-09-10 23:57:39,041 - root - INFO - lr: 2.0028e-06 gnorm: 0.82 [3 days, 0:21:09< 0:34:27] +[titan] 2025-09-10 23:58:10,931 - root - INFO - step: 39690 loss: 2.5176 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.6606 global_avg_top_loss: 1.8570 +[titan] 2025-09-10 23:58:10,931 - root - INFO - lr: 2.0027e-06 gnorm: 0.67 [3 days, 0:21:41< 0:33:54] +[titan] 2025-09-10 23:58:43,098 - root - INFO - step: 39695 loss: 2.4345 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.51 mfu: 49.09% global_avg_ntp_loss: 0.6304 global_avg_top_loss: 1.8041 +[titan] 2025-09-10 23:58:43,098 - root - INFO - lr: 2.0026e-06 gnorm: 0.79 [3 days, 0:22:13< 0:33:21] +[titan] 2025-09-10 23:59:08,593 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 23:59:15,079 - root - INFO - step: 39700 loss: 2.4697 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.37% global_avg_ntp_loss: 0.6415 global_avg_top_loss: 1.8282 +[titan] 2025-09-10 23:59:15,079 - root - INFO - lr: 2.0025e-06 gnorm: 0.82 [3 days, 0:22:45< 0:32:49] +[titan] 2025-09-10 23:59:47,055 - root - INFO - step: 39705 loss: 2.3550 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.5904 global_avg_top_loss: 1.7646 +[titan] 2025-09-10 23:59:47,056 - root - INFO - lr: 2.0025e-06 gnorm: 0.64 [3 days, 0:23:17< 0:32:16] +[titan] 2025-09-11 00:00:18,917 - root - INFO - step: 39710 loss: 2.3164 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 0.5672 global_avg_top_loss: 1.7492 +[titan] 2025-09-11 00:00:18,918 - root - INFO - lr: 2.0024e-06 gnorm: 0.78 [3 days, 0:23:49< 0:31:43] +[titan] 2025-09-11 00:00:50,976 - root - INFO - step: 39715 loss: 2.3203 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.5686 global_avg_top_loss: 1.7516 +[titan] 2025-09-11 00:00:50,976 - root - INFO - lr: 2.0023e-06 gnorm: 0.90 [3 days, 0:24:21< 0:31:10] +[titan] 2025-09-11 00:01:22,907 - root - INFO - step: 39720 loss: 2.3862 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.6157 global_avg_top_loss: 1.7705 +[titan] 2025-09-11 00:01:22,908 - root - INFO - lr: 2.0022e-06 gnorm: 0.66 [3 days, 0:24:53< 0:30:37] +[titan] 2025-09-11 00:01:54,847 - root - INFO - step: 39725 loss: 2.7239 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7983 global_avg_top_loss: 1.9257 +[titan] 2025-09-11 00:01:54,847 - root - INFO - lr: 2.0021e-06 gnorm: 0.77 [3 days, 0:25:25< 0:30:04] +[titan] 2025-09-11 00:02:26,813 - root - INFO - step: 39730 loss: 2.8870 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.8718 global_avg_top_loss: 2.0152 +[titan] 
2025-09-11 00:02:26,813 - root - INFO - lr: 2.0021e-06 gnorm: 0.64 [3 days, 0:25:57< 0:29:32] +[titan] 2025-09-11 00:02:59,086 - root - INFO - step: 39735 loss: 2.8376 memory: 122.03GiB(87.57%) tps: 10,154 tflops: 483.91 mfu: 48.93% global_avg_ntp_loss: 0.8595 global_avg_top_loss: 1.9781 +[titan] 2025-09-11 00:02:59,086 - root - INFO - lr: 2.0020e-06 gnorm: 0.65 [3 days, 0:26:29< 0:28:59] +[titan] 2025-09-11 00:03:31,014 - root - INFO - step: 39740 loss: 2.5535 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.6798 global_avg_top_loss: 1.8737 +[titan] 2025-09-11 00:03:31,014 - root - INFO - lr: 2.0019e-06 gnorm: 0.75 [3 days, 0:27:01< 0:28:26] +[titan] 2025-09-11 00:04:02,838 - root - INFO - step: 39745 loss: 2.5278 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.73 mfu: 49.62% global_avg_ntp_loss: 0.6700 global_avg_top_loss: 1.8578 +[titan] 2025-09-11 00:04:02,838 - root - INFO - lr: 2.0018e-06 gnorm: 0.66 [3 days, 0:27:33< 0:27:53] +[titan] 2025-09-11 00:04:28,529 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-11 00:04:34,960 - root - INFO - step: 39750 loss: 2.5506 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 0.6869 global_avg_top_loss: 1.8637 +[titan] 2025-09-11 00:04:34,961 - root - INFO - lr: 2.0018e-06 gnorm: 0.71 [3 days, 0:28:05< 0:27:20] +[titan] 2025-09-11 00:05:06,805 - root - INFO - step: 39755 loss: 2.4417 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.6281 global_avg_top_loss: 1.8136 +[titan] 2025-09-11 00:05:06,805 - root - INFO - lr: 2.0017e-06 gnorm: 0.69 [3 days, 0:28:37< 0:26:47] +[titan] 2025-09-11 00:05:38,700 - root - INFO - step: 39760 loss: 2.4874 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.6476 global_avg_top_loss: 1.8398 +[titan] 2025-09-11 00:05:38,700 - root - INFO - lr: 2.0016e-06 gnorm: 0.63 [3 days, 0:29:09< 0:26:15] +[titan] 2025-09-11 00:06:10,617 - root - INFO - step: 39765 loss: 2.4924 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.47% global_avg_ntp_loss: 0.6479 global_avg_top_loss: 1.8445 +[titan] 2025-09-11 00:06:10,617 - root - INFO - lr: 2.0016e-06 gnorm: 0.93 [3 days, 0:29:41< 0:25:42] +[titan] 2025-09-11 00:06:42,293 - root - INFO - step: 39770 loss: 2.5484 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.03 mfu: 49.85% global_avg_ntp_loss: 0.6773 global_avg_top_loss: 1.8711 +[titan] 2025-09-11 00:06:42,293 - root - INFO - lr: 2.0015e-06 gnorm: 0.66 [3 days, 0:30:13< 0:25:09] +[titan] 2025-09-11 00:07:14,378 - root - INFO - step: 39775 loss: 2.5066 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.6606 global_avg_top_loss: 1.8460 +[titan] 2025-09-11 00:07:14,378 - root - INFO - lr: 2.0014e-06 gnorm: 0.83 [3 days, 0:30:45< 0:24:36] +[titan] 2025-09-11 00:07:46,252 - root - INFO - step: 39780 loss: 2.5222 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.6697 global_avg_top_loss: 1.8525 +[titan] 
2025-09-11 00:07:46,252 - root - INFO - lr: 2.0014e-06 gnorm: 0.81 [3 days, 0:31:17< 0:24:03] +[titan] 2025-09-11 00:08:18,352 - root - INFO - step: 39785 loss: 2.3427 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.5911 global_avg_top_loss: 1.7516 +[titan] 2025-09-11 00:08:18,353 - root - INFO - lr: 2.0013e-06 gnorm: 0.61 [3 days, 0:31:49< 0:23:31] +[titan] 2025-09-11 00:08:50,063 - root - INFO - step: 39790 loss: 2.3389 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.50 mfu: 49.80% global_avg_ntp_loss: 0.5752 global_avg_top_loss: 1.7637 +[titan] 2025-09-11 00:08:50,063 - root - INFO - lr: 2.0012e-06 gnorm: 0.70 [3 days, 0:32:20< 0:22:58] +[titan] 2025-09-11 00:09:21,864 - root - INFO - step: 39795 loss: 2.3220 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.08 mfu: 49.65% global_avg_ntp_loss: 0.5702 global_avg_top_loss: 1.7518 +[titan] 2025-09-11 00:09:21,865 - root - INFO - lr: 2.0012e-06 gnorm: 0.90 [3 days, 0:32:52< 0:22:25] +[titan] 2025-09-11 00:09:47,205 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-11 00:09:53,581 - root - INFO - step: 39800 loss: 2.4905 memory: 122.03GiB(87.57%) tps: 10,332 tflops: 492.40 mfu: 49.79% global_avg_ntp_loss: 0.6500 global_avg_top_loss: 1.8405 +[titan] 2025-09-11 00:09:53,581 - root - INFO - lr: 2.0011e-06 gnorm: 0.65 [3 days, 0:33:24< 0:21:52] +[titan] 2025-09-11 00:10:25,599 - root - INFO - step: 39805 loss: 2.3163 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.5654 global_avg_top_loss: 1.7509 +[titan] 2025-09-11 00:10:25,600 - root - INFO - lr: 2.0011e-06 gnorm: 0.66 [3 days, 0:33:56< 0:21:19] +[titan] 2025-09-11 00:10:57,663 - root - INFO - step: 39810 loss: 2.4578 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.6287 global_avg_top_loss: 1.8291 +[titan] 2025-09-11 00:10:57,663 - root - INFO - lr: 2.0010e-06 gnorm: 0.75 [3 days, 0:34:28< 0:20:46] +[titan] 2025-09-11 00:11:29,541 - root - INFO - step: 39815 loss: 2.3854 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.5962 global_avg_top_loss: 1.7893 +[titan] 2025-09-11 00:11:29,542 - root - INFO - lr: 2.0010e-06 gnorm: 0.68 [3 days, 0:35:00< 0:20:14] +[titan] 2025-09-11 00:12:01,654 - root - INFO - step: 39820 loss: 2.5651 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.6863 global_avg_top_loss: 1.8787 +[titan] 2025-09-11 00:12:01,655 - root - INFO - lr: 2.0009e-06 gnorm: 0.76 [3 days, 0:35:32< 0:19:41] +[titan] 2025-09-11 00:12:33,431 - root - INFO - step: 39825 loss: 2.5332 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.47 mfu: 49.69% global_avg_ntp_loss: 0.6744 global_avg_top_loss: 1.8588 +[titan] 2025-09-11 00:12:33,431 - root - INFO - lr: 2.0009e-06 gnorm: 0.73 [3 days, 0:36:04< 0:19:08] +[titan] 2025-09-11 00:13:05,555 - root - INFO - step: 39830 loss: 2.6482 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.7304 global_avg_top_loss: 1.9178 +[titan] 
2025-09-11 00:13:05,556 - root - INFO - lr: 2.0008e-06 gnorm: 0.79 [3 days, 0:36:36< 0:18:35] +[titan] 2025-09-11 00:13:37,491 - root - INFO - step: 39835 loss: 2.5195 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.6744 global_avg_top_loss: 1.8451 +[titan] 2025-09-11 00:13:37,491 - root - INFO - lr: 2.0008e-06 gnorm: 0.76 [3 days, 0:37:08< 0:18:02] +[titan] 2025-09-11 00:14:09,480 - root - INFO - step: 39840 loss: 2.5199 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.6609 global_avg_top_loss: 1.8590 +[titan] 2025-09-11 00:14:09,481 - root - INFO - lr: 2.0007e-06 gnorm: 0.74 [3 days, 0:37:40< 0:17:30] +[titan] 2025-09-11 00:14:41,685 - root - INFO - step: 39845 loss: 2.4487 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.6285 global_avg_top_loss: 1.8202 +[titan] 2025-09-11 00:14:41,685 - root - INFO - lr: 2.0007e-06 gnorm: 1.01 [3 days, 0:38:12< 0:16:57] +[titan] 2025-09-11 00:15:07,047 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-11 00:15:13,473 - root - INFO - step: 39850 loss: 2.5863 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.28 mfu: 49.67% global_avg_ntp_loss: 0.6932 global_avg_top_loss: 1.8931 +[titan] 2025-09-11 00:15:13,474 - root - INFO - lr: 2.0006e-06 gnorm: 0.72 [3 days, 0:38:44< 0:16:24] +[titan] 2025-09-11 00:15:45,621 - root - INFO - step: 39855 loss: 2.5366 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.6708 global_avg_top_loss: 1.8658 +[titan] 2025-09-11 00:15:45,621 - root - INFO - lr: 2.0006e-06 gnorm: 0.78 [3 days, 0:39:16< 0:15:51] +[titan] 2025-09-11 00:16:17,595 - root - INFO - step: 39860 loss: 2.5936 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7026 global_avg_top_loss: 1.8910 +[titan] 2025-09-11 00:16:17,595 - root - INFO - lr: 2.0006e-06 gnorm: 0.83 [3 days, 0:39:48< 0:15:18] +[titan] 2025-09-11 00:16:49,353 - root - INFO - step: 39865 loss: 2.2647 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.75 mfu: 49.72% global_avg_ntp_loss: 0.5491 global_avg_top_loss: 1.7156 +[titan] 2025-09-11 00:16:49,353 - root - INFO - lr: 2.0005e-06 gnorm: 0.62 [3 days, 0:40:20< 0:14:45] +[titan] 2025-09-11 00:17:21,220 - root - INFO - step: 39870 loss: 2.3198 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.5710 global_avg_top_loss: 1.7488 +[titan] 2025-09-11 00:17:21,220 - root - INFO - lr: 2.0005e-06 gnorm: 0.72 [3 days, 0:40:52< 0:14:13] +[titan] 2025-09-11 00:17:53,447 - root - INFO - step: 39875 loss: 2.3045 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 0.5612 global_avg_top_loss: 1.7434 +[titan] 2025-09-11 00:17:53,447 - root - INFO - lr: 2.0004e-06 gnorm: 0.93 [3 days, 0:41:24< 0:13:40] +[titan] 2025-09-11 00:18:25,334 - root - INFO - step: 39880 loss: 2.5450 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6748 global_avg_top_loss: 1.8702 +[titan] 
2025-09-11 00:18:25,334 - root - INFO - lr: 2.0004e-06 gnorm: 0.64 [3 days, 0:41:56< 0:13:07] +[titan] 2025-09-11 00:18:57,305 - root - INFO - step: 39885 loss: 2.3818 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.6006 global_avg_top_loss: 1.7812 +[titan] 2025-09-11 00:18:57,305 - root - INFO - lr: 2.0004e-06 gnorm: 0.74 [3 days, 0:42:28< 0:12:34] +[titan] 2025-09-11 00:19:29,234 - root - INFO - step: 39890 loss: 2.5189 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.6658 global_avg_top_loss: 1.8530 +[titan] 2025-09-11 00:19:29,235 - root - INFO - lr: 2.0003e-06 gnorm: 0.68 [3 days, 0:43:00< 0:12:01] +[titan] 2025-09-11 00:20:01,199 - root - INFO - step: 39895 loss: 2.2865 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.5628 global_avg_top_loss: 1.7237 +[titan] 2025-09-11 00:20:01,199 - root - INFO - lr: 2.0003e-06 gnorm: 0.76 [3 days, 0:43:31< 0:11:29] +[titan] 2025-09-11 00:20:26,907 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-11 00:20:33,309 - root - INFO - step: 39900 loss: 2.4456 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.6335 global_avg_top_loss: 1.8121 +[titan] 2025-09-11 00:20:33,309 - root - INFO - lr: 2.0003e-06 gnorm: 0.71 [3 days, 0:44:04< 0:10:56] +[titan] 2025-09-11 00:21:05,337 - root - INFO - step: 39905 loss: 2.5072 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.6599 global_avg_top_loss: 1.8472 +[titan] 2025-09-11 00:21:05,337 - root - INFO - lr: 2.0003e-06 gnorm: 0.66 [3 days, 0:44:36< 0:10:23] +[titan] 2025-09-11 00:21:37,242 - root - INFO - step: 39910 loss: 3.0107 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.9140 global_avg_top_loss: 2.0967 +[titan] 2025-09-11 00:21:37,242 - root - INFO - lr: 2.0002e-06 gnorm: 0.71 [3 days, 0:45:08< 0:09:50] +[titan] 2025-09-11 00:22:09,125 - root - INFO - step: 39915 loss: 2.4231 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.6154 global_avg_top_loss: 1.8077 +[titan] 2025-09-11 00:22:09,126 - root - INFO - lr: 2.0002e-06 gnorm: 0.90 [3 days, 0:45:39< 0:09:17] +[titan] 2025-09-11 00:22:41,119 - root - INFO - step: 39920 loss: 2.4574 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.6355 global_avg_top_loss: 1.8220 +[titan] 2025-09-11 00:22:41,120 - root - INFO - lr: 2.0002e-06 gnorm: 0.69 [3 days, 0:46:11< 0:08:44] +[titan] 2025-09-11 00:23:13,169 - root - INFO - step: 39925 loss: 2.4378 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.6233 global_avg_top_loss: 1.8145 +[titan] 2025-09-11 00:23:13,170 - root - INFO - lr: 2.0002e-06 gnorm: 0.96 [3 days, 0:46:43< 0:08:12] +[titan] 2025-09-11 00:23:45,063 - root - INFO - step: 39930 loss: 2.7418 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9499 +[titan] 
2025-09-11 00:23:45,063 - root - INFO - lr: 2.0001e-06 gnorm: 0.61 [3 days, 0:47:15< 0:07:39] +[titan] 2025-09-11 00:24:17,178 - root - INFO - step: 39935 loss: 2.4925 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.29 mfu: 49.17% global_avg_ntp_loss: 0.6531 global_avg_top_loss: 1.8394 +[titan] 2025-09-11 00:24:17,178 - root - INFO - lr: 2.0001e-06 gnorm: 0.81 [3 days, 0:47:47< 0:07:06] +[titan] 2025-09-11 00:24:23,880 - root - INFO - Dumping profiler traces at step 39936 +[titan] 2025-09-11 00:24:23,939 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-11 00:24:49,297 - root - INFO - step: 39940 loss: 2.6852 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.7620 global_avg_top_loss: 1.9232 +[titan] 2025-09-11 00:24:49,298 - root - INFO - lr: 2.0001e-06 gnorm: 0.79 [3 days, 0:48:20< 0:06:33] +[titan] 2025-09-11 00:25:21,152 - root - INFO - step: 39945 loss: 2.3718 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.26 mfu: 49.57% global_avg_ntp_loss: 0.5971 global_avg_top_loss: 1.7747 +[titan] 2025-09-11 00:25:21,153 - root - INFO - lr: 2.0001e-06 gnorm: 0.63 [3 days, 0:48:51< 0:06:00] +[titan] 2025-09-11 00:25:46,982 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-11 00:25:53,332 - root - INFO - step: 39950 loss: 2.3447 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.31 mfu: 49.07% global_avg_ntp_loss: 0.5776 global_avg_top_loss: 1.7671 +[titan] 2025-09-11 00:25:53,332 - root - INFO - lr: 2.0001e-06 gnorm: 0.75 [3 days, 0:49:24< 0:05:28] +[titan] 2025-09-11 00:26:25,019 - root - INFO - step: 39955 loss: 2.2982 memory: 122.03GiB(87.57%) tps: 10,341 tflops: 492.86 mfu: 49.83% global_avg_ntp_loss: 0.5567 global_avg_top_loss: 1.7415 +[titan] 2025-09-11 00:26:25,020 - root - INFO - lr: 2.0001e-06 gnorm: 0.91 [3 days, 0:49:55< 0:04:55] +[titan] 2025-09-11 00:26:57,255 - root - INFO - step: 39960 loss: 2.4205 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.47 mfu: 48.99% global_avg_ntp_loss: 0.6210 global_avg_top_loss: 1.7995 +[titan] 2025-09-11 00:26:57,255 - root - INFO - lr: 2.0000e-06 gnorm: 0.68 [3 days, 0:50:28< 0:04:22] +[titan] 2025-09-11 00:27:29,409 - root - INFO - step: 39965 loss: 2.5237 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.6806 global_avg_top_loss: 1.8431 +[titan] 2025-09-11 00:27:29,409 - root - INFO - lr: 2.0000e-06 gnorm: 0.69 [3 days, 0:51:00< 0:03:49] +[titan] 2025-09-11 00:28:01,446 - root - INFO - step: 39970 loss: 2.4801 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.6472 global_avg_top_loss: 1.8328 +[titan] 2025-09-11 00:28:01,446 - root - INFO - lr: 2.0000e-06 gnorm: 0.68 [3 days, 0:51:32< 0:03:16] +[titan] 2025-09-11 00:28:33,304 - root - INFO - step: 39975 loss: 2.3536 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.5840 global_avg_top_loss: 1.7696 +[titan] 2025-09-11 00:28:33,304 - root - INFO - lr: 2.0000e-06 gnorm: 0.65 [3 days, 0:52:04< 0:02:44] +[titan] 2025-09-11 00:29:05,373 - root - INFO - step: 39980 loss: 2.4695 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.6390 global_avg_top_loss: 1.8305 +[titan] 
2025-09-11 00:29:05,373 - root - INFO - lr: 2.0000e-06 gnorm: 0.67 [3 days, 0:52:36< 0:02:11] +[titan] 2025-09-11 00:29:37,545 - root - INFO - step: 39985 loss: 2.5171 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.42 mfu: 49.08% global_avg_ntp_loss: 0.6638 global_avg_top_loss: 1.8533 +[titan] 2025-09-11 00:29:37,546 - root - INFO - lr: 2.0000e-06 gnorm: 0.69 [3 days, 0:53:08< 0:01:38] +[titan] 2025-09-11 00:30:09,432 - root - INFO - step: 39990 loss: 2.5692 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.6914 global_avg_top_loss: 1.8778 +[titan] 2025-09-11 00:30:09,432 - root - INFO - lr: 2.0000e-06 gnorm: 0.70 [3 days, 0:53:40< 0:01:05] +[titan] 2025-09-11 00:30:41,436 - root - INFO - step: 39995 loss: 2.4510 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.6295 global_avg_top_loss: 1.8214 +[titan] 2025-09-11 00:30:41,436 - root - INFO - lr: 2.0000e-06 gnorm: 0.68 [3 days, 0:54:12< 0:00:32] +[titan] 2025-09-11 00:31:07,036 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-11 00:31:13,551 - root - INFO - step: 40000 loss: 2.4026 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.28 mfu: 49.17% global_avg_ntp_loss: 0.6054 global_avg_top_loss: 1.7972 +[titan] 2025-09-11 00:31:13,551 - root - INFO - lr: 2.0000e-06 gnorm: 0.65 [3 days, 0:54:44< 0:00:00] +[titan] 2025-09-11 00:31:13,551 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-11 00:31:13,552 - root - INFO - Saving a full checkpoint at last step, step 40000. +[titan] 2025-09-11 00:31:46,236 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-09-11 00:31:46,236 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 32.68 seconds. 
+[titan] 2025-09-11 00:31:46,237 - root - INFO - Training completed diff --git a/logs/none_lyv0rec_/attempt_0/0/stdout.log b/logs/none_lyv0rec_/attempt_0/0/stdout.log new file mode 100644 index 0000000000000000000000000000000000000000..f36fee6a6167c64a7307d65e289aa4cf4403e3a5 --- /dev/null +++ b/logs/none_lyv0rec_/attempt_0/0/stdout.log @@ -0,0 +1,33 @@ + 2025-09-10T00:25:50.402942Z  WARN Status Code: 502. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:25:50.448322Z  WARN Status Code: 502. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:26:01.892901Z  WARN Status Code: 504. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:26:01.894451Z  WARN Status Code: 504. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:26:46.358405Z  WARN Status Code: 504. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:26:50.304225Z  WARN Status Code: 502. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:27:00.830860Z  WARN Status Code: 504. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:28:33.662622Z  WARN Status Code: 502. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:37:21.678500Z  WARN Status Code: 502. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:37:33.396089Z  WARN Status Code: 504. Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + + 2025-09-10T00:38:21.672469Z  WARN Status Code: 502. 
Retrying..., request_id: "" + at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220 + diff --git a/logs/none_lyv0rec_/attempt_0/7/stderr.log b/logs/none_lyv0rec_/attempt_0/7/stderr.log new file mode 100644 index 0000000000000000000000000000000000000000..b3d44111a0f87ccaf6050cb2781ddc44077c5237 --- /dev/null +++ b/logs/none_lyv0rec_/attempt_0/7/stderr.log @@ -0,0 +1,4819 @@ +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k +2025-09-09 06:17:55.068302: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +2025-09-09 06:17:55.135417: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. +2025-09-09 06:17:56.516572: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +wandb: Appending key for api.wandb.ai to your netrc file: /home/cvm/.netrc +wandb: Currently logged in as: zaydzuhri to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured. 
+[titan] 2025-09-09 06:17:58,256 - root - INFO - Starting job: default job +[titan] 2025-09-09 06:17:58,256 - root - INFO - { + "activation_checkpoint": { + "mode": "none", + "selective_ac_option": "2" + }, + "activation_offload": { + "mode": "none" + }, + "checkpoint": { + "async_mode": "disabled", + "convert_to_hf_on_save": false, + "create_seed_checkpoint": false, + "enable_checkpoint": true, + "exclude_from_loading": [], + "export_dtype": "float32", + "folder": "checkpoint", + "hf_repo_base_name": "zaydzuhri/top-code-7B-4096-batch8x2-steps40000", + "hf_upload_enabled": true, + "hf_upload_format": "dcp", + "interval": 5000, + "interval_type": "steps", + "keep_latest_k": 0, + "load_step": -1, + "model_weights_only": false + }, + "comm": { + "init_timeout_seconds": 3000, + "trace_buf_size": 20000, + "train_timeout_seconds": 3000 + }, + "experimental": { + "context_parallel_degree": 1, + "context_parallel_rotate_method": "allgather", + "custom_model_path": "", + "enable_async_tensor_parallel": false, + "enable_compiled_autograd": false, + "pipeline_parallel_degree": 1, + "pipeline_parallel_microbatches": null, + "pipeline_parallel_schedule": "1F1B", + "pipeline_parallel_schedule_csv": "", + "pipeline_parallel_split_points": [] + }, + "fault_tolerance": { + "enable": false, + "group_size": 0, + "min_replica_size": 1, + "replica_id": 0 + }, + "float8": { + "enable_fsdp_float8_all_gather": false, + "force_recompute_fp8_weight_in_bwd": false, + "precompute_float8_dynamic_scale_for_fsdp": false, + "recipe_name": null + }, + "job": { + "config_file": "flame/models/fla.toml", + "description": "default job", + "dump_folder": "exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine", + "print_args": true, + "use_for_integration_test": false + }, + "lr_scheduler": { + "decay_ratio": null, + "decay_type": "cosine", + "lr_min": 0.1, + "warmup_steps": 400 + }, + "memory_estimation": { + "disable_fake_mode": false, + "enabled": false + }, + 
"metrics": { + "disable_color_printing": false, + "enable_tensorboard": false, + "enable_wandb": true, + "log_freq": 5, + "save_for_all_ranks": false, + "save_tb_folder": "tb" + }, + "model": { + "config": "configs/top_transformer_7B.json", + "converters": [], + "name": "fla", + "print_after_conversion": false, + "tokenizer_path": "fla-hub/transformer-1.3B-100B" + }, + "optimizer": { + "early_step_in_backward": false, + "eps": 1e-15, + "implementation": "fused", + "lr": 2e-05, + "name": "AdamW" + }, + "profiling": { + "enable_memory_snapshot": false, + "enable_profiling": true, + "profile_freq": 512, + "save_memory_snapshot_folder": "memory_snapshot", + "save_traces_folder": "profile_trace" + }, + "training": { + "batch_size": 8, + "compile": true, + "context_len": 4096, + "data_dir": null, + "data_files": null, + "data_parallel_replicate_degree": 1, + "data_parallel_shard_degree": -1, + "data_probs": null, + "dataset": "/home/cvm/.cache/zaydzuhri___stack-edu-python/default", + "dataset_name": "default", + "dataset_split": "train", + "deterministic": false, + "disable_loss_parallel": false, + "enable_cpu_offload": false, + "fsdp_reshard_after_forward": "default", + "gc_freq": 50, + "gradient_accumulation_steps": 2, + "max_norm": 1.0, + "mixed_precision_param": "bfloat16", + "mixed_precision_reduce": "float32", + "num_workers": 32, + "persistent_workers": false, + "pin_memory": false, + "prefetch_factor": 2, + "seed": 79, + "seq_len": 4096, + "skip_nan_inf": true, + "steps": 40000, + "streaming": false, + "tensor_parallel_degree": 1, + "varlen": false + } +} +[titan] 2025-09-09 06:17:58,256 - root - INFO - [GC] Initial GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:18:24,313 - root - INFO - Target Hugging Face repository for this run: zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061824 +[titan] 2025-09-09 06:18:24,314 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[titan] 2025-09-09 06:18:24,316 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory +[titan] 2025-09-09 06:18:24,345 - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[titan] 2025-09-09 06:18:24,345 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8] +[titan] 2025-09-09 06:18:25,840 - root - INFO - Loading tokenizer... +[titan] 2025-09-09 06:18:26,029 - root - INFO - LlamaTokenizerFast(name_or_path='fla-hub/transformer-1.3B-100B', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': ''}, clean_up_tokenization_spaces=False, added_tokens_decoder={ + 0: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 1: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 2: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} +) +[titan] 2025-09-09 06:18:26,029 - root - INFO - Loading dataset /home/cvm/.cache/zaydzuhri___stack-edu-python/default:default +[titan] 2025-09-09 06:18:27,922 - root - INFO - Dataset({ + features: ['blob_id', 'language', 'repo_name', 'path', 'src_encoding', 'length_bytes', 'score', 'int_score', 'detected_licenses', 'license_type', 'text', 'download_success'], + num_rows: 25286012 +}) +[titan] 2025-09-09 06:18:27,922 - root - INFO - Shuffling the dataset with seed 79 +[titan] 2025-09-09 06:18:52,245 - root - INFO - Loading model config from configs/top_transformer_7B.json +[titan] 2025-09-09 06:18:52,248 - root - INFO - Building dataloader... 
+[titan] 2025-09-09 06:18:52,250 - root - INFO - Building model from the config +TOPTransformerConfig { + "attention_bias": false, + "bos_token_id": 1, + "elementwise_affine": true, + "eos_token_id": 2, + "fuse_cross_entropy": true, + "fuse_norm": true, + "fuse_swiglu": true, + "hidden_act": "swish", + "hidden_ratio": 4, + "hidden_size": 4096, + "initializer_range": 0.006, + "intermediate_size": 14336, + "max_position_embeddings": 2048, + "model_type": "top_transformer", + "norm_eps": 1e-06, + "num_heads": 32, + "num_hidden_layers": 30, + "num_kv_heads": 8, + "qk_norm": false, + "qkv_bias": false, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "top_window_size": 4096, + "transformers_version": "4.51.3", + "use_cache": true, + "use_top_loss": true, + "vocab_size": 32000, + "window_size": null +} + +[titan] 2025-09-09 06:18:52,402 - root - INFO -  +TOPTransformerForCausalLM( + (model): TOPTransformerModel( + (embeddings): Embedding(32000, 4096) + (layers): ModuleList( + (0-29): 30 x TOPTransformerBlock( + (attn_norm): RMSNorm(4096, eps=1e-06) + (attn): Attention( + (q_proj): Linear(in_features=4096, out_features=4096, bias=False) + (k_proj): Linear(in_features=4096, out_features=1024, bias=False) + (v_proj): Linear(in_features=4096, out_features=1024, bias=False) + (o_proj): Linear(in_features=4096, out_features=4096, bias=False) + (rotary): RotaryEmbedding(dim=128, base=10000.0, interleaved=False, pos_idx_in_fp32=True) + ) + (mlp_norm): RMSNorm(4096, eps=1e-06) + (mlp): GatedMLP( + (gate_proj): Linear(in_features=4096, out_features=14336, bias=False) + (up_proj): Linear(in_features=4096, out_features=14336, bias=False) + (down_proj): Linear(in_features=14336, out_features=4096, bias=False) + (swiglu_linear): SwiGLULinear() + ) + ) + ) + (norm): RMSNorm(4096, eps=1e-06) + ) + (lm_head): Linear(in_features=4096, out_features=32000, bias=False) + (top_head): Linear(in_features=4096, out_features=32000, bias=False) + (top_criterion): FusedLinearListNetLoss() 
+ (criterion): FusedLinearCrossEntropyLoss() +) + +[titan] 2025-09-09 06:18:52,437 - root - INFO - Compiling each block with torch.compile +[titan] 2025-09-09 06:18:52,437 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile +[titan] 2025-09-09 06:18:52,438 - root - INFO - Compiling the entire model with torch.compile +[titan] 2025-09-09 06:18:52,526 - root - INFO - Applied FSDP to the model +[titan] 2025-09-09 06:18:53,394 - root - INFO - CUDA memory usage for model: 3.24GiB(2.33%) +[titan] 2025-09-09 06:18:53,417 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint +[titan] 2025-09-09 06:18:53,418 - root - INFO - Loading the checkpoint at step 20000. +[titan] 2025-09-09 06:19:19,891 - root - INFO - [GC] GC collection for checkpoint loading. 1.14 seconds. +[titan] 2025-09-09 06:19:19,891 - root - INFO - Finished loading the checkpoint in 26.47 seconds. +[titan] 2025-09-09 06:19:20,033 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory +[titan] 2025-09-09 06:19:24,921 - root - INFO - ***** Running training ***** +[titan] 2025-09-09 06:19:24,927 - root - INFO -  Training starts at step 20001 +[titan] 2025-09-09 06:19:24,931 - root - INFO -  Number of tokens per sequence = 4,096 +[titan] 2025-09-09 06:19:24,944 - root - INFO -  Gradient Accumulation steps = 2 +[titan] 2025-09-09 06:19:24,944 - root - INFO -  Instantaneous batch size (per device) = 8 +[titan] 2025-09-09 06:19:24,945 - root - INFO -  Global batch size (w. 
parallel, distributed & accumulation) = 128 (524,288 tokens) +[titan] 2025-09-09 06:19:24,945 - root - INFO -  Total optimization steps = 40,000 (20,971,520,000 tokens) +[titan] 2025-09-09 06:19:24,945 - root - INFO -  Warmup steps = 400 (209,715,200 tokens) +[titan] 2025-09-09 06:19:24,945 - root - INFO -  Number of parameters = 6,936,580,096  +[titan] 2025-09-09 06:19:24,946 - root - INFO - Profiling active. Traces will be saved at exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/profile_trace +[titan] 2025-09-09 06:20:20,234 - root - INFO - step: 20005 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 5,449 tflops: 259.69 mfu: 26.26% global_avg_ntp_loss: 0.7818 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 06:20:20,235 - root - INFO - lr: 1.1139e-05 gnorm: 0.33 [1 day, 12:44:53<1 day, 12:43:47] +[titan] 2025-09-09 06:20:50,049 - root - INFO - step: 20010 loss: 2.9600 memory: 122.03GiB(87.57%) tps: 10,991 tflops: 523.81 mfu: 52.96% global_avg_ntp_loss: 0.8794 global_avg_top_loss: 2.0806 +[titan] 2025-09-09 06:20:50,050 - root - INFO - lr: 1.1135e-05 gnorm: 0.44 [1 day, 12:45:23<1 day, 12:43:11] +[titan] 2025-09-09 06:21:19,936 - root - INFO - step: 20015 loss: 2.7626 memory: 122.03GiB(87.57%) tps: 10,964 tflops: 522.55 mfu: 52.84% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 06:21:19,936 - root - INFO - lr: 1.1132e-05 gnorm: 0.34 [1 day, 12:45:53<1 day, 12:42:35] +[titan] 2025-09-09 06:21:49,995 - root - INFO - step: 20020 loss: 2.7556 memory: 122.03GiB(87.57%) tps: 10,901 tflops: 519.55 mfu: 52.53% global_avg_ntp_loss: 0.7861 global_avg_top_loss: 1.9694 +[titan] 2025-09-09 06:21:49,995 - root - INFO - lr: 1.1128e-05 gnorm: 0.34 [1 day, 12:46:23<1 day, 12:41:59] +[titan] 2025-09-09 06:22:20,199 - root - INFO - step: 20025 loss: 2.8442 memory: 122.03GiB(87.57%) tps: 10,849 tflops: 517.06 mfu: 52.28% global_avg_ntp_loss: 0.8268 global_avg_top_loss: 2.0174 +[titan] 2025-09-09 06:22:20,199 - root - 
INFO - lr: 1.1125e-05 gnorm: 0.34 [1 day, 12:46:53<1 day, 12:41:23] +[titan] 2025-09-09 06:22:50,510 - root - INFO - step: 20030 loss: 2.7813 memory: 122.03GiB(87.57%) tps: 10,811 tflops: 515.22 mfu: 52.10% global_avg_ntp_loss: 0.7966 global_avg_top_loss: 1.9848 +[titan] 2025-09-09 06:22:50,511 - root - INFO - lr: 1.1121e-05 gnorm: 0.34 [1 day, 12:47:24<1 day, 12:40:47] +[titan] 2025-09-09 06:23:21,005 - root - INFO - step: 20035 loss: 3.3298 memory: 122.03GiB(87.57%) tps: 10,746 tflops: 512.13 mfu: 51.78% global_avg_ntp_loss: 1.1042 global_avg_top_loss: 2.2255 +[titan] 2025-09-09 06:23:21,005 - root - INFO - lr: 1.1117e-05 gnorm: 0.34 [1 day, 12:47:54<1 day, 12:40:11] +[titan] 2025-09-09 06:23:51,586 - root - INFO - step: 20040 loss: 2.8476 memory: 122.03GiB(87.57%) tps: 10,715 tflops: 510.68 mfu: 51.64% global_avg_ntp_loss: 0.8270 global_avg_top_loss: 2.0206 +[titan] 2025-09-09 06:23:51,586 - root - INFO - lr: 1.1114e-05 gnorm: 0.32 [1 day, 12:48:25<1 day, 12:39:36] +[titan] 2025-09-09 06:24:22,337 - root - INFO - step: 20045 loss: 2.7900 memory: 122.03GiB(87.57%) tps: 10,656 tflops: 507.86 mfu: 51.35% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9896 +[titan] 2025-09-09 06:24:22,337 - root - INFO - lr: 1.1110e-05 gnorm: 0.34 [1 day, 12:48:55<1 day, 12:39:00] +[titan] 2025-09-09 06:24:47,250 - root - INFO - [GC] Peforming periodical GC collection. 0.12 seconds. 
+[titan] 2025-09-09 06:24:53,455 - root - INFO - step: 20050 loss: 2.6039 memory: 122.03GiB(87.57%) tps: 10,531 tflops: 501.88 mfu: 50.75% global_avg_ntp_loss: 0.7205 global_avg_top_loss: 1.8833 +[titan] 2025-09-09 06:24:53,455 - root - INFO - lr: 1.1107e-05 gnorm: 0.33 [1 day, 12:49:27<1 day, 12:38:25] +[titan] 2025-09-09 06:25:24,418 - root - INFO - step: 20055 loss: 2.7943 memory: 122.03GiB(87.57%) tps: 10,583 tflops: 504.37 mfu: 51.00% global_avg_ntp_loss: 0.8078 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 06:25:24,419 - root - INFO - lr: 1.1103e-05 gnorm: 0.33 [1 day, 12:49:58<1 day, 12:37:50] +[titan] 2025-09-09 06:25:55,658 - root - INFO - step: 20060 loss: 2.8892 memory: 122.03GiB(87.57%) tps: 10,489 tflops: 499.91 mfu: 50.55% global_avg_ntp_loss: 0.8490 global_avg_top_loss: 2.0401 +[titan] 2025-09-09 06:25:55,659 - root - INFO - lr: 1.1100e-05 gnorm: 0.33 [1 day, 12:50:29<1 day, 12:37:15] +[titan] 2025-09-09 06:26:27,033 - root - INFO - step: 20065 loss: 2.8216 memory: 122.03GiB(87.57%) tps: 10,444 tflops: 497.77 mfu: 50.33% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0062 +[titan] 2025-09-09 06:26:27,033 - root - INFO - lr: 1.1096e-05 gnorm: 0.35 [1 day, 12:51:00<1 day, 12:36:41] +[titan] 2025-09-09 06:26:58,429 - root - INFO - step: 20070 loss: 2.7793 memory: 122.03GiB(87.57%) tps: 10,437 tflops: 497.43 mfu: 50.30% global_avg_ntp_loss: 0.7949 global_avg_top_loss: 1.9844 +[titan] 2025-09-09 06:26:58,429 - root - INFO - lr: 1.1092e-05 gnorm: 0.32 [1 day, 12:51:32<1 day, 12:36:06] +[titan] 2025-09-09 06:27:29,890 - root - INFO - step: 20075 loss: 2.7812 memory: 122.03GiB(87.57%) tps: 10,415 tflops: 496.39 mfu: 50.19% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9885 +[titan] 2025-09-09 06:27:29,891 - root - INFO - lr: 1.1089e-05 gnorm: 0.36 [1 day, 12:52:03<1 day, 12:35:31] +[titan] 2025-09-09 06:28:01,443 - root - INFO - step: 20080 loss: 2.7163 memory: 122.03GiB(87.57%) tps: 10,385 tflops: 494.95 mfu: 50.05% global_avg_ntp_loss: 0.7680 
global_avg_top_loss: 1.9483 +[titan] 2025-09-09 06:28:01,444 - root - INFO - lr: 1.1085e-05 gnorm: 0.35 [1 day, 12:52:35<1 day, 12:34:57] +[titan] 2025-09-09 06:28:32,958 - root - INFO - step: 20085 loss: 2.8179 memory: 122.03GiB(87.57%) tps: 10,398 tflops: 495.56 mfu: 50.11% global_avg_ntp_loss: 0.8203 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 06:28:32,958 - root - INFO - lr: 1.1082e-05 gnorm: 0.34 [1 day, 12:53:06<1 day, 12:34:22] +[titan] 2025-09-09 06:29:04,816 - root - INFO - step: 20090 loss: 2.7528 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.7862 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 06:29:04,816 - root - INFO - lr: 1.1078e-05 gnorm: 0.33 [1 day, 12:53:38<1 day, 12:33:48] +[titan] 2025-09-09 06:29:36,424 - root - INFO - step: 20095 loss: 2.9881 memory: 122.03GiB(87.57%) tps: 10,367 tflops: 494.09 mfu: 49.96% global_avg_ntp_loss: 0.9035 global_avg_top_loss: 2.0846 +[titan] 2025-09-09 06:29:36,424 - root - INFO - lr: 1.1075e-05 gnorm: 0.36 [1 day, 12:54:10<1 day, 12:33:13] +[titan] 2025-09-09 06:30:01,842 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:30:08,210 - root - INFO - step: 20100 loss: 2.6944 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.32 mfu: 49.68% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9381 +[titan] 2025-09-09 06:30:08,210 - root - INFO - lr: 1.1071e-05 gnorm: 0.35 [1 day, 12:54:41<1 day, 12:32:39] +[titan] 2025-09-09 06:30:39,837 - root - INFO - step: 20105 loss: 2.8289 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.80 mfu: 49.93% global_avg_ntp_loss: 0.8182 global_avg_top_loss: 2.0108 +[titan] 2025-09-09 06:30:39,837 - root - INFO - lr: 1.1067e-05 gnorm: 0.33 [1 day, 12:55:13<1 day, 12:32:05] +[titan] 2025-09-09 06:31:11,550 - root - INFO - step: 20110 loss: 2.7379 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.44 mfu: 49.79% global_avg_ntp_loss: 0.7768 global_avg_top_loss: 1.9611 +[titan] 2025-09-09 06:31:11,551 - root - INFO - lr: 1.1064e-05 gnorm: 0.34 [1 day, 12:55:45<1 day, 12:31:30] +[titan] 2025-09-09 06:31:43,421 - root - INFO - step: 20115 loss: 3.2744 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.02 mfu: 49.55% global_avg_ntp_loss: 1.0718 global_avg_top_loss: 2.2025 +[titan] 2025-09-09 06:31:43,421 - root - INFO - lr: 1.1060e-05 gnorm: 0.34 [1 day, 12:56:17<1 day, 12:30:56] +[titan] 2025-09-09 06:32:15,207 - root - INFO - step: 20120 loss: 2.8005 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.33 mfu: 49.68% global_avg_ntp_loss: 0.8055 global_avg_top_loss: 1.9950 +[titan] 2025-09-09 06:32:15,207 - root - INFO - lr: 1.1057e-05 gnorm: 0.32 [1 day, 12:56:48<1 day, 12:30:22] +[titan] 2025-09-09 06:32:47,104 - root - INFO - step: 20125 loss: 2.7452 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.51% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9654 +[titan] 2025-09-09 06:32:47,105 - root - INFO - lr: 1.1053e-05 gnorm: 0.33 [1 day, 12:57:20<1 day, 12:29:48] +[titan] 2025-09-09 06:33:18,815 - root - INFO - step: 20130 loss: 2.6240 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.49 mfu: 49.80% global_avg_ntp_loss: 0.7311 
global_avg_top_loss: 1.8929 +[titan] 2025-09-09 06:33:18,815 - root - INFO - lr: 1.1050e-05 gnorm: 0.33 [1 day, 12:57:52<1 day, 12:29:13] +[titan] 2025-09-09 06:33:50,801 - root - INFO - step: 20135 loss: 2.8719 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.8376 global_avg_top_loss: 2.0343 +[titan] 2025-09-09 06:33:50,801 - root - INFO - lr: 1.1046e-05 gnorm: 0.37 [1 day, 12:58:24<1 day, 12:28:39] +[titan] 2025-09-09 06:34:22,521 - root - INFO - step: 20140 loss: 2.7833 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.34 mfu: 49.78% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9881 +[titan] 2025-09-09 06:34:22,521 - root - INFO - lr: 1.1042e-05 gnorm: 0.35 [1 day, 12:58:56<1 day, 12:28:05] +[titan] 2025-09-09 06:34:54,407 - root - INFO - step: 20145 loss: 2.8903 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.8482 global_avg_top_loss: 2.0421 +[titan] 2025-09-09 06:34:54,407 - root - INFO - lr: 1.1039e-05 gnorm: 0.37 [1 day, 12:59:28<1 day, 12:27:30] +[titan] 2025-09-09 06:35:19,985 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:35:26,372 - root - INFO - step: 20150 loss: 2.9203 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.8794 global_avg_top_loss: 2.0409 +[titan] 2025-09-09 06:35:26,372 - root - INFO - lr: 1.1035e-05 gnorm: 0.37 [1 day, 12:59:59<1 day, 12:26:56] +[titan] 2025-09-09 06:35:58,179 - root - INFO - step: 20155 loss: 2.8514 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.8292 global_avg_top_loss: 2.0222 +[titan] 2025-09-09 06:35:58,179 - root - INFO - lr: 1.1032e-05 gnorm: 0.33 [1 day, 13:00:31<1 day, 12:26:22] +[titan] 2025-09-09 06:36:30,066 - root - INFO - step: 20160 loss: 2.5929 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.7032 global_avg_top_loss: 1.8897 +[titan] 2025-09-09 06:36:30,067 - root - INFO - lr: 1.1028e-05 gnorm: 0.76 [1 day, 13:01:03<1 day, 12:25:48] +[titan] 2025-09-09 06:37:01,877 - root - INFO - step: 20165 loss: 2.7879 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.8091 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 06:37:01,877 - root - INFO - lr: 1.1025e-05 gnorm: 0.41 [1 day, 13:01:35<1 day, 12:25:14] +[titan] 2025-09-09 06:37:33,810 - root - INFO - step: 20170 loss: 2.7937 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9924 +[titan] 2025-09-09 06:37:33,811 - root - INFO - lr: 1.1021e-05 gnorm: 0.52 [1 day, 13:02:07<1 day, 12:24:39] +[titan] 2025-09-09 06:38:05,708 - root - INFO - step: 20175 loss: 2.8139 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.8099 global_avg_top_loss: 2.0040 +[titan] 2025-09-09 06:38:05,708 - root - INFO - lr: 1.1017e-05 gnorm: 0.34 [1 day, 13:02:39<1 day, 12:24:05] +[titan] 2025-09-09 06:38:37,468 - root - INFO - step: 20180 loss: 2.8027 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.73 mfu: 49.72% global_avg_ntp_loss: 0.8096 
global_avg_top_loss: 1.9930 +[titan] 2025-09-09 06:38:37,468 - root - INFO - lr: 1.1014e-05 gnorm: 0.37 [1 day, 13:03:11<1 day, 12:23:31] +[titan] 2025-09-09 06:39:09,190 - root - INFO - step: 20185 loss: 2.7779 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.31 mfu: 49.78% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9826 +[titan] 2025-09-09 06:39:09,190 - root - INFO - lr: 1.1010e-05 gnorm: 0.36 [1 day, 13:03:42<1 day, 12:22:57] +[titan] 2025-09-09 06:39:41,086 - root - INFO - step: 20190 loss: 2.8220 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.8130 global_avg_top_loss: 2.0090 +[titan] 2025-09-09 06:39:41,087 - root - INFO - lr: 1.1007e-05 gnorm: 1.26 [1 day, 13:04:14<1 day, 12:22:22] +[titan] 2025-09-09 06:40:13,085 - root - INFO - step: 20195 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9636 +[titan] 2025-09-09 06:40:13,085 - root - INFO - lr: 1.1003e-05 gnorm: 0.37 [1 day, 13:04:46<1 day, 12:21:48] +[titan] 2025-09-09 06:40:38,715 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:40:45,093 - root - INFO - step: 20200 loss: 2.7984 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.8073 global_avg_top_loss: 1.9910 +[titan] 2025-09-09 06:40:45,093 - root - INFO - lr: 1.1000e-05 gnorm: 0.37 [1 day, 13:05:18<1 day, 12:21:14] +[titan] 2025-09-09 06:41:16,990 - root - INFO - step: 20205 loss: 2.8747 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.8392 global_avg_top_loss: 2.0355 +[titan] 2025-09-09 06:41:16,990 - root - INFO - lr: 1.0996e-05 gnorm: 0.35 [1 day, 13:05:50<1 day, 12:20:40] +[titan] 2025-09-09 06:41:48,710 - root - INFO - step: 20210 loss: 2.8652 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.34 mfu: 49.78% global_avg_ntp_loss: 0.8291 global_avg_top_loss: 2.0361 +[titan] 2025-09-09 06:41:48,710 - root - INFO - lr: 1.0993e-05 gnorm: 0.36 [1 day, 13:06:22<1 day, 12:20:06] +[titan] 2025-09-09 06:42:20,571 - root - INFO - step: 20215 loss: 2.9655 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.9175 global_avg_top_loss: 2.0480 +[titan] 2025-09-09 06:42:20,571 - root - INFO - lr: 1.0989e-05 gnorm: 0.35 [1 day, 13:06:54<1 day, 12:19:32] +[titan] 2025-09-09 06:42:52,276 - root - INFO - step: 20220 loss: 2.7629 memory: 122.03GiB(87.57%) tps: 10,335 tflops: 492.58 mfu: 49.81% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9764 +[titan] 2025-09-09 06:42:52,277 - root - INFO - lr: 1.0985e-05 gnorm: 0.34 [1 day, 13:07:25<1 day, 12:18:57] +[titan] 2025-09-09 06:43:24,072 - root - INFO - step: 20225 loss: 2.7892 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9857 +[titan] 2025-09-09 06:43:24,073 - root - INFO - lr: 1.0982e-05 gnorm: 0.36 [1 day, 13:07:57<1 day, 12:18:23] +[titan] 2025-09-09 06:43:56,140 - root - INFO - step: 20230 loss: 2.8211 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.8123 
global_avg_top_loss: 2.0087 +[titan] 2025-09-09 06:43:56,140 - root - INFO - lr: 1.0978e-05 gnorm: 0.34 [1 day, 13:08:29<1 day, 12:17:49] +[titan] 2025-09-09 06:44:27,808 - root - INFO - step: 20235 loss: 2.8625 memory: 122.03GiB(87.57%) tps: 10,347 tflops: 493.14 mfu: 49.86% global_avg_ntp_loss: 0.8326 global_avg_top_loss: 2.0299 +[titan] 2025-09-09 06:44:27,809 - root - INFO - lr: 1.0975e-05 gnorm: 0.33 [1 day, 13:09:01<1 day, 12:17:14] +[titan] 2025-09-09 06:44:59,720 - root - INFO - step: 20240 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 06:44:59,720 - root - INFO - lr: 1.0971e-05 gnorm: 0.49 [1 day, 13:09:33<1 day, 12:16:40] +[titan] 2025-09-09 06:45:31,632 - root - INFO - step: 20245 loss: 2.6970 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7653 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 06:45:31,632 - root - INFO - lr: 1.0968e-05 gnorm: 0.37 [1 day, 13:10:05<1 day, 12:16:06] +[titan] 2025-09-09 06:45:57,235 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:46:03,715 - root - INFO - step: 20250 loss: 2.7878 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.8011 global_avg_top_loss: 1.9867 +[titan] 2025-09-09 06:46:03,715 - root - INFO - lr: 1.0964e-05 gnorm: 0.36 [1 day, 13:10:37<1 day, 12:15:32] +[titan] 2025-09-09 06:46:35,719 - root - INFO - step: 20255 loss: 2.8541 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.8296 global_avg_top_loss: 2.0246 +[titan] 2025-09-09 06:46:35,719 - root - INFO - lr: 1.0960e-05 gnorm: 0.37 [1 day, 13:11:09<1 day, 12:14:58] +[titan] 2025-09-09 06:47:07,629 - root - INFO - step: 20260 loss: 2.7089 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.7656 global_avg_top_loss: 1.9432 +[titan] 2025-09-09 06:47:07,630 - root - INFO - lr: 1.0957e-05 gnorm: 0.35 [1 day, 13:11:41<1 day, 12:14:24] +[titan] 2025-09-09 06:47:39,813 - root - INFO - step: 20265 loss: 2.6907 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.26 mfu: 49.07% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9315 +[titan] 2025-09-09 06:47:39,813 - root - INFO - lr: 1.0953e-05 gnorm: 0.34 [1 day, 13:12:13<1 day, 12:13:50] +[titan] 2025-09-09 06:48:11,816 - root - INFO - step: 20270 loss: 2.8668 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.8402 global_avg_top_loss: 2.0267 +[titan] 2025-09-09 06:48:11,816 - root - INFO - lr: 1.0950e-05 gnorm: 0.33 [1 day, 13:12:45<1 day, 12:13:16] +[titan] 2025-09-09 06:48:44,043 - root - INFO - step: 20275 loss: 2.7622 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.59 mfu: 49.00% global_avg_ntp_loss: 0.8053 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 06:48:44,044 - root - INFO - lr: 1.0946e-05 gnorm: 0.41 [1 day, 13:13:17<1 day, 12:12:42] +[titan] 2025-09-09 06:49:16,113 - root - INFO - step: 20280 loss: 2.8485 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.8292 
global_avg_top_loss: 2.0193 +[titan] 2025-09-09 06:49:16,113 - root - INFO - lr: 1.0943e-05 gnorm: 0.39 [1 day, 13:13:49<1 day, 12:12:08] +[titan] 2025-09-09 06:49:48,067 - root - INFO - step: 20285 loss: 2.8332 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.8222 global_avg_top_loss: 2.0110 +[titan] 2025-09-09 06:49:48,067 - root - INFO - lr: 1.0939e-05 gnorm: 0.40 [1 day, 13:14:21<1 day, 12:11:34] +[titan] 2025-09-09 06:50:19,957 - root - INFO - step: 20290 loss: 2.6742 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.7495 global_avg_top_loss: 1.9247 +[titan] 2025-09-09 06:50:19,957 - root - INFO - lr: 1.0935e-05 gnorm: 0.37 [1 day, 13:14:53<1 day, 12:11:00] +[titan] 2025-09-09 06:50:51,969 - root - INFO - step: 20295 loss: 2.6844 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9306 +[titan] 2025-09-09 06:50:51,969 - root - INFO - lr: 1.0932e-05 gnorm: 0.37 [1 day, 13:15:25<1 day, 12:10:26] +[titan] 2025-09-09 06:51:17,413 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:51:23,904 - root - INFO - step: 20300 loss: 2.7561 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 06:51:23,905 - root - INFO - lr: 1.0928e-05 gnorm: 0.37 [1 day, 13:15:57<1 day, 12:09:52] +[titan] 2025-09-09 06:51:55,716 - root - INFO - step: 20305 loss: 2.7211 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.93 mfu: 49.64% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 06:51:55,716 - root - INFO - lr: 1.0925e-05 gnorm: 0.35 [1 day, 13:16:29<1 day, 12:09:18] +[titan] 2025-09-09 06:52:27,726 - root - INFO - step: 20310 loss: 2.7744 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7937 global_avg_top_loss: 1.9807 +[titan] 2025-09-09 06:52:27,727 - root - INFO - lr: 1.0921e-05 gnorm: 0.34 [1 day, 13:17:01<1 day, 12:08:43] +[titan] 2025-09-09 06:52:59,560 - root - INFO - step: 20315 loss: 3.2540 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 1.0623 global_avg_top_loss: 2.1917 +[titan] 2025-09-09 06:52:59,560 - root - INFO - lr: 1.0918e-05 gnorm: 0.38 [1 day, 13:17:33<1 day, 12:08:09] +[titan] 2025-09-09 06:53:31,550 - root - INFO - step: 20320 loss: 2.7771 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 06:53:31,550 - root - INFO - lr: 1.0914e-05 gnorm: 0.42 [1 day, 13:18:05<1 day, 12:07:35] +[titan] 2025-09-09 06:54:03,315 - root - INFO - step: 20325 loss: 2.8222 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.65 mfu: 49.71% global_avg_ntp_loss: 0.8125 global_avg_top_loss: 2.0098 +[titan] 2025-09-09 06:54:03,315 - root - INFO - lr: 1.0910e-05 gnorm: 0.44 [1 day, 13:18:36<1 day, 12:07:01] +[titan] 2025-09-09 06:54:35,253 - root - INFO - step: 20330 loss: 3.1607 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.98 mfu: 49.44% global_avg_ntp_loss: 1.0078 
global_avg_top_loss: 2.1529 +[titan] 2025-09-09 06:54:35,253 - root - INFO - lr: 1.0907e-05 gnorm: 0.37 [1 day, 13:19:08<1 day, 12:06:27] +[titan] 2025-09-09 06:55:07,023 - root - INFO - step: 20335 loss: 2.7996 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.57 mfu: 49.70% global_avg_ntp_loss: 0.8026 global_avg_top_loss: 1.9970 +[titan] 2025-09-09 06:55:07,024 - root - INFO - lr: 1.0903e-05 gnorm: 0.34 [1 day, 13:19:40<1 day, 12:05:53] +[titan] 2025-09-09 06:55:38,864 - root - INFO - step: 20340 loss: 2.7147 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.48 mfu: 49.59% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9500 +[titan] 2025-09-09 06:55:38,864 - root - INFO - lr: 1.0900e-05 gnorm: 0.34 [1 day, 13:20:12<1 day, 12:05:18] +[titan] 2025-09-09 06:56:10,802 - root - INFO - step: 20345 loss: 2.8070 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.8076 global_avg_top_loss: 1.9994 +[titan] 2025-09-09 06:56:10,802 - root - INFO - lr: 1.0896e-05 gnorm: 0.33 [1 day, 13:20:44<1 day, 12:04:44] +[titan] 2025-09-09 06:56:36,146 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:56:42,481 - root - INFO - step: 20350 loss: 2.8160 memory: 122.03GiB(87.57%) tps: 10,344 tflops: 492.97 mfu: 49.85% global_avg_ntp_loss: 0.8141 global_avg_top_loss: 2.0020 +[titan] 2025-09-09 06:56:42,482 - root - INFO - lr: 1.0893e-05 gnorm: 0.34 [1 day, 13:21:16<1 day, 12:04:10] +[titan] 2025-09-09 06:57:14,495 - root - INFO - step: 20355 loss: 2.7763 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.33% global_avg_ntp_loss: 0.7943 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 06:57:14,496 - root - INFO - lr: 1.0889e-05 gnorm: 0.33 [1 day, 13:21:48<1 day, 12:03:36] +[titan] 2025-09-09 06:57:46,528 - root - INFO - step: 20360 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.7851 global_avg_top_loss: 1.9758 +[titan] 2025-09-09 06:57:46,528 - root - INFO - lr: 1.0885e-05 gnorm: 0.34 [1 day, 13:22:20<1 day, 12:03:02] +[titan] 2025-09-09 06:58:18,434 - root - INFO - step: 20365 loss: 2.8526 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.8304 global_avg_top_loss: 2.0222 +[titan] 2025-09-09 06:58:18,434 - root - INFO - lr: 1.0882e-05 gnorm: 0.35 [1 day, 13:22:52<1 day, 12:02:28] +[titan] 2025-09-09 06:58:50,498 - root - INFO - step: 20370 loss: 2.7777 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.07 mfu: 49.25% global_avg_ntp_loss: 0.8031 global_avg_top_loss: 1.9746 +[titan] 2025-09-09 06:58:50,498 - root - INFO - lr: 1.0878e-05 gnorm: 0.33 [1 day, 13:23:24<1 day, 12:01:54] +[titan] 2025-09-09 06:59:22,426 - root - INFO - step: 20375 loss: 2.6894 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9333 +[titan] 2025-09-09 06:59:22,426 - root - INFO - lr: 1.0875e-05 gnorm: 0.35 [1 day, 13:23:56<1 day, 12:01:20] +[titan] 2025-09-09 06:59:54,616 - root - INFO - step: 20380 loss: 2.7796 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.16 mfu: 49.06% global_avg_ntp_loss: 0.7999 
global_avg_top_loss: 1.9797 +[titan] 2025-09-09 06:59:54,616 - root - INFO - lr: 1.0871e-05 gnorm: 0.33 [1 day, 13:24:28<1 day, 12:00:46] +[titan] 2025-09-09 07:00:26,522 - root - INFO - step: 20385 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 07:00:26,523 - root - INFO - lr: 1.0868e-05 gnorm: 0.34 [1 day, 13:25:00<1 day, 12:00:12] +[titan] 2025-09-09 07:00:58,530 - root - INFO - step: 20390 loss: 3.0371 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.9383 global_avg_top_loss: 2.0988 +[titan] 2025-09-09 07:00:58,531 - root - INFO - lr: 1.0864e-05 gnorm: 0.35 [1 day, 13:25:32<1 day, 11:59:38] +[titan] 2025-09-09 07:01:30,383 - root - INFO - step: 20395 loss: 3.2952 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 1.0824 global_avg_top_loss: 2.2127 +[titan] 2025-09-09 07:01:30,383 - root - INFO - lr: 1.0860e-05 gnorm: 0.39 [1 day, 13:26:03<1 day, 11:59:03] +[titan] 2025-09-09 07:01:55,824 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:02:02,212 - root - INFO - step: 20400 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9522 +[titan] 2025-09-09 07:02:02,212 - root - INFO - lr: 1.0857e-05 gnorm: 0.36 [1 day, 13:26:35<1 day, 11:58:29] +[titan] 2025-09-09 07:02:34,203 - root - INFO - step: 20405 loss: 2.8002 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.8053 global_avg_top_loss: 1.9949 +[titan] 2025-09-09 07:02:34,203 - root - INFO - lr: 1.0853e-05 gnorm: 0.34 [1 day, 13:27:07<1 day, 11:57:55] +[titan] 2025-09-09 07:03:06,198 - root - INFO - step: 20410 loss: 2.8048 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9956 +[titan] 2025-09-09 07:03:06,198 - root - INFO - lr: 1.0850e-05 gnorm: 0.36 [1 day, 13:27:39<1 day, 11:57:21] +[titan] 2025-09-09 07:03:38,243 - root - INFO - step: 20415 loss: 2.7784 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9830 +[titan] 2025-09-09 07:03:38,243 - root - INFO - lr: 1.0846e-05 gnorm: 0.42 [1 day, 13:28:11<1 day, 11:56:47] +[titan] 2025-09-09 07:04:10,129 - root - INFO - step: 20420 loss: 2.7288 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 0.7734 global_avg_top_loss: 1.9554 +[titan] 2025-09-09 07:04:10,129 - root - INFO - lr: 1.0843e-05 gnorm: 0.34 [1 day, 13:28:43<1 day, 11:56:13] +[titan] 2025-09-09 07:04:42,052 - root - INFO - step: 20425 loss: 2.8111 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.8155 global_avg_top_loss: 1.9956 +[titan] 2025-09-09 07:04:42,052 - root - INFO - lr: 1.0839e-05 gnorm: 0.34 [1 day, 13:29:15<1 day, 11:55:39] +[titan] 2025-09-09 07:05:13,849 - root - INFO - step: 20430 loss: 2.7851 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.15 mfu: 49.66% global_avg_ntp_loss: 0.7994 
global_avg_top_loss: 1.9857 +[titan] 2025-09-09 07:05:13,849 - root - INFO - lr: 1.0835e-05 gnorm: 0.33 [1 day, 13:29:47<1 day, 11:55:05] +[titan] 2025-09-09 07:05:46,093 - root - INFO - step: 20435 loss: 3.1016 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.34 mfu: 48.97% global_avg_ntp_loss: 0.9714 global_avg_top_loss: 2.1302 +[titan] 2025-09-09 07:05:46,094 - root - INFO - lr: 1.0832e-05 gnorm: 0.35 [1 day, 13:30:19<1 day, 11:54:31] +[titan] 2025-09-09 07:06:17,966 - root - INFO - step: 20440 loss: 2.8745 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.8395 global_avg_top_loss: 2.0351 +[titan] 2025-09-09 07:06:17,966 - root - INFO - lr: 1.0828e-05 gnorm: 0.34 [1 day, 13:30:51<1 day, 11:53:57] +[titan] 2025-09-09 07:06:50,157 - root - INFO - step: 20445 loss: 2.7692 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.14 mfu: 49.05% global_avg_ntp_loss: 0.7921 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 07:06:50,158 - root - INFO - lr: 1.0825e-05 gnorm: 0.34 [1 day, 13:31:23<1 day, 11:53:23] +[titan] 2025-09-09 07:07:15,784 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:07:22,090 - root - INFO - step: 20450 loss: 2.7641 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.7916 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 07:07:22,090 - root - INFO - lr: 1.0821e-05 gnorm: 0.33 [1 day, 13:31:55<1 day, 11:52:49] +[titan] 2025-09-09 07:07:54,405 - root - INFO - step: 20455 loss: 2.6634 memory: 122.03GiB(87.57%) tps: 10,140 tflops: 483.29 mfu: 48.87% global_avg_ntp_loss: 0.7446 global_avg_top_loss: 1.9188 +[titan] 2025-09-09 07:07:54,405 - root - INFO - lr: 1.0818e-05 gnorm: 0.35 [1 day, 13:32:27<1 day, 11:52:15] +[titan] 2025-09-09 07:08:26,512 - root - INFO - step: 20460 loss: 2.8403 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0148 +[titan] 2025-09-09 07:08:26,513 - root - INFO - lr: 1.0814e-05 gnorm: 0.34 [1 day, 13:33:00<1 day, 11:51:41] +[titan] 2025-09-09 07:08:58,482 - root - INFO - step: 20465 loss: 2.7725 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9796 +[titan] 2025-09-09 07:08:58,482 - root - INFO - lr: 1.0810e-05 gnorm: 0.33 [1 day, 13:33:32<1 day, 11:51:07] +[titan] 2025-09-09 07:09:30,461 - root - INFO - step: 20470 loss: 2.7779 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7936 global_avg_top_loss: 1.9842 +[titan] 2025-09-09 07:09:30,461 - root - INFO - lr: 1.0807e-05 gnorm: 0.34 [1 day, 13:34:04<1 day, 11:50:33] +[titan] 2025-09-09 07:10:02,598 - root - INFO - step: 20475 loss: 3.2187 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.95 mfu: 49.14% global_avg_ntp_loss: 1.0490 global_avg_top_loss: 2.1697 +[titan] 2025-09-09 07:10:02,599 - root - INFO - lr: 1.0803e-05 gnorm: 0.36 [1 day, 13:34:36<1 day, 11:49:59] +[titan] 2025-09-09 07:10:35,023 - root - INFO - step: 20480 loss: 2.7769 memory: 122.03GiB(87.57%) tps: 10,106 tflops: 481.64 mfu: 48.70% global_avg_ntp_loss: 0.7952 
global_avg_top_loss: 1.9817 +[titan] 2025-09-09 07:10:35,023 - root - INFO - lr: 1.0800e-05 gnorm: 0.33 [1 day, 13:35:08<1 day, 11:49:26] +[titan] 2025-09-09 07:10:35,370 - root - INFO - Dumping profiler traces at step 20480 +[titan] 2025-09-09 07:10:35,427 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 07:11:07,257 - root - INFO - step: 20485 loss: 3.0455 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.9270 global_avg_top_loss: 2.1186 +[titan] 2025-09-09 07:11:07,257 - root - INFO - lr: 1.0796e-05 gnorm: 1.11 [1 day, 13:35:40<1 day, 11:48:52] +[titan] 2025-09-09 07:11:39,390 - root - INFO - step: 20490 loss: 3.0723 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.9626 global_avg_top_loss: 2.1097 +[titan] 2025-09-09 07:11:39,391 - root - INFO - lr: 1.0793e-05 gnorm: 0.36 [1 day, 13:36:12<1 day, 11:48:18] +[titan] 2025-09-09 07:12:11,222 - root - INFO - step: 20495 loss: 2.7660 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.62 mfu: 49.61% global_avg_ntp_loss: 0.7899 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 07:12:11,222 - root - INFO - lr: 1.0789e-05 gnorm: 0.35 [1 day, 13:36:44<1 day, 11:47:44] +[titan] 2025-09-09 07:12:36,466 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:12:42,945 - root - INFO - step: 20500 loss: 2.8090 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.30 mfu: 49.78% global_avg_ntp_loss: 0.8068 global_avg_top_loss: 2.0023 +[titan] 2025-09-09 07:12:42,945 - root - INFO - lr: 1.0785e-05 gnorm: 0.36 [1 day, 13:37:16<1 day, 11:47:09] +[titan] 2025-09-09 07:13:14,870 - root - INFO - step: 20505 loss: 2.7942 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9908 +[titan] 2025-09-09 07:13:14,870 - root - INFO - lr: 1.0782e-05 gnorm: 0.34 [1 day, 13:37:48<1 day, 11:46:35] +[titan] 2025-09-09 07:13:46,506 - root - INFO - step: 20510 loss: 3.1287 memory: 122.03GiB(87.57%) tps: 10,358 tflops: 493.65 mfu: 49.91% global_avg_ntp_loss: 0.9606 global_avg_top_loss: 2.1681 +[titan] 2025-09-09 07:13:46,506 - root - INFO - lr: 1.0778e-05 gnorm: 0.66 [1 day, 13:38:20<1 day, 11:46:01] +[titan] 2025-09-09 07:14:18,460 - root - INFO - step: 20515 loss: 2.7824 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7989 global_avg_top_loss: 1.9835 +[titan] 2025-09-09 07:14:18,461 - root - INFO - lr: 1.0775e-05 gnorm: 0.34 [1 day, 13:38:52<1 day, 11:45:27] +[titan] 2025-09-09 07:14:50,163 - root - INFO - step: 20520 loss: 2.7884 memory: 122.03GiB(87.57%) tps: 10,336 tflops: 492.61 mfu: 49.81% global_avg_ntp_loss: 0.7993 global_avg_top_loss: 1.9891 +[titan] 2025-09-09 07:14:50,164 - root - INFO - lr: 1.0771e-05 gnorm: 0.38 [1 day, 13:39:23<1 day, 11:44:53] +[titan] 2025-09-09 07:15:22,156 - root - INFO - step: 20525 loss: 2.7545 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7855 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 07:15:22,156 - root - INFO - lr: 1.0768e-05 gnorm: 0.36 [1 day, 13:39:55<1 day, 11:44:19] +[titan] 2025-09-09 07:15:53,885 - root - INFO - step: 20530 loss: 2.8623 memory: 122.03GiB(87.57%) tps: 10,327 tflops: 492.20 mfu: 49.77% global_avg_ntp_loss: 0.8337 
global_avg_top_loss: 2.0287 +[titan] 2025-09-09 07:15:53,885 - root - INFO - lr: 1.0764e-05 gnorm: 0.37 [1 day, 13:40:27<1 day, 11:43:44] +[titan] 2025-09-09 07:16:25,766 - root - INFO - step: 20535 loss: 2.6397 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.86 mfu: 49.53% global_avg_ntp_loss: 0.7316 global_avg_top_loss: 1.9082 +[titan] 2025-09-09 07:16:25,766 - root - INFO - lr: 1.0760e-05 gnorm: 0.40 [1 day, 13:40:59<1 day, 11:43:10] +[titan] 2025-09-09 07:16:57,719 - root - INFO - step: 20540 loss: 2.7389 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 07:16:57,719 - root - INFO - lr: 1.0757e-05 gnorm: 0.34 [1 day, 13:41:31<1 day, 11:42:36] +[titan] 2025-09-09 07:17:29,789 - root - INFO - step: 20545 loss: 2.7142 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 07:17:29,789 - root - INFO - lr: 1.0753e-05 gnorm: 0.34 [1 day, 13:42:03<1 day, 11:42:02] +[titan] 2025-09-09 07:17:55,293 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:18:01,632 - root - INFO - step: 20550 loss: 2.7682 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 07:18:01,633 - root - INFO - lr: 1.0750e-05 gnorm: 0.34 [1 day, 13:42:35<1 day, 11:41:28] +[titan] 2025-09-09 07:18:33,494 - root - INFO - step: 20555 loss: 3.2850 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 1.0778 global_avg_top_loss: 2.2072 +[titan] 2025-09-09 07:18:33,494 - root - INFO - lr: 1.0746e-05 gnorm: 0.39 [1 day, 13:43:07<1 day, 11:40:54] +[titan] 2025-09-09 07:19:05,267 - root - INFO - step: 20560 loss: 2.8272 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.53 mfu: 49.70% global_avg_ntp_loss: 0.8203 global_avg_top_loss: 2.0069 +[titan] 2025-09-09 07:19:05,267 - root - INFO - lr: 1.0743e-05 gnorm: 0.37 [1 day, 13:43:38<1 day, 11:40:20] +[titan] 2025-09-09 07:19:37,134 - root - INFO - step: 20565 loss: 2.7941 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.8025 global_avg_top_loss: 1.9916 +[titan] 2025-09-09 07:19:37,134 - root - INFO - lr: 1.0739e-05 gnorm: 0.35 [1 day, 13:44:10<1 day, 11:39:46] +[titan] 2025-09-09 07:20:09,312 - root - INFO - step: 20570 loss: 2.9126 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.35 mfu: 49.07% global_avg_ntp_loss: 0.8736 global_avg_top_loss: 2.0389 +[titan] 2025-09-09 07:20:09,312 - root - INFO - lr: 1.0736e-05 gnorm: 0.38 [1 day, 13:44:42<1 day, 11:39:12] +[titan] 2025-09-09 07:20:41,308 - root - INFO - step: 20575 loss: 2.7576 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 07:20:41,308 - root - INFO - lr: 1.0732e-05 gnorm: 0.36 [1 day, 13:45:14<1 day, 11:38:38] +[titan] 2025-09-09 07:21:13,132 - root - INFO - step: 20580 loss: 2.8376 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.8256 
global_avg_top_loss: 2.0120 +[titan] 2025-09-09 07:21:13,132 - root - INFO - lr: 1.0728e-05 gnorm: 0.34 [1 day, 13:45:46<1 day, 11:38:04] +[titan] 2025-09-09 07:21:45,123 - root - INFO - step: 20585 loss: 2.8509 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.8310 global_avg_top_loss: 2.0199 +[titan] 2025-09-09 07:21:45,124 - root - INFO - lr: 1.0725e-05 gnorm: 0.34 [1 day, 13:46:18<1 day, 11:37:30] +[titan] 2025-09-09 07:22:16,997 - root - INFO - step: 20590 loss: 3.1311 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.9648 global_avg_top_loss: 2.1662 +[titan] 2025-09-09 07:22:16,997 - root - INFO - lr: 1.0721e-05 gnorm: 0.69 [1 day, 13:46:50<1 day, 11:36:55] +[titan] 2025-09-09 07:22:49,016 - root - INFO - step: 20595 loss: 2.7975 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.8041 global_avg_top_loss: 1.9933 +[titan] 2025-09-09 07:22:49,017 - root - INFO - lr: 1.0718e-05 gnorm: 0.36 [1 day, 13:47:22<1 day, 11:36:21] +[titan] 2025-09-09 07:23:14,280 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:23:20,736 - root - INFO - step: 20600 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.36 mfu: 49.78% global_avg_ntp_loss: 0.7719 global_avg_top_loss: 1.9479 +[titan] 2025-09-09 07:23:20,736 - root - INFO - lr: 1.0714e-05 gnorm: 0.35 [1 day, 13:47:54<1 day, 11:35:47] +[titan] 2025-09-09 07:23:52,653 - root - INFO - step: 20605 loss: 2.6982 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 07:23:52,653 - root - INFO - lr: 1.0711e-05 gnorm: 0.36 [1 day, 13:48:26<1 day, 11:35:13] +[titan] 2025-09-09 07:24:24,493 - root - INFO - step: 20610 loss: 2.6344 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7300 global_avg_top_loss: 1.9045 +[titan] 2025-09-09 07:24:24,493 - root - INFO - lr: 1.0707e-05 gnorm: 0.35 [1 day, 13:48:58<1 day, 11:34:39] +[titan] 2025-09-09 07:24:56,361 - root - INFO - step: 20615 loss: 2.8545 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.8316 global_avg_top_loss: 2.0229 +[titan] 2025-09-09 07:24:56,361 - root - INFO - lr: 1.0703e-05 gnorm: 0.36 [1 day, 13:49:29<1 day, 11:34:05] +[titan] 2025-09-09 07:25:28,166 - root - INFO - step: 20620 loss: 2.7952 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.8033 global_avg_top_loss: 1.9919 +[titan] 2025-09-09 07:25:28,166 - root - INFO - lr: 1.0700e-05 gnorm: 0.35 [1 day, 13:50:01<1 day, 11:33:31] +[titan] 2025-09-09 07:26:00,181 - root - INFO - step: 20625 loss: 2.7567 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 07:26:00,181 - root - INFO - lr: 1.0696e-05 gnorm: 0.36 [1 day, 13:50:33<1 day, 11:32:57] +[titan] 2025-09-09 07:26:32,201 - root - INFO - step: 20630 loss: 3.6945 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 1.3172 
global_avg_top_loss: 2.3773 +[titan] 2025-09-09 07:26:32,202 - root - INFO - lr: 1.0693e-05 gnorm: 0.34 [1 day, 13:51:05<1 day, 11:32:23] +[titan] 2025-09-09 07:27:04,024 - root - INFO - step: 20635 loss: 3.1787 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.76 mfu: 49.62% global_avg_ntp_loss: 1.0293 global_avg_top_loss: 2.1493 +[titan] 2025-09-09 07:27:04,024 - root - INFO - lr: 1.0689e-05 gnorm: 0.49 [1 day, 13:51:37<1 day, 11:31:49] +[titan] 2025-09-09 07:27:35,971 - root - INFO - step: 20640 loss: 2.8221 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.8122 global_avg_top_loss: 2.0099 +[titan] 2025-09-09 07:27:35,971 - root - INFO - lr: 1.0686e-05 gnorm: 0.33 [1 day, 13:52:09<1 day, 11:31:14] +[titan] 2025-09-09 07:28:08,043 - root - INFO - step: 20645 loss: 2.7387 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 07:28:08,044 - root - INFO - lr: 1.0682e-05 gnorm: 0.33 [1 day, 13:52:41<1 day, 11:30:41] +[titan] 2025-09-09 07:28:33,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:28:39,992 - root - INFO - step: 20650 loss: 2.8459 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0204 +[titan] 2025-09-09 07:28:39,993 - root - INFO - lr: 1.0678e-05 gnorm: 0.38 [1 day, 13:53:13<1 day, 11:30:07] +[titan] 2025-09-09 07:29:11,711 - root - INFO - step: 20655 loss: 2.9157 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.37 mfu: 49.78% global_avg_ntp_loss: 0.8784 global_avg_top_loss: 2.0373 +[titan] 2025-09-09 07:29:11,711 - root - INFO - lr: 1.0675e-05 gnorm: 0.38 [1 day, 13:53:45<1 day, 11:29:32] +[titan] 2025-09-09 07:29:43,746 - root - INFO - step: 20660 loss: 2.7643 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 07:29:43,746 - root - INFO - lr: 1.0671e-05 gnorm: 0.38 [1 day, 13:54:17<1 day, 11:28:58] +[titan] 2025-09-09 07:30:15,580 - root - INFO - step: 20665 loss: 2.8318 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.8232 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 07:30:15,580 - root - INFO - lr: 1.0668e-05 gnorm: 0.35 [1 day, 13:54:49<1 day, 11:28:24] +[titan] 2025-09-09 07:30:47,812 - root - INFO - step: 20670 loss: 2.7860 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.52 mfu: 48.99% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9870 +[titan] 2025-09-09 07:30:47,812 - root - INFO - lr: 1.0664e-05 gnorm: 0.40 [1 day, 13:55:21<1 day, 11:27:50] +[titan] 2025-09-09 07:31:20,088 - root - INFO - step: 20675 loss: 3.0578 memory: 122.03GiB(87.57%) tps: 10,153 tflops: 483.87 mfu: 48.92% global_avg_ntp_loss: 0.9446 global_avg_top_loss: 2.1133 +[titan] 2025-09-09 07:31:20,088 - root - INFO - lr: 1.0661e-05 gnorm: 0.38 [1 day, 13:55:53<1 day, 11:27:17] +[titan] 2025-09-09 07:31:52,029 - root - INFO - step: 20680 loss: 2.8441 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.8248 
global_avg_top_loss: 2.0194 +[titan] 2025-09-09 07:31:52,030 - root - INFO - lr: 1.0657e-05 gnorm: 0.36 [1 day, 13:56:25<1 day, 11:26:43] +[titan] 2025-09-09 07:32:23,838 - root - INFO - step: 20685 loss: 2.8750 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.8328 global_avg_top_loss: 2.0422 +[titan] 2025-09-09 07:32:23,838 - root - INFO - lr: 1.0653e-05 gnorm: 1.13 [1 day, 13:56:57<1 day, 11:26:09] +[titan] 2025-09-09 07:32:55,828 - root - INFO - step: 20690 loss: 2.7702 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9712 +[titan] 2025-09-09 07:32:55,829 - root - INFO - lr: 1.0650e-05 gnorm: 0.39 [1 day, 13:57:29<1 day, 11:25:35] +[titan] 2025-09-09 07:33:27,566 - root - INFO - step: 20695 loss: 2.7366 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.76% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 07:33:27,566 - root - INFO - lr: 1.0646e-05 gnorm: 0.36 [1 day, 13:58:01<1 day, 11:25:00] +[titan] 2025-09-09 07:33:53,012 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:33:59,395 - root - INFO - step: 20700 loss: 2.9222 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.8573 global_avg_top_loss: 2.0649 +[titan] 2025-09-09 07:33:59,395 - root - INFO - lr: 1.0643e-05 gnorm: 0.33 [1 day, 13:58:32<1 day, 11:24:26] +[titan] 2025-09-09 07:34:31,148 - root - INFO - step: 20705 loss: 2.7591 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.83 mfu: 49.73% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 07:34:31,148 - root - INFO - lr: 1.0639e-05 gnorm: 0.47 [1 day, 13:59:04<1 day, 11:23:52] +[titan] 2025-09-09 07:35:03,020 - root - INFO - step: 20710 loss: 3.3606 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 1.1187 global_avg_top_loss: 2.2419 +[titan] 2025-09-09 07:35:03,021 - root - INFO - lr: 1.0636e-05 gnorm: 0.34 [1 day, 13:59:36<1 day, 11:23:18] +[titan] 2025-09-09 07:35:35,006 - root - INFO - step: 20715 loss: 2.7429 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 07:35:35,006 - root - INFO - lr: 1.0632e-05 gnorm: 0.33 [1 day, 14:00:08<1 day, 11:22:44] +[titan] 2025-09-09 07:36:06,715 - root - INFO - step: 20720 loss: 2.8326 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.51 mfu: 49.80% global_avg_ntp_loss: 0.8175 global_avg_top_loss: 2.0150 +[titan] 2025-09-09 07:36:06,716 - root - INFO - lr: 1.0628e-05 gnorm: 0.34 [1 day, 14:00:40<1 day, 11:22:10] +[titan] 2025-09-09 07:36:38,797 - root - INFO - step: 20725 loss: 2.8353 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.8213 global_avg_top_loss: 2.0140 +[titan] 2025-09-09 07:36:38,798 - root - INFO - lr: 1.0625e-05 gnorm: 0.33 [1 day, 14:01:12<1 day, 11:21:36] +[titan] 2025-09-09 07:37:10,651 - root - INFO - step: 20730 loss: 2.8775 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.27 mfu: 49.57% global_avg_ntp_loss: 0.8411 
global_avg_top_loss: 2.0364 +[titan] 2025-09-09 07:37:10,652 - root - INFO - lr: 1.0621e-05 gnorm: 0.36 [1 day, 14:01:44<1 day, 11:21:02] +[titan] 2025-09-09 07:37:42,841 - root - INFO - step: 20735 loss: 2.9359 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.16 mfu: 49.06% global_avg_ntp_loss: 0.8688 global_avg_top_loss: 2.0671 +[titan] 2025-09-09 07:37:42,842 - root - INFO - lr: 1.0618e-05 gnorm: 0.44 [1 day, 14:02:16<1 day, 11:20:28] +[titan] 2025-09-09 07:38:14,599 - root - INFO - step: 20740 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.77 mfu: 49.72% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9513 +[titan] 2025-09-09 07:38:14,599 - root - INFO - lr: 1.0614e-05 gnorm: 0.45 [1 day, 14:02:48<1 day, 11:19:54] +[titan] 2025-09-09 07:38:46,380 - root - INFO - step: 20745 loss: 2.8439 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.40 mfu: 49.69% global_avg_ntp_loss: 0.8215 global_avg_top_loss: 2.0224 +[titan] 2025-09-09 07:38:46,380 - root - INFO - lr: 1.0611e-05 gnorm: 0.35 [1 day, 14:03:19<1 day, 11:19:19] +[titan] 2025-09-09 07:39:11,961 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:39:18,414 - root - INFO - step: 20750 loss: 2.7640 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 07:39:18,414 - root - INFO - lr: 1.0607e-05 gnorm: 0.35 [1 day, 14:03:51<1 day, 11:18:46] +[titan] 2025-09-09 07:39:50,347 - root - INFO - step: 20755 loss: 2.7815 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 07:39:50,347 - root - INFO - lr: 1.0604e-05 gnorm: 0.44 [1 day, 14:04:23<1 day, 11:18:12] +[titan] 2025-09-09 07:40:22,425 - root - INFO - step: 20760 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 1.9975 +[titan] 2025-09-09 07:40:22,425 - root - INFO - lr: 1.0600e-05 gnorm: 0.34 [1 day, 14:04:55<1 day, 11:17:38] +[titan] 2025-09-09 07:40:54,273 - root - INFO - step: 20765 loss: 2.8028 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9949 +[titan] 2025-09-09 07:40:54,274 - root - INFO - lr: 1.0596e-05 gnorm: 0.34 [1 day, 14:05:27<1 day, 11:17:04] +[titan] 2025-09-09 07:41:26,612 - root - INFO - step: 20770 loss: 2.8227 memory: 122.03GiB(87.57%) tps: 10,133 tflops: 482.93 mfu: 48.83% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0073 +[titan] 2025-09-09 07:41:26,612 - root - INFO - lr: 1.0593e-05 gnorm: 0.37 [1 day, 14:06:00<1 day, 11:16:30] +[titan] 2025-09-09 07:41:58,614 - root - INFO - step: 20775 loss: 2.8022 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 07:41:58,615 - root - INFO - lr: 1.0589e-05 gnorm: 0.35 [1 day, 14:06:32<1 day, 11:15:56] +[titan] 2025-09-09 07:42:30,425 - root - INFO - step: 20780 loss: 2.9815 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.9007 
global_avg_top_loss: 2.0808 +[titan] 2025-09-09 07:42:30,425 - root - INFO - lr: 1.0586e-05 gnorm: 0.34 [1 day, 14:07:03<1 day, 11:15:22] +[titan] 2025-09-09 07:43:02,471 - root - INFO - step: 20785 loss: 2.6697 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9141 +[titan] 2025-09-09 07:43:02,471 - root - INFO - lr: 1.0582e-05 gnorm: 0.39 [1 day, 14:07:36<1 day, 11:14:48] +[titan] 2025-09-09 07:43:34,584 - root - INFO - step: 20790 loss: 3.1640 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 1.0273 global_avg_top_loss: 2.1367 +[titan] 2025-09-09 07:43:34,584 - root - INFO - lr: 1.0579e-05 gnorm: 0.37 [1 day, 14:08:08<1 day, 11:14:14] +[titan] 2025-09-09 07:44:06,508 - root - INFO - step: 20795 loss: 3.0651 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.9439 global_avg_top_loss: 2.1211 +[titan] 2025-09-09 07:44:06,509 - root - INFO - lr: 1.0575e-05 gnorm: 0.34 [1 day, 14:08:40<1 day, 11:13:40] +[titan] 2025-09-09 07:44:32,015 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:44:38,368 - root - INFO - step: 20800 loss: 2.6540 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.7441 global_avg_top_loss: 1.9100 +[titan] 2025-09-09 07:44:38,368 - root - INFO - lr: 1.0571e-05 gnorm: 0.34 [1 day, 14:09:11<1 day, 11:13:06] +[titan] 2025-09-09 07:45:10,207 - root - INFO - step: 20805 loss: 2.7972 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 07:45:10,207 - root - INFO - lr: 1.0568e-05 gnorm: 0.35 [1 day, 14:09:43<1 day, 11:12:32] +[titan] 2025-09-09 07:45:41,945 - root - INFO - step: 20810 loss: 2.7258 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.06 mfu: 49.75% global_avg_ntp_loss: 0.7687 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 07:45:41,945 - root - INFO - lr: 1.0564e-05 gnorm: 0.36 [1 day, 14:10:15<1 day, 11:11:58] +[titan] 2025-09-09 07:46:13,890 - root - INFO - step: 20815 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 07:46:13,890 - root - INFO - lr: 1.0561e-05 gnorm: 0.40 [1 day, 14:10:47<1 day, 11:11:24] +[titan] 2025-09-09 07:46:46,092 - root - INFO - step: 20820 loss: 2.7721 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.98 mfu: 49.04% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9811 +[titan] 2025-09-09 07:46:46,093 - root - INFO - lr: 1.0557e-05 gnorm: 0.36 [1 day, 14:11:19<1 day, 11:10:50] +[titan] 2025-09-09 07:47:18,042 - root - INFO - step: 20825 loss: 2.7175 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9481 +[titan] 2025-09-09 07:47:18,042 - root - INFO - lr: 1.0554e-05 gnorm: 0.35 [1 day, 14:11:51<1 day, 11:10:16] +[titan] 2025-09-09 07:47:50,006 - root - INFO - step: 20830 loss: 2.8856 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.8411 
global_avg_top_loss: 2.0445 +[titan] 2025-09-09 07:47:50,006 - root - INFO - lr: 1.0550e-05 gnorm: 0.35 [1 day, 14:12:23<1 day, 11:09:42] +[titan] 2025-09-09 07:48:21,883 - root - INFO - step: 20835 loss: 2.7359 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9587 +[titan] 2025-09-09 07:48:21,884 - root - INFO - lr: 1.0546e-05 gnorm: 0.35 [1 day, 14:12:55<1 day, 11:09:08] +[titan] 2025-09-09 07:48:53,834 - root - INFO - step: 20840 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.79 mfu: 49.42% global_avg_ntp_loss: 0.7969 global_avg_top_loss: 1.9825 +[titan] 2025-09-09 07:48:53,834 - root - INFO - lr: 1.0543e-05 gnorm: 0.33 [1 day, 14:13:27<1 day, 11:08:34] +[titan] 2025-09-09 07:49:25,664 - root - INFO - step: 20845 loss: 2.7689 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7916 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 07:49:25,664 - root - INFO - lr: 1.0539e-05 gnorm: 0.33 [1 day, 14:13:59<1 day, 11:08:00] +[titan] 2025-09-09 07:49:51,116 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:49:57,538 - root - INFO - step: 20850 loss: 2.9739 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.96 mfu: 49.54% global_avg_ntp_loss: 0.8858 global_avg_top_loss: 2.0881 +[titan] 2025-09-09 07:49:57,539 - root - INFO - lr: 1.0536e-05 gnorm: 1.04 [1 day, 14:14:31<1 day, 11:07:26] +[titan] 2025-09-09 07:50:29,472 - root - INFO - step: 20855 loss: 2.9238 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.8636 global_avg_top_loss: 2.0602 +[titan] 2025-09-09 07:50:29,472 - root - INFO - lr: 1.0532e-05 gnorm: 0.77 [1 day, 14:15:03<1 day, 11:06:52] +[titan] 2025-09-09 07:51:01,615 - root - INFO - step: 20860 loss: 2.7706 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.87 mfu: 49.13% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9753 +[titan] 2025-09-09 07:51:01,615 - root - INFO - lr: 1.0529e-05 gnorm: 0.37 [1 day, 14:15:35<1 day, 11:06:18] +[titan] 2025-09-09 07:51:33,434 - root - INFO - step: 20865 loss: 2.7579 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 07:51:33,435 - root - INFO - lr: 1.0525e-05 gnorm: 0.34 [1 day, 14:16:06<1 day, 11:05:44] +[titan] 2025-09-09 07:52:05,439 - root - INFO - step: 20870 loss: 2.9183 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.8903 global_avg_top_loss: 2.0279 +[titan] 2025-09-09 07:52:05,439 - root - INFO - lr: 1.0522e-05 gnorm: 0.41 [1 day, 14:16:38<1 day, 11:05:10] +[titan] 2025-09-09 07:52:37,550 - root - INFO - step: 20875 loss: 2.6673 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9211 +[titan] 2025-09-09 07:52:37,550 - root - INFO - lr: 1.0518e-05 gnorm: 0.49 [1 day, 14:17:11<1 day, 11:04:36] +[titan] 2025-09-09 07:53:09,604 - root - INFO - step: 20880 loss: 2.6185 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.7242 
global_avg_top_loss: 1.8943 +[titan] 2025-09-09 07:53:09,605 - root - INFO - lr: 1.0514e-05 gnorm: 0.34 [1 day, 14:17:43<1 day, 11:04:02] +[titan] 2025-09-09 07:53:41,434 - root - INFO - step: 20885 loss: 2.7658 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9751 +[titan] 2025-09-09 07:53:41,434 - root - INFO - lr: 1.0511e-05 gnorm: 0.33 [1 day, 14:18:14<1 day, 11:03:28] +[titan] 2025-09-09 07:54:13,278 - root - INFO - step: 20890 loss: 2.6896 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9340 +[titan] 2025-09-09 07:54:13,278 - root - INFO - lr: 1.0507e-05 gnorm: 0.40 [1 day, 14:18:46<1 day, 11:02:54] +[titan] 2025-09-09 07:54:45,332 - root - INFO - step: 20895 loss: 2.7992 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.8024 global_avg_top_loss: 1.9968 +[titan] 2025-09-09 07:54:45,332 - root - INFO - lr: 1.0504e-05 gnorm: 0.48 [1 day, 14:19:18<1 day, 11:02:20] +[titan] 2025-09-09 07:55:10,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:55:17,280 - root - INFO - step: 20900 loss: 2.7383 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7758 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 07:55:17,281 - root - INFO - lr: 1.0500e-05 gnorm: 0.34 [1 day, 14:19:50<1 day, 11:01:46] +[titan] 2025-09-09 07:55:49,358 - root - INFO - step: 20905 loss: 2.8631 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.8373 global_avg_top_loss: 2.0258 +[titan] 2025-09-09 07:55:49,358 - root - INFO - lr: 1.0497e-05 gnorm: 0.37 [1 day, 14:20:22<1 day, 11:01:12] +[titan] 2025-09-09 07:56:21,199 - root - INFO - step: 20910 loss: 2.8960 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.48 mfu: 49.59% global_avg_ntp_loss: 0.8515 global_avg_top_loss: 2.0445 +[titan] 2025-09-09 07:56:21,199 - root - INFO - lr: 1.0493e-05 gnorm: 0.36 [1 day, 14:20:54<1 day, 11:00:38] +[titan] 2025-09-09 07:56:53,363 - root - INFO - step: 20915 loss: 2.8280 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.09% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 2.0103 +[titan] 2025-09-09 07:56:53,363 - root - INFO - lr: 1.0489e-05 gnorm: 0.35 [1 day, 14:21:26<1 day, 11:00:04] +[titan] 2025-09-09 07:57:25,296 - root - INFO - step: 20920 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.7890 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 07:57:25,296 - root - INFO - lr: 1.0486e-05 gnorm: 0.34 [1 day, 14:21:58<1 day, 10:59:30] +[titan] 2025-09-09 07:57:57,248 - root - INFO - step: 20925 loss: 2.8015 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.8027 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 07:57:57,248 - root - INFO - lr: 1.0482e-05 gnorm: 0.35 [1 day, 14:22:30<1 day, 10:58:56] +[titan] 2025-09-09 07:58:29,155 - root - INFO - step: 20930 loss: 2.9265 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.8656 
global_avg_top_loss: 2.0609 +[titan] 2025-09-09 07:58:29,155 - root - INFO - lr: 1.0479e-05 gnorm: 0.76 [1 day, 14:23:02<1 day, 10:58:22] +[titan] 2025-09-09 07:59:01,038 - root - INFO - step: 20935 loss: 3.2675 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 1.0719 global_avg_top_loss: 2.1956 +[titan] 2025-09-09 07:59:01,038 - root - INFO - lr: 1.0475e-05 gnorm: 0.36 [1 day, 14:23:34<1 day, 10:57:48] +[titan] 2025-09-09 07:59:32,950 - root - INFO - step: 20940 loss: 2.8089 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 2.0018 +[titan] 2025-09-09 07:59:32,950 - root - INFO - lr: 1.0472e-05 gnorm: 0.33 [1 day, 14:24:06<1 day, 10:57:14] +[titan] 2025-09-09 08:00:04,740 - root - INFO - step: 20945 loss: 2.7405 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.27 mfu: 49.67% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 08:00:04,740 - root - INFO - lr: 1.0468e-05 gnorm: 0.35 [1 day, 14:24:38<1 day, 10:56:40] +[titan] 2025-09-09 08:00:30,312 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:00:36,667 - root - INFO - step: 20950 loss: 2.6918 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9341 +[titan] 2025-09-09 08:00:36,668 - root - INFO - lr: 1.0464e-05 gnorm: 0.41 [1 day, 14:25:10<1 day, 10:56:06] +[titan] 2025-09-09 08:01:08,427 - root - INFO - step: 20955 loss: 2.6915 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.73 mfu: 49.72% global_avg_ntp_loss: 0.7568 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 08:01:08,427 - root - INFO - lr: 1.0461e-05 gnorm: 0.41 [1 day, 14:25:41<1 day, 10:55:32] +[titan] 2025-09-09 08:01:40,171 - root - INFO - step: 20960 loss: 2.8383 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.98 mfu: 49.75% global_avg_ntp_loss: 0.8246 global_avg_top_loss: 2.0137 +[titan] 2025-09-09 08:01:40,171 - root - INFO - lr: 1.0457e-05 gnorm: 0.42 [1 day, 14:26:13<1 day, 10:54:58] +[titan] 2025-09-09 08:02:11,879 - root - INFO - step: 20965 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.53 mfu: 49.80% global_avg_ntp_loss: 0.8007 global_avg_top_loss: 1.9876 +[titan] 2025-09-09 08:02:11,879 - root - INFO - lr: 1.0454e-05 gnorm: 0.35 [1 day, 14:26:45<1 day, 10:54:24] +[titan] 2025-09-09 08:02:43,657 - root - INFO - step: 20970 loss: 2.7243 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.45 mfu: 49.69% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9527 +[titan] 2025-09-09 08:02:43,657 - root - INFO - lr: 1.0450e-05 gnorm: 0.35 [1 day, 14:27:17<1 day, 10:53:49] +[titan] 2025-09-09 08:03:15,701 - root - INFO - step: 20975 loss: 2.8051 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.8077 global_avg_top_loss: 1.9973 +[titan] 2025-09-09 08:03:15,701 - root - INFO - lr: 1.0447e-05 gnorm: 0.61 [1 day, 14:27:49<1 day, 10:53:16] +[titan] 2025-09-09 08:03:47,823 - root - INFO - step: 20980 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 0.8093 
global_avg_top_loss: 1.9988 +[titan] 2025-09-09 08:03:47,823 - root - INFO - lr: 1.0443e-05 gnorm: 0.34 [1 day, 14:28:21<1 day, 10:52:42] +[titan] 2025-09-09 08:04:19,780 - root - INFO - step: 20985 loss: 2.7822 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.8019 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 08:04:19,780 - root - INFO - lr: 1.0440e-05 gnorm: 0.33 [1 day, 14:28:53<1 day, 10:52:08] +[titan] 2025-09-09 08:04:51,585 - root - INFO - step: 20990 loss: 2.7827 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 08:04:51,586 - root - INFO - lr: 1.0436e-05 gnorm: 0.34 [1 day, 14:29:25<1 day, 10:51:34] +[titan] 2025-09-09 08:05:04,549 - root - INFO - Dumping profiler traces at step 20992 +[titan] 2025-09-09 08:05:04,604 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-09 08:05:23,732 - root - INFO - step: 20995 loss: 2.7803 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7956 global_avg_top_loss: 1.9847 +[titan] 2025-09-09 08:05:23,732 - root - INFO - lr: 1.0432e-05 gnorm: 0.39 [1 day, 14:29:57<1 day, 10:51:00] +[titan] 2025-09-09 08:05:49,142 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:05:55,532 - root - INFO - step: 21000 loss: 2.7204 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7681 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 08:05:55,532 - root - INFO - lr: 1.0429e-05 gnorm: 0.33 [1 day, 14:30:29<1 day, 10:50:26] +[titan] 2025-09-09 08:06:27,528 - root - INFO - step: 21005 loss: 2.9063 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.8637 global_avg_top_loss: 2.0426 +[titan] 2025-09-09 08:06:27,528 - root - INFO - lr: 1.0425e-05 gnorm: 0.39 [1 day, 14:31:01<1 day, 10:49:52] +[titan] 2025-09-09 08:06:59,334 - root - INFO - step: 21010 loss: 2.8169 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.8137 global_avg_top_loss: 2.0032 +[titan] 2025-09-09 08:06:59,334 - root - INFO - lr: 1.0422e-05 gnorm: 0.37 [1 day, 14:31:32<1 day, 10:49:18] +[titan] 2025-09-09 08:07:31,218 - root - INFO - step: 21015 loss: 3.2978 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 1.0833 global_avg_top_loss: 2.2144 +[titan] 2025-09-09 08:07:31,218 - root - INFO - lr: 1.0418e-05 gnorm: 0.34 [1 day, 14:32:04<1 day, 10:48:44] +[titan] 2025-09-09 08:08:03,277 - root - INFO - step: 21020 loss: 2.8163 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.26% global_avg_ntp_loss: 0.8144 global_avg_top_loss: 2.0019 +[titan] 2025-09-09 08:08:03,278 - root - INFO - lr: 1.0415e-05 gnorm: 0.34 [1 day, 14:32:36<1 day, 10:48:10] +[titan] 2025-09-09 08:08:35,226 - root - INFO - step: 21025 loss: 2.8080 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.8089 global_avg_top_loss: 1.9991 +[titan] 2025-09-09 08:08:35,226 - root - INFO - lr: 1.0411e-05 gnorm: 0.34 [1 day, 14:33:08<1 day, 10:47:36] +[titan] 2025-09-09 08:09:07,117 - root - INFO - step: 21030 loss: 2.7775 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8056 
global_avg_top_loss: 1.9720 +[titan] 2025-09-09 08:09:07,117 - root - INFO - lr: 1.0407e-05 gnorm: 0.34 [1 day, 14:33:40<1 day, 10:47:02] +[titan] 2025-09-09 08:09:39,062 - root - INFO - step: 21035 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9757 +[titan] 2025-09-09 08:09:39,062 - root - INFO - lr: 1.0404e-05 gnorm: 0.34 [1 day, 14:34:12<1 day, 10:46:28] +[titan] 2025-09-09 08:10:11,008 - root - INFO - step: 21040 loss: 2.7821 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.7995 global_avg_top_loss: 1.9826 +[titan] 2025-09-09 08:10:11,008 - root - INFO - lr: 1.0400e-05 gnorm: 0.35 [1 day, 14:34:44<1 day, 10:45:54] +[titan] 2025-09-09 08:10:42,828 - root - INFO - step: 21045 loss: 2.7440 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.80 mfu: 49.63% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 08:10:42,829 - root - INFO - lr: 1.0397e-05 gnorm: 0.33 [1 day, 14:35:16<1 day, 10:45:20] +[titan] 2025-09-09 08:11:08,408 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:11:14,825 - root - INFO - step: 21050 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7832 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 08:11:14,825 - root - INFO - lr: 1.0393e-05 gnorm: 0.33 [1 day, 14:35:48<1 day, 10:44:46] +[titan] 2025-09-09 08:11:46,976 - root - INFO - step: 21055 loss: 2.6952 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7572 global_avg_top_loss: 1.9380 +[titan] 2025-09-09 08:11:46,977 - root - INFO - lr: 1.0390e-05 gnorm: 0.34 [1 day, 14:36:20<1 day, 10:44:12] +[titan] 2025-09-09 08:12:18,872 - root - INFO - step: 21060 loss: 2.6235 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7295 global_avg_top_loss: 1.8941 +[titan] 2025-09-09 08:12:18,872 - root - INFO - lr: 1.0386e-05 gnorm: 0.45 [1 day, 14:36:52<1 day, 10:43:38] +[titan] 2025-09-09 08:12:50,791 - root - INFO - step: 21065 loss: 2.7581 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9709 +[titan] 2025-09-09 08:12:50,791 - root - INFO - lr: 1.0383e-05 gnorm: 0.34 [1 day, 14:37:24<1 day, 10:43:04] +[titan] 2025-09-09 08:13:22,704 - root - INFO - step: 21070 loss: 2.6939 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9391 +[titan] 2025-09-09 08:13:22,704 - root - INFO - lr: 1.0379e-05 gnorm: 0.34 [1 day, 14:37:56<1 day, 10:42:30] +[titan] 2025-09-09 08:13:54,590 - root - INFO - step: 21075 loss: 2.7878 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.8016 global_avg_top_loss: 1.9862 +[titan] 2025-09-09 08:13:54,590 - root - INFO - lr: 1.0375e-05 gnorm: 0.34 [1 day, 14:38:28<1 day, 10:41:56] +[titan] 2025-09-09 08:14:26,459 - root - INFO - step: 21080 loss: 2.7931 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.8034 
global_avg_top_loss: 1.9896 +[titan] 2025-09-09 08:14:26,459 - root - INFO - lr: 1.0372e-05 gnorm: 0.34 [1 day, 14:38:59<1 day, 10:41:22] +[titan] 2025-09-09 08:14:58,295 - root - INFO - step: 21085 loss: 3.1784 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.9944 global_avg_top_loss: 2.1841 +[titan] 2025-09-09 08:14:58,295 - root - INFO - lr: 1.0368e-05 gnorm: 0.37 [1 day, 14:39:31<1 day, 10:40:48] +[titan] 2025-09-09 08:15:30,317 - root - INFO - step: 21090 loss: 2.8480 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.8296 global_avg_top_loss: 2.0184 +[titan] 2025-09-09 08:15:30,317 - root - INFO - lr: 1.0365e-05 gnorm: 0.32 [1 day, 14:40:03<1 day, 10:40:14] +[titan] 2025-09-09 08:16:02,066 - root - INFO - step: 21095 loss: 3.1556 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.91 mfu: 49.74% global_avg_ntp_loss: 1.0224 global_avg_top_loss: 2.1332 +[titan] 2025-09-09 08:16:02,066 - root - INFO - lr: 1.0361e-05 gnorm: 0.33 [1 day, 14:40:35<1 day, 10:39:40] +[titan] 2025-09-09 08:16:27,563 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:16:33,905 - root - INFO - step: 21100 loss: 2.8049 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.8074 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 08:16:33,905 - root - INFO - lr: 1.0358e-05 gnorm: 0.36 [1 day, 14:41:07<1 day, 10:39:06] +[titan] 2025-09-09 08:17:05,759 - root - INFO - step: 21105 loss: 2.6375 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.28 mfu: 49.57% global_avg_ntp_loss: 0.7327 global_avg_top_loss: 1.9048 +[titan] 2025-09-09 08:17:05,759 - root - INFO - lr: 1.0354e-05 gnorm: 0.33 [1 day, 14:41:39<1 day, 10:38:32] +[titan] 2025-09-09 08:17:37,475 - root - INFO - step: 21110 loss: 2.7934 memory: 122.03GiB(87.57%) tps: 10,332 tflops: 492.41 mfu: 49.79% global_avg_ntp_loss: 0.7993 global_avg_top_loss: 1.9942 +[titan] 2025-09-09 08:17:37,475 - root - INFO - lr: 1.0350e-05 gnorm: 0.34 [1 day, 14:42:10<1 day, 10:37:58] +[titan] 2025-09-09 08:18:09,514 - root - INFO - step: 21115 loss: 2.7802 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.29% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9850 +[titan] 2025-09-09 08:18:09,514 - root - INFO - lr: 1.0347e-05 gnorm: 0.33 [1 day, 14:42:43<1 day, 10:37:24] +[titan] 2025-09-09 08:18:41,343 - root - INFO - step: 21120 loss: 2.7539 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 08:18:41,343 - root - INFO - lr: 1.0343e-05 gnorm: 0.35 [1 day, 14:43:14<1 day, 10:36:50] +[titan] 2025-09-09 08:19:13,080 - root - INFO - step: 21125 loss: 3.1028 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.75% global_avg_ntp_loss: 0.9473 global_avg_top_loss: 2.1555 +[titan] 2025-09-09 08:19:13,081 - root - INFO - lr: 1.0340e-05 gnorm: 0.38 [1 day, 14:43:46<1 day, 10:36:16] +[titan] 2025-09-09 08:19:45,334 - root - INFO - step: 21130 loss: 2.7767 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.21 mfu: 48.96% global_avg_ntp_loss: 0.7932 
global_avg_top_loss: 1.9835 +[titan] 2025-09-09 08:19:45,334 - root - INFO - lr: 1.0336e-05 gnorm: 0.34 [1 day, 14:44:18<1 day, 10:35:42] +[titan] 2025-09-09 08:20:17,484 - root - INFO - step: 21135 loss: 2.7999 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.76 mfu: 49.12% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9965 +[titan] 2025-09-09 08:20:17,484 - root - INFO - lr: 1.0333e-05 gnorm: 0.33 [1 day, 14:44:50<1 day, 10:35:08] +[titan] 2025-09-09 08:20:49,385 - root - INFO - step: 21140 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7072 global_avg_top_loss: 1.8832 +[titan] 2025-09-09 08:20:49,386 - root - INFO - lr: 1.0329e-05 gnorm: 0.43 [1 day, 14:45:22<1 day, 10:34:35] +[titan] 2025-09-09 08:21:21,367 - root - INFO - step: 21145 loss: 3.1811 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.38% global_avg_ntp_loss: 1.0347 global_avg_top_loss: 2.1463 +[titan] 2025-09-09 08:21:21,367 - root - INFO - lr: 1.0326e-05 gnorm: 0.34 [1 day, 14:45:54<1 day, 10:34:01] +[titan] 2025-09-09 08:21:46,893 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:21:53,350 - root - INFO - step: 21150 loss: 2.8332 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.8192 global_avg_top_loss: 2.0140 +[titan] 2025-09-09 08:21:53,350 - root - INFO - lr: 1.0322e-05 gnorm: 0.37 [1 day, 14:46:26<1 day, 10:33:27] +[titan] 2025-09-09 08:22:25,342 - root - INFO - step: 21155 loss: 2.7156 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9487 +[titan] 2025-09-09 08:22:25,343 - root - INFO - lr: 1.0318e-05 gnorm: 0.37 [1 day, 14:46:58<1 day, 10:32:53] +[titan] 2025-09-09 08:22:57,111 - root - INFO - step: 21160 loss: 2.7876 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.59 mfu: 49.71% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9863 +[titan] 2025-09-09 08:22:57,111 - root - INFO - lr: 1.0315e-05 gnorm: 0.34 [1 day, 14:47:30<1 day, 10:32:19] +[titan] 2025-09-09 08:23:29,275 - root - INFO - step: 21165 loss: 3.0356 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.10% global_avg_ntp_loss: 0.9197 global_avg_top_loss: 2.1159 +[titan] 2025-09-09 08:23:29,275 - root - INFO - lr: 1.0311e-05 gnorm: 0.39 [1 day, 14:48:02<1 day, 10:31:45] +[titan] 2025-09-09 08:24:01,373 - root - INFO - step: 21170 loss: 2.7337 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 08:24:01,373 - root - INFO - lr: 1.0308e-05 gnorm: 0.34 [1 day, 14:48:34<1 day, 10:31:11] +[titan] 2025-09-09 08:24:33,246 - root - INFO - step: 21175 loss: 3.2832 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 1.0789 global_avg_top_loss: 2.2042 +[titan] 2025-09-09 08:24:33,246 - root - INFO - lr: 1.0304e-05 gnorm: 0.38 [1 day, 14:49:06<1 day, 10:30:37] +[titan] 2025-09-09 08:25:05,150 - root - INFO - step: 21180 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.8007 
global_avg_top_loss: 1.9879 +[titan] 2025-09-09 08:25:05,150 - root - INFO - lr: 1.0301e-05 gnorm: 0.38 [1 day, 14:49:38<1 day, 10:30:03] +[titan] 2025-09-09 08:25:37,105 - root - INFO - step: 21185 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 08:25:37,105 - root - INFO - lr: 1.0297e-05 gnorm: 0.32 [1 day, 14:50:10<1 day, 10:29:29] +[titan] 2025-09-09 08:26:09,013 - root - INFO - step: 21190 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9894 +[titan] 2025-09-09 08:26:09,014 - root - INFO - lr: 1.0294e-05 gnorm: 0.37 [1 day, 14:50:42<1 day, 10:28:55] +[titan] 2025-09-09 08:26:40,951 - root - INFO - step: 21195 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.7866 global_avg_top_loss: 1.9743 +[titan] 2025-09-09 08:26:40,951 - root - INFO - lr: 1.0290e-05 gnorm: 0.35 [1 day, 14:51:14<1 day, 10:28:21] +[titan] 2025-09-09 08:27:06,456 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:27:12,856 - root - INFO - step: 21200 loss: 2.8088 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.8095 global_avg_top_loss: 1.9993 +[titan] 2025-09-09 08:27:12,857 - root - INFO - lr: 1.0286e-05 gnorm: 0.34 [1 day, 14:51:46<1 day, 10:27:47] +[titan] 2025-09-09 08:27:44,760 - root - INFO - step: 21205 loss: 2.8327 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.8227 global_avg_top_loss: 2.0100 +[titan] 2025-09-09 08:27:44,760 - root - INFO - lr: 1.0283e-05 gnorm: 0.36 [1 day, 14:52:18<1 day, 10:27:13] +[titan] 2025-09-09 08:28:16,645 - root - INFO - step: 21210 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 08:28:16,645 - root - INFO - lr: 1.0279e-05 gnorm: 0.37 [1 day, 14:52:50<1 day, 10:26:39] +[titan] 2025-09-09 08:28:48,497 - root - INFO - step: 21215 loss: 2.7985 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.8044 global_avg_top_loss: 1.9941 +[titan] 2025-09-09 08:28:48,498 - root - INFO - lr: 1.0276e-05 gnorm: 0.41 [1 day, 14:53:22<1 day, 10:26:05] +[titan] 2025-09-09 08:29:20,625 - root - INFO - step: 21220 loss: 2.7330 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.10 mfu: 49.15% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9564 +[titan] 2025-09-09 08:29:20,625 - root - INFO - lr: 1.0272e-05 gnorm: 0.64 [1 day, 14:53:54<1 day, 10:25:32] +[titan] 2025-09-09 08:29:52,376 - root - INFO - step: 21225 loss: 3.2566 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 1.0652 global_avg_top_loss: 2.1914 +[titan] 2025-09-09 08:29:52,376 - root - INFO - lr: 1.0269e-05 gnorm: 0.35 [1 day, 14:54:25<1 day, 10:24:58] +[titan] 2025-09-09 08:30:24,267 - root - INFO - step: 21230 loss: 2.7619 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.7897 
global_avg_top_loss: 1.9723 +[titan] 2025-09-09 08:30:24,267 - root - INFO - lr: 1.0265e-05 gnorm: 0.38 [1 day, 14:54:57<1 day, 10:24:24] +[titan] 2025-09-09 08:30:56,119 - root - INFO - step: 21235 loss: 2.7596 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9753 +[titan] 2025-09-09 08:30:56,119 - root - INFO - lr: 1.0262e-05 gnorm: 0.36 [1 day, 14:55:29<1 day, 10:23:50] +[titan] 2025-09-09 08:31:27,996 - root - INFO - step: 21240 loss: 2.6339 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7300 global_avg_top_loss: 1.9039 +[titan] 2025-09-09 08:31:27,996 - root - INFO - lr: 1.0258e-05 gnorm: 0.37 [1 day, 14:56:01<1 day, 10:23:16] +[titan] 2025-09-09 08:31:59,921 - root - INFO - step: 21245 loss: 2.8927 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.8453 global_avg_top_loss: 2.0474 +[titan] 2025-09-09 08:31:59,922 - root - INFO - lr: 1.0254e-05 gnorm: 0.36 [1 day, 14:56:33<1 day, 10:22:42] +[titan] 2025-09-09 08:32:25,411 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:32:31,783 - root - INFO - step: 21250 loss: 2.6490 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.7374 global_avg_top_loss: 1.9116 +[titan] 2025-09-09 08:32:31,783 - root - INFO - lr: 1.0251e-05 gnorm: 0.37 [1 day, 14:57:05<1 day, 10:22:08] +[titan] 2025-09-09 08:33:03,883 - root - INFO - step: 21255 loss: 2.9289 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.51 mfu: 49.19% global_avg_ntp_loss: 0.8814 global_avg_top_loss: 2.0475 +[titan] 2025-09-09 08:33:03,883 - root - INFO - lr: 1.0247e-05 gnorm: 0.39 [1 day, 14:57:37<1 day, 10:21:34] +[titan] 2025-09-09 08:33:35,861 - root - INFO - step: 21260 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9663 +[titan] 2025-09-09 08:33:35,861 - root - INFO - lr: 1.0244e-05 gnorm: 0.41 [1 day, 14:58:09<1 day, 10:21:00] +[titan] 2025-09-09 08:34:07,792 - root - INFO - step: 21265 loss: 2.7694 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 08:34:07,792 - root - INFO - lr: 1.0240e-05 gnorm: 0.35 [1 day, 14:58:41<1 day, 10:20:26] +[titan] 2025-09-09 08:34:39,695 - root - INFO - step: 21270 loss: 3.2825 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 1.0775 global_avg_top_loss: 2.2050 +[titan] 2025-09-09 08:34:39,695 - root - INFO - lr: 1.0237e-05 gnorm: 0.36 [1 day, 14:59:13<1 day, 10:19:52] +[titan] 2025-09-09 08:35:11,642 - root - INFO - step: 21275 loss: 2.7718 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 08:35:11,643 - root - INFO - lr: 1.0233e-05 gnorm: 0.34 [1 day, 14:59:45<1 day, 10:19:18] +[titan] 2025-09-09 08:35:43,578 - root - INFO - step: 21280 loss: 2.7849 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.8014 
global_avg_top_loss: 1.9835 +[titan] 2025-09-09 08:35:43,579 - root - INFO - lr: 1.0229e-05 gnorm: 0.38 [1 day, 15:00:17<1 day, 10:18:44] +[titan] 2025-09-09 08:36:15,842 - root - INFO - step: 21285 loss: 2.8401 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.05 mfu: 48.94% global_avg_ntp_loss: 0.8238 global_avg_top_loss: 2.0163 +[titan] 2025-09-09 08:36:15,842 - root - INFO - lr: 1.0226e-05 gnorm: 0.42 [1 day, 15:00:49<1 day, 10:18:11] +[titan] 2025-09-09 08:36:47,469 - root - INFO - step: 21290 loss: 3.1267 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.79 mfu: 49.93% global_avg_ntp_loss: 0.9904 global_avg_top_loss: 2.1363 +[titan] 2025-09-09 08:36:47,470 - root - INFO - lr: 1.0222e-05 gnorm: 0.48 [1 day, 15:01:20<1 day, 10:17:36] +[titan] 2025-09-09 08:37:19,400 - root - INFO - step: 21295 loss: 2.7485 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7810 global_avg_top_loss: 1.9674 +[titan] 2025-09-09 08:37:19,400 - root - INFO - lr: 1.0219e-05 gnorm: 0.34 [1 day, 15:01:52<1 day, 10:17:03] +[titan] 2025-09-09 08:37:44,909 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:37:51,307 - root - INFO - step: 21300 loss: 2.6898 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7528 global_avg_top_loss: 1.9370 +[titan] 2025-09-09 08:37:51,307 - root - INFO - lr: 1.0215e-05 gnorm: 0.63 [1 day, 15:02:24<1 day, 10:16:29] +[titan] 2025-09-09 08:38:23,328 - root - INFO - step: 21305 loss: 3.2138 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.0453 global_avg_top_loss: 2.1684 +[titan] 2025-09-09 08:38:23,328 - root - INFO - lr: 1.0212e-05 gnorm: 0.33 [1 day, 15:02:56<1 day, 10:15:55] +[titan] 2025-09-09 08:38:55,457 - root - INFO - step: 21310 loss: 2.6523 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.08 mfu: 49.15% global_avg_ntp_loss: 0.7390 global_avg_top_loss: 1.9133 +[titan] 2025-09-09 08:38:55,457 - root - INFO - lr: 1.0208e-05 gnorm: 0.33 [1 day, 15:03:28<1 day, 10:15:21] +[titan] 2025-09-09 08:39:27,267 - root - INFO - step: 21315 loss: 2.7725 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.7902 global_avg_top_loss: 1.9823 +[titan] 2025-09-09 08:39:27,267 - root - INFO - lr: 1.0205e-05 gnorm: 0.38 [1 day, 15:04:00<1 day, 10:14:47] +[titan] 2025-09-09 08:39:59,304 - root - INFO - step: 21320 loss: 3.2155 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 1.0474 global_avg_top_loss: 2.1681 +[titan] 2025-09-09 08:39:59,304 - root - INFO - lr: 1.0201e-05 gnorm: 0.37 [1 day, 15:04:32<1 day, 10:14:13] +[titan] 2025-09-09 08:40:31,203 - root - INFO - step: 21325 loss: 2.7910 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.8014 global_avg_top_loss: 1.9895 +[titan] 2025-09-09 08:40:31,203 - root - INFO - lr: 1.0197e-05 gnorm: 0.35 [1 day, 15:05:04<1 day, 10:13:39] +[titan] 2025-09-09 08:41:02,867 - root - INFO - step: 21330 loss: 2.7792 memory: 122.03GiB(87.57%) tps: 10,349 tflops: 493.21 mfu: 49.87% global_avg_ntp_loss: 0.7908 
global_avg_top_loss: 1.9884 +[titan] 2025-09-09 08:41:02,867 - root - INFO - lr: 1.0194e-05 gnorm: 0.75 [1 day, 15:05:36<1 day, 10:13:05] +[titan] 2025-09-09 08:41:34,736 - root - INFO - step: 21335 loss: 2.8769 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.8415 global_avg_top_loss: 2.0354 +[titan] 2025-09-09 08:41:34,736 - root - INFO - lr: 1.0190e-05 gnorm: 0.36 [1 day, 15:06:08<1 day, 10:12:31] +[titan] 2025-09-09 08:42:06,671 - root - INFO - step: 21340 loss: 2.8258 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.8173 global_avg_top_loss: 2.0085 +[titan] 2025-09-09 08:42:06,671 - root - INFO - lr: 1.0187e-05 gnorm: 0.35 [1 day, 15:06:40<1 day, 10:11:57] +[titan] 2025-09-09 08:42:39,088 - root - INFO - step: 21345 loss: 2.7732 memory: 122.03GiB(87.57%) tps: 10,108 tflops: 481.76 mfu: 48.71% global_avg_ntp_loss: 0.7933 global_avg_top_loss: 1.9799 +[titan] 2025-09-09 08:42:39,089 - root - INFO - lr: 1.0183e-05 gnorm: 0.35 [1 day, 15:07:12<1 day, 10:11:24] +[titan] 2025-09-09 08:43:04,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:43:10,999 - root - INFO - step: 21350 loss: 3.1695 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 1.0284 global_avg_top_loss: 2.1411 +[titan] 2025-09-09 08:43:10,999 - root - INFO - lr: 1.0180e-05 gnorm: 0.44 [1 day, 15:07:44<1 day, 10:10:50] +[titan] 2025-09-09 08:43:43,087 - root - INFO - step: 21355 loss: 2.8161 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.8140 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 08:43:43,087 - root - INFO - lr: 1.0176e-05 gnorm: 0.34 [1 day, 15:08:16<1 day, 10:10:16] +[titan] 2025-09-09 08:44:15,025 - root - INFO - step: 21360 loss: 2.8946 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.98 mfu: 49.44% global_avg_ntp_loss: 0.8617 global_avg_top_loss: 2.0329 +[titan] 2025-09-09 08:44:15,025 - root - INFO - lr: 1.0173e-05 gnorm: 0.53 [1 day, 15:08:48<1 day, 10:09:42] +[titan] 2025-09-09 08:44:47,037 - root - INFO - step: 21365 loss: 2.8272 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.8180 global_avg_top_loss: 2.0092 +[titan] 2025-09-09 08:44:47,037 - root - INFO - lr: 1.0169e-05 gnorm: 0.37 [1 day, 15:09:20<1 day, 10:09:08] +[titan] 2025-09-09 08:45:19,178 - root - INFO - step: 21370 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 08:45:19,178 - root - INFO - lr: 1.0165e-05 gnorm: 0.58 [1 day, 15:09:52<1 day, 10:08:34] +[titan] 2025-09-09 08:45:51,007 - root - INFO - step: 21375 loss: 2.6926 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9364 +[titan] 2025-09-09 08:45:51,007 - root - INFO - lr: 1.0162e-05 gnorm: 0.35 [1 day, 15:10:24<1 day, 10:08:00] +[titan] 2025-09-09 08:46:22,977 - root - INFO - step: 21380 loss: 3.1665 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 1.0248 
global_avg_top_loss: 2.1417 +[titan] 2025-09-09 08:46:22,977 - root - INFO - lr: 1.0158e-05 gnorm: 0.38 [1 day, 15:10:56<1 day, 10:07:27] +[titan] 2025-09-09 08:46:54,724 - root - INFO - step: 21385 loss: 3.1693 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.93 mfu: 49.74% global_avg_ntp_loss: 1.0278 global_avg_top_loss: 2.1416 +[titan] 2025-09-09 08:46:54,724 - root - INFO - lr: 1.0155e-05 gnorm: 0.38 [1 day, 15:11:28<1 day, 10:06:53] +[titan] 2025-09-09 08:47:26,715 - root - INFO - step: 21390 loss: 2.7132 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9475 +[titan] 2025-09-09 08:47:26,715 - root - INFO - lr: 1.0151e-05 gnorm: 0.35 [1 day, 15:12:00<1 day, 10:06:19] +[titan] 2025-09-09 08:47:58,746 - root - INFO - step: 21395 loss: 2.8333 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.56 mfu: 49.30% global_avg_ntp_loss: 0.8238 global_avg_top_loss: 2.0096 +[titan] 2025-09-09 08:47:58,746 - root - INFO - lr: 1.0148e-05 gnorm: 0.38 [1 day, 15:12:32<1 day, 10:05:45] +[titan] 2025-09-09 08:48:24,377 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:48:30,698 - root - INFO - step: 21400 loss: 3.2217 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 1.0531 global_avg_top_loss: 2.1686 +[titan] 2025-09-09 08:48:30,698 - root - INFO - lr: 1.0144e-05 gnorm: 0.49 [1 day, 15:13:04<1 day, 10:05:11] +[titan] 2025-09-09 08:49:02,771 - root - INFO - step: 21405 loss: 2.7985 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.8028 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 08:49:02,771 - root - INFO - lr: 1.0141e-05 gnorm: 0.38 [1 day, 15:13:36<1 day, 10:04:37] +[titan] 2025-09-09 08:49:34,716 - root - INFO - step: 21410 loss: 2.7025 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.7627 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 08:49:34,717 - root - INFO - lr: 1.0137e-05 gnorm: 0.39 [1 day, 15:14:08<1 day, 10:04:03] +[titan] 2025-09-09 08:50:06,822 - root - INFO - step: 21415 loss: 3.2043 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 1.0499 global_avg_top_loss: 2.1544 +[titan] 2025-09-09 08:50:06,822 - root - INFO - lr: 1.0134e-05 gnorm: 0.41 [1 day, 15:14:40<1 day, 10:03:30] +[titan] 2025-09-09 08:50:38,963 - root - INFO - step: 21420 loss: 2.7892 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.89 mfu: 49.13% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9887 +[titan] 2025-09-09 08:50:38,964 - root - INFO - lr: 1.0130e-05 gnorm: 0.41 [1 day, 15:15:12<1 day, 10:02:56] +[titan] 2025-09-09 08:51:10,695 - root - INFO - step: 21425 loss: 2.8082 memory: 122.03GiB(87.57%) tps: 10,327 tflops: 492.16 mfu: 49.76% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9990 +[titan] 2025-09-09 08:51:10,696 - root - INFO - lr: 1.0126e-05 gnorm: 0.33 [1 day, 15:15:44<1 day, 10:02:22] +[titan] 2025-09-09 08:51:42,687 - root - INFO - step: 21430 loss: 3.2729 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 1.0723 
global_avg_top_loss: 2.2005 +[titan] 2025-09-09 08:51:42,687 - root - INFO - lr: 1.0123e-05 gnorm: 0.44 [1 day, 15:16:16<1 day, 10:01:48] +[titan] 2025-09-09 08:52:14,594 - root - INFO - step: 21435 loss: 2.9409 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.8806 global_avg_top_loss: 2.0603 +[titan] 2025-09-09 08:52:14,594 - root - INFO - lr: 1.0119e-05 gnorm: 0.36 [1 day, 15:16:48<1 day, 10:01:14] +[titan] 2025-09-09 08:52:46,674 - root - INFO - step: 21440 loss: 2.7742 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.7939 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 08:52:46,674 - root - INFO - lr: 1.0116e-05 gnorm: 0.34 [1 day, 15:17:20<1 day, 10:00:40] +[titan] 2025-09-09 08:53:18,594 - root - INFO - step: 21445 loss: 2.9764 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.9063 global_avg_top_loss: 2.0701 +[titan] 2025-09-09 08:53:18,594 - root - INFO - lr: 1.0112e-05 gnorm: 0.37 [1 day, 15:17:52<1 day, 10:00:06] +[titan] 2025-09-09 08:53:44,001 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:53:50,615 - root - INFO - step: 21450 loss: 2.8811 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.8518 global_avg_top_loss: 2.0293 +[titan] 2025-09-09 08:53:50,615 - root - INFO - lr: 1.0109e-05 gnorm: 0.59 [1 day, 15:18:24<1 day, 9:59:33] +[titan] 2025-09-09 08:54:23,030 - root - INFO - step: 21455 loss: 2.7564 memory: 122.03GiB(87.57%) tps: 10,109 tflops: 481.79 mfu: 48.71% global_avg_ntp_loss: 0.7852 global_avg_top_loss: 1.9713 +[titan] 2025-09-09 08:54:23,030 - root - INFO - lr: 1.0105e-05 gnorm: 0.35 [1 day, 15:18:56<1 day, 9:58:59] +[titan] 2025-09-09 08:54:55,019 - root - INFO - step: 21460 loss: 2.7226 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 08:54:55,019 - root - INFO - lr: 1.0102e-05 gnorm: 0.34 [1 day, 15:19:28<1 day, 9:58:25] +[titan] 2025-09-09 08:55:27,040 - root - INFO - step: 21465 loss: 2.7373 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 08:55:27,041 - root - INFO - lr: 1.0098e-05 gnorm: 0.33 [1 day, 15:20:00<1 day, 9:57:51] +[titan] 2025-09-09 08:55:59,128 - root - INFO - step: 21470 loss: 2.7267 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 08:55:59,128 - root - INFO - lr: 1.0094e-05 gnorm: 0.34 [1 day, 15:20:32<1 day, 9:57:18] +[titan] 2025-09-09 08:56:30,923 - root - INFO - step: 21475 loss: 2.7791 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7971 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 08:56:30,924 - root - INFO - lr: 1.0091e-05 gnorm: 0.33 [1 day, 15:21:04<1 day, 9:56:44] +[titan] 2025-09-09 08:57:03,162 - root - INFO - step: 21480 loss: 3.2365 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.43 mfu: 48.98% global_avg_ntp_loss: 1.0599 
global_avg_top_loss: 2.1767 +[titan] 2025-09-09 08:57:03,162 - root - INFO - lr: 1.0087e-05 gnorm: 0.36 [1 day, 15:21:36<1 day, 9:56:10] +[titan] 2025-09-09 08:57:35,295 - root - INFO - step: 21485 loss: 2.8410 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.8233 global_avg_top_loss: 2.0177 +[titan] 2025-09-09 08:57:35,295 - root - INFO - lr: 1.0084e-05 gnorm: 0.33 [1 day, 15:22:08<1 day, 9:55:36] +[titan] 2025-09-09 08:58:07,572 - root - INFO - step: 21490 loss: 2.7830 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.85 mfu: 48.92% global_avg_ntp_loss: 0.8010 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 08:58:07,572 - root - INFO - lr: 1.0080e-05 gnorm: 0.39 [1 day, 15:22:41<1 day, 9:55:03] +[titan] 2025-09-09 08:58:39,731 - root - INFO - step: 21495 loss: 3.1936 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 1.0459 global_avg_top_loss: 2.1477 +[titan] 2025-09-09 08:58:39,731 - root - INFO - lr: 1.0077e-05 gnorm: 0.45 [1 day, 15:23:13<1 day, 9:54:29] +[titan] 2025-09-09 08:59:05,214 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:59:11,694 - root - INFO - step: 21500 loss: 2.7727 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9801 +[titan] 2025-09-09 08:59:11,694 - root - INFO - lr: 1.0073e-05 gnorm: 0.34 [1 day, 15:23:45<1 day, 9:53:55] +[titan] 2025-09-09 08:59:37,595 - root - INFO - Dumping profiler traces at step 21504 +[titan] 2025-09-09 08:59:37,663 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 08:59:43,998 - root - INFO - step: 21505 loss: 2.8412 memory: 122.03GiB(87.57%) tps: 10,144 tflops: 483.44 mfu: 48.88% global_avg_ntp_loss: 0.8224 global_avg_top_loss: 2.0188 +[titan] 2025-09-09 08:59:43,999 - root - INFO - lr: 1.0070e-05 gnorm: 0.35 [1 day, 15:24:17<1 day, 9:53:22] +[titan] 2025-09-09 09:00:16,226 - root - INFO - step: 21510 loss: 3.1718 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 1.0284 global_avg_top_loss: 2.1434 +[titan] 2025-09-09 09:00:16,226 - root - INFO - lr: 1.0066e-05 gnorm: 0.53 [1 day, 15:24:49<1 day, 9:52:48] +[titan] 2025-09-09 09:00:48,288 - root - INFO - step: 21515 loss: 2.8465 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.8217 global_avg_top_loss: 2.0248 +[titan] 2025-09-09 09:00:48,288 - root - INFO - lr: 1.0062e-05 gnorm: 0.36 [1 day, 15:25:21<1 day, 9:52:14] +[titan] 2025-09-09 09:01:20,370 - root - INFO - step: 21520 loss: 2.7681 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9780 +[titan] 2025-09-09 09:01:20,370 - root - INFO - lr: 1.0059e-05 gnorm: 0.36 [1 day, 15:25:53<1 day, 9:51:40] +[titan] 2025-09-09 09:01:52,291 - root - INFO - step: 21525 loss: 2.8162 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.8160 global_avg_top_loss: 2.0002 +[titan] 2025-09-09 09:01:52,291 - root - INFO - lr: 1.0055e-05 gnorm: 0.38 [1 day, 15:26:25<1 day, 
9:51:06] +[titan] 2025-09-09 09:02:24,640 - root - INFO - step: 21530 loss: 3.1600 memory: 122.03GiB(87.57%) tps: 10,130 tflops: 482.78 mfu: 48.81% global_avg_ntp_loss: 1.0201 global_avg_top_loss: 2.1399 +[titan] 2025-09-09 09:02:24,640 - root - INFO - lr: 1.0052e-05 gnorm: 0.34 [1 day, 15:26:58<1 day, 9:50:33] +[titan] 2025-09-09 09:02:56,772 - root - INFO - step: 21535 loss: 2.8205 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.8149 global_avg_top_loss: 2.0056 +[titan] 2025-09-09 09:02:56,773 - root - INFO - lr: 1.0048e-05 gnorm: 0.34 [1 day, 15:27:30<1 day, 9:49:59] +[titan] 2025-09-09 09:03:28,861 - root - INFO - step: 21540 loss: 2.8180 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.8195 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 09:03:28,861 - root - INFO - lr: 1.0045e-05 gnorm: 0.34 [1 day, 15:28:02<1 day, 9:49:26] +[titan] 2025-09-09 09:04:00,701 - root - INFO - step: 21545 loss: 2.6384 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.7275 global_avg_top_loss: 1.9108 +[titan] 2025-09-09 09:04:00,701 - root - INFO - lr: 1.0041e-05 gnorm: 0.46 [1 day, 15:28:34<1 day, 9:48:52] +[titan] 2025-09-09 09:04:26,261 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:04:32,679 - root - INFO - step: 21550 loss: 2.8645 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.8352 global_avg_top_loss: 2.0293 +[titan] 2025-09-09 09:04:32,679 - root - INFO - lr: 1.0038e-05 gnorm: 0.34 [1 day, 15:29:06<1 day, 9:48:18] +[titan] 2025-09-09 09:05:04,603 - root - INFO - step: 21555 loss: 2.7747 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 09:05:04,603 - root - INFO - lr: 1.0034e-05 gnorm: 0.34 [1 day, 15:29:38<1 day, 9:47:44] +[titan] 2025-09-09 09:05:36,651 - root - INFO - step: 21560 loss: 3.2656 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 1.0721 global_avg_top_loss: 2.1935 +[titan] 2025-09-09 09:05:36,651 - root - INFO - lr: 1.0031e-05 gnorm: 0.37 [1 day, 15:30:10<1 day, 9:47:10] +[titan] 2025-09-09 09:06:08,694 - root - INFO - step: 21565 loss: 2.8106 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 2.0091 +[titan] 2025-09-09 09:06:08,694 - root - INFO - lr: 1.0027e-05 gnorm: 0.47 [1 day, 15:30:42<1 day, 9:46:36] +[titan] 2025-09-09 09:06:40,846 - root - INFO - step: 21570 loss: 2.7767 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7938 global_avg_top_loss: 1.9830 +[titan] 2025-09-09 09:06:40,846 - root - INFO - lr: 1.0023e-05 gnorm: 0.33 [1 day, 15:31:14<1 day, 9:46:03] +[titan] 2025-09-09 09:07:12,820 - root - INFO - step: 21575 loss: 3.2102 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 1.0477 global_avg_top_loss: 2.1625 +[titan] 2025-09-09 09:07:12,820 - root - INFO - lr: 1.0020e-05 gnorm: 0.36 [1 day, 15:31:46<1 day, 9:45:29] +[titan] 2025-09-09 09:07:44,834 - root - INFO - step: 21580 loss: 2.8018 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7931 
global_avg_top_loss: 2.0087 +[titan] 2025-09-09 09:07:44,834 - root - INFO - lr: 1.0016e-05 gnorm: 1.13 [1 day, 15:32:18<1 day, 9:44:55] +[titan] 2025-09-09 09:08:16,905 - root - INFO - step: 21585 loss: 2.8776 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.8386 global_avg_top_loss: 2.0390 +[titan] 2025-09-09 09:08:16,905 - root - INFO - lr: 1.0013e-05 gnorm: 0.37 [1 day, 15:32:50<1 day, 9:44:21] +[titan] 2025-09-09 09:08:48,956 - root - INFO - step: 21590 loss: 2.7611 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 09:08:48,956 - root - INFO - lr: 1.0009e-05 gnorm: 0.35 [1 day, 15:33:22<1 day, 9:43:47] +[titan] 2025-09-09 09:09:20,964 - root - INFO - step: 21595 loss: 2.6858 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9303 +[titan] 2025-09-09 09:09:20,964 - root - INFO - lr: 1.0006e-05 gnorm: 0.34 [1 day, 15:33:54<1 day, 9:43:14] +[titan] 2025-09-09 09:09:46,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:09:53,062 - root - INFO - step: 21600 loss: 2.7319 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.20% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9565 +[titan] 2025-09-09 09:09:53,062 - root - INFO - lr: 1.0002e-05 gnorm: 0.34 [1 day, 15:34:26<1 day, 9:42:40] +[titan] 2025-09-09 09:10:25,170 - root - INFO - step: 21605 loss: 2.8199 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.8112 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 09:10:25,171 - root - INFO - lr: 9.9986e-06 gnorm: 0.34 [1 day, 15:34:58<1 day, 9:42:06] +[titan] 2025-09-09 09:10:57,198 - root - INFO - step: 21610 loss: 3.2078 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 1.0454 global_avg_top_loss: 2.1623 +[titan] 2025-09-09 09:10:57,198 - root - INFO - lr: 9.9950e-06 gnorm: 0.34 [1 day, 15:35:30<1 day, 9:41:32] +[titan] 2025-09-09 09:11:29,325 - root - INFO - step: 21615 loss: 2.8016 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.12 mfu: 49.15% global_avg_ntp_loss: 0.8056 global_avg_top_loss: 1.9959 +[titan] 2025-09-09 09:11:29,325 - root - INFO - lr: 9.9915e-06 gnorm: 0.36 [1 day, 15:36:02<1 day, 9:40:59] +[titan] 2025-09-09 09:12:01,269 - root - INFO - step: 21620 loss: 2.7403 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7775 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 09:12:01,269 - root - INFO - lr: 9.9879e-06 gnorm: 0.36 [1 day, 15:36:34<1 day, 9:40:25] +[titan] 2025-09-09 09:12:33,187 - root - INFO - step: 21625 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 09:12:33,188 - root - INFO - lr: 9.9844e-06 gnorm: 0.35 [1 day, 15:37:06<1 day, 9:39:51] +[titan] 2025-09-09 09:13:04,992 - root - INFO - step: 21630 loss: 2.7973 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.8051 
global_avg_top_loss: 1.9922 +[titan] 2025-09-09 09:13:04,992 - root - INFO - lr: 9.9808e-06 gnorm: 0.34 [1 day, 15:37:38<1 day, 9:39:17] +[titan] 2025-09-09 09:13:37,257 - root - INFO - step: 21635 loss: 2.8207 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.03 mfu: 48.94% global_avg_ntp_loss: 0.8149 global_avg_top_loss: 2.0058 +[titan] 2025-09-09 09:13:37,258 - root - INFO - lr: 9.9773e-06 gnorm: 0.33 [1 day, 15:38:10<1 day, 9:38:43] +[titan] 2025-09-09 09:14:09,247 - root - INFO - step: 21640 loss: 3.2058 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 1.0431 global_avg_top_loss: 2.1627 +[titan] 2025-09-09 09:14:09,247 - root - INFO - lr: 9.9737e-06 gnorm: 0.35 [1 day, 15:38:42<1 day, 9:38:10] +[titan] 2025-09-09 09:14:41,093 - root - INFO - step: 21645 loss: 2.7891 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9886 +[titan] 2025-09-09 09:14:41,094 - root - INFO - lr: 9.9702e-06 gnorm: 0.33 [1 day, 15:39:14<1 day, 9:37:36] +[titan] 2025-09-09 09:15:06,673 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:15:13,182 - root - INFO - step: 21650 loss: 2.8429 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.8245 global_avg_top_loss: 2.0184 +[titan] 2025-09-09 09:15:13,183 - root - INFO - lr: 9.9667e-06 gnorm: 0.34 [1 day, 15:39:46<1 day, 9:37:02] +[titan] 2025-09-09 09:15:45,108 - root - INFO - step: 21655 loss: 2.7378 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 09:15:45,108 - root - INFO - lr: 9.9631e-06 gnorm: 0.34 [1 day, 15:40:18<1 day, 9:36:28] +[titan] 2025-09-09 09:16:17,071 - root - INFO - step: 21660 loss: 2.7385 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 09:16:17,071 - root - INFO - lr: 9.9596e-06 gnorm: 0.33 [1 day, 15:40:50<1 day, 9:35:54] +[titan] 2025-09-09 09:16:48,949 - root - INFO - step: 21665 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9919 +[titan] 2025-09-09 09:16:48,949 - root - INFO - lr: 9.9560e-06 gnorm: 0.35 [1 day, 15:41:22<1 day, 9:35:20] +[titan] 2025-09-09 09:17:20,889 - root - INFO - step: 21670 loss: 2.8056 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.8157 global_avg_top_loss: 1.9900 +[titan] 2025-09-09 09:17:20,890 - root - INFO - lr: 9.9525e-06 gnorm: 0.33 [1 day, 15:41:54<1 day, 9:34:46] +[titan] 2025-09-09 09:17:53,055 - root - INFO - step: 21675 loss: 3.1147 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.53 mfu: 49.09% global_avg_ntp_loss: 0.9845 global_avg_top_loss: 2.1303 +[titan] 2025-09-09 09:17:53,055 - root - INFO - lr: 9.9489e-06 gnorm: 0.42 [1 day, 15:42:26<1 day, 9:34:13] +[titan] 2025-09-09 09:18:25,059 - root - INFO - step: 21680 loss: 2.8162 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.8134 
global_avg_top_loss: 2.0027 +[titan] 2025-09-09 09:18:25,060 - root - INFO - lr: 9.9454e-06 gnorm: 0.33 [1 day, 15:42:58<1 day, 9:33:39] +[titan] 2025-09-09 09:18:57,218 - root - INFO - step: 21685 loss: 2.7674 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.7897 global_avg_top_loss: 1.9777 +[titan] 2025-09-09 09:18:57,218 - root - INFO - lr: 9.9418e-06 gnorm: 0.34 [1 day, 15:43:30<1 day, 9:33:05] +[titan] 2025-09-09 09:19:29,260 - root - INFO - step: 21690 loss: 3.2513 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 1.0636 global_avg_top_loss: 2.1877 +[titan] 2025-09-09 09:19:29,260 - root - INFO - lr: 9.9383e-06 gnorm: 0.36 [1 day, 15:44:02<1 day, 9:32:32] +[titan] 2025-09-09 09:20:01,389 - root - INFO - step: 21695 loss: 2.7729 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.08 mfu: 49.15% global_avg_ntp_loss: 0.7945 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 09:20:01,389 - root - INFO - lr: 9.9347e-06 gnorm: 0.35 [1 day, 15:44:34<1 day, 9:31:58] +[titan] 2025-09-09 09:20:27,089 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:20:33,537 - root - INFO - step: 21700 loss: 3.2302 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 1.0528 global_avg_top_loss: 2.1775 +[titan] 2025-09-09 09:20:33,537 - root - INFO - lr: 9.9312e-06 gnorm: 0.39 [1 day, 15:45:06<1 day, 9:31:24] +[titan] 2025-09-09 09:21:05,525 - root - INFO - step: 21705 loss: 2.8747 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.37% global_avg_ntp_loss: 0.8379 global_avg_top_loss: 2.0368 +[titan] 2025-09-09 09:21:05,525 - root - INFO - lr: 9.9277e-06 gnorm: 0.35 [1 day, 15:45:38<1 day, 9:30:50] +[titan] 2025-09-09 09:21:37,444 - root - INFO - step: 21710 loss: 2.7781 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7988 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 09:21:37,445 - root - INFO - lr: 9.9241e-06 gnorm: 0.34 [1 day, 15:46:10<1 day, 9:30:17] +[titan] 2025-09-09 09:22:09,456 - root - INFO - step: 21715 loss: 2.7318 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 09:22:09,456 - root - INFO - lr: 9.9206e-06 gnorm: 0.34 [1 day, 15:46:42<1 day, 9:29:43] +[titan] 2025-09-09 09:22:41,500 - root - INFO - step: 21720 loss: 2.9010 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.8719 global_avg_top_loss: 2.0291 +[titan] 2025-09-09 09:22:41,500 - root - INFO - lr: 9.9170e-06 gnorm: 0.35 [1 day, 15:47:14<1 day, 9:29:09] +[titan] 2025-09-09 09:23:13,407 - root - INFO - step: 21725 loss: 2.8291 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.8200 global_avg_top_loss: 2.0091 +[titan] 2025-09-09 09:23:13,407 - root - INFO - lr: 9.9135e-06 gnorm: 0.34 [1 day, 15:47:46<1 day, 9:28:35] +[titan] 2025-09-09 09:23:45,374 - root - INFO - step: 21730 loss: 2.8107 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.8108 
global_avg_top_loss: 2.0000 +[titan] 2025-09-09 09:23:45,374 - root - INFO - lr: 9.9099e-06 gnorm: 0.46 [1 day, 15:48:18<1 day, 9:28:01] +[titan] 2025-09-09 09:24:17,342 - root - INFO - step: 21735 loss: 2.7608 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7838 global_avg_top_loss: 1.9770 +[titan] 2025-09-09 09:24:17,342 - root - INFO - lr: 9.9064e-06 gnorm: 0.33 [1 day, 15:48:50<1 day, 9:27:27] +[titan] 2025-09-09 09:24:49,459 - root - INFO - step: 21740 loss: 2.6843 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.7508 global_avg_top_loss: 1.9335 +[titan] 2025-09-09 09:24:49,459 - root - INFO - lr: 9.9028e-06 gnorm: 0.33 [1 day, 15:49:22<1 day, 9:26:54] +[titan] 2025-09-09 09:25:21,361 - root - INFO - step: 21745 loss: 2.7867 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9883 +[titan] 2025-09-09 09:25:21,362 - root - INFO - lr: 9.8993e-06 gnorm: 0.33 [1 day, 15:49:54<1 day, 9:26:20] +[titan] 2025-09-09 09:25:46,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:25:53,227 - root - INFO - step: 21750 loss: 2.7778 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9848 +[titan] 2025-09-09 09:25:53,228 - root - INFO - lr: 9.8958e-06 gnorm: 0.35 [1 day, 15:50:26<1 day, 9:25:46] +[titan] 2025-09-09 09:26:25,445 - root - INFO - step: 21755 loss: 3.2527 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.74 mfu: 49.01% global_avg_ntp_loss: 1.0648 global_avg_top_loss: 2.1878 +[titan] 2025-09-09 09:26:25,445 - root - INFO - lr: 9.8922e-06 gnorm: 0.37 [1 day, 15:50:58<1 day, 9:25:12] +[titan] 2025-09-09 09:26:57,489 - root - INFO - step: 21760 loss: 2.8124 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.8115 global_avg_top_loss: 2.0009 +[titan] 2025-09-09 09:26:57,489 - root - INFO - lr: 9.8887e-06 gnorm: 0.34 [1 day, 15:51:30<1 day, 9:24:39] +[titan] 2025-09-09 09:27:29,538 - root - INFO - step: 21765 loss: 2.7269 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9549 +[titan] 2025-09-09 09:27:29,539 - root - INFO - lr: 9.8851e-06 gnorm: 0.33 [1 day, 15:52:02<1 day, 9:24:05] +[titan] 2025-09-09 09:28:01,434 - root - INFO - step: 21770 loss: 3.2764 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 1.0767 global_avg_top_loss: 2.1997 +[titan] 2025-09-09 09:28:01,435 - root - INFO - lr: 9.8816e-06 gnorm: 0.36 [1 day, 15:52:34<1 day, 9:23:31] +[titan] 2025-09-09 09:28:33,547 - root - INFO - step: 21775 loss: 2.8499 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 0.8312 global_avg_top_loss: 2.0187 +[titan] 2025-09-09 09:28:33,547 - root - INFO - lr: 9.8781e-06 gnorm: 0.34 [1 day, 15:53:06<1 day, 9:22:57] +[titan] 2025-09-09 09:29:05,222 - root - INFO - step: 21780 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.04 mfu: 49.85% global_avg_ntp_loss: 0.8028 
global_avg_top_loss: 1.9935 +[titan] 2025-09-09 09:29:05,222 - root - INFO - lr: 9.8745e-06 gnorm: 0.35 [1 day, 15:53:38<1 day, 9:22:23] +[titan] 2025-09-09 09:29:37,192 - root - INFO - step: 21785 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.8093 global_avg_top_loss: 1.9969 +[titan] 2025-09-09 09:29:37,193 - root - INFO - lr: 9.8710e-06 gnorm: 0.34 [1 day, 15:54:10<1 day, 9:21:50] +[titan] 2025-09-09 09:30:08,984 - root - INFO - step: 21790 loss: 2.7962 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.24 mfu: 49.67% global_avg_ntp_loss: 0.8009 global_avg_top_loss: 1.9953 +[titan] 2025-09-09 09:30:08,984 - root - INFO - lr: 9.8674e-06 gnorm: 0.35 [1 day, 15:54:42<1 day, 9:21:16] +[titan] 2025-09-09 09:30:40,810 - root - INFO - step: 21795 loss: 2.7956 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.70 mfu: 49.62% global_avg_ntp_loss: 0.8059 global_avg_top_loss: 1.9897 +[titan] 2025-09-09 09:30:40,810 - root - INFO - lr: 9.8639e-06 gnorm: 0.32 [1 day, 15:55:14<1 day, 9:20:42] +[titan] 2025-09-09 09:31:06,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:31:12,680 - root - INFO - step: 21800 loss: 2.7703 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.7934 global_avg_top_loss: 1.9769 +[titan] 2025-09-09 09:31:12,680 - root - INFO - lr: 9.8603e-06 gnorm: 0.43 [1 day, 15:55:46<1 day, 9:20:08] +[titan] 2025-09-09 09:31:44,624 - root - INFO - step: 21805 loss: 2.7569 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 09:31:44,624 - root - INFO - lr: 9.8568e-06 gnorm: 0.32 [1 day, 15:56:18<1 day, 9:19:34] +[titan] 2025-09-09 09:32:16,374 - root - INFO - step: 21810 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.87 mfu: 49.73% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 09:32:16,374 - root - INFO - lr: 9.8533e-06 gnorm: 0.35 [1 day, 15:56:49<1 day, 9:19:00] +[titan] 2025-09-09 09:32:48,303 - root - INFO - step: 21815 loss: 2.6446 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 09:32:48,303 - root - INFO - lr: 9.8497e-06 gnorm: 0.33 [1 day, 15:57:21<1 day, 9:18:26] +[titan] 2025-09-09 09:33:20,058 - root - INFO - step: 21820 loss: 2.8282 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.80 mfu: 49.73% global_avg_ntp_loss: 0.8196 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 09:33:20,058 - root - INFO - lr: 9.8462e-06 gnorm: 0.35 [1 day, 15:57:53<1 day, 9:17:52] +[titan] 2025-09-09 09:33:52,089 - root - INFO - step: 21825 loss: 2.8589 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.56 mfu: 49.30% global_avg_ntp_loss: 0.8374 global_avg_top_loss: 2.0214 +[titan] 2025-09-09 09:33:52,089 - root - INFO - lr: 9.8426e-06 gnorm: 0.35 [1 day, 15:58:25<1 day, 9:17:18] +[titan] 2025-09-09 09:34:23,933 - root - INFO - step: 21830 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.7878 
global_avg_top_loss: 1.9732 +[titan] 2025-09-09 09:34:23,933 - root - INFO - lr: 9.8391e-06 gnorm: 0.33 [1 day, 15:58:57<1 day, 9:16:44] +[titan] 2025-09-09 09:34:56,009 - root - INFO - step: 21835 loss: 3.2460 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 1.0598 global_avg_top_loss: 2.1862 +[titan] 2025-09-09 09:34:56,009 - root - INFO - lr: 9.8356e-06 gnorm: 0.36 [1 day, 15:59:29<1 day, 9:16:11] +[titan] 2025-09-09 09:35:27,984 - root - INFO - step: 21840 loss: 2.8116 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.39% global_avg_ntp_loss: 0.8206 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 09:35:27,984 - root - INFO - lr: 9.8320e-06 gnorm: 0.33 [1 day, 16:00:01<1 day, 9:15:37] +[titan] 2025-09-09 09:35:59,849 - root - INFO - step: 21845 loss: 2.8150 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.55% global_avg_ntp_loss: 0.8118 global_avg_top_loss: 2.0031 +[titan] 2025-09-09 09:35:59,849 - root - INFO - lr: 9.8285e-06 gnorm: 0.34 [1 day, 16:00:33<1 day, 9:15:03] +[titan] 2025-09-09 09:36:25,267 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:36:31,658 - root - INFO - step: 21850 loss: 3.1993 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 1.0417 global_avg_top_loss: 2.1576 +[titan] 2025-09-09 09:36:31,658 - root - INFO - lr: 9.8249e-06 gnorm: 0.40 [1 day, 16:01:05<1 day, 9:14:29] +[titan] 2025-09-09 09:37:03,728 - root - INFO - step: 21855 loss: 2.8371 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0146 +[titan] 2025-09-09 09:37:03,729 - root - INFO - lr: 9.8214e-06 gnorm: 0.33 [1 day, 16:01:37<1 day, 9:13:55] +[titan] 2025-09-09 09:37:35,659 - root - INFO - step: 21860 loss: 2.8166 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.8123 global_avg_top_loss: 2.0043 +[titan] 2025-09-09 09:37:35,660 - root - INFO - lr: 9.8179e-06 gnorm: 0.35 [1 day, 16:02:09<1 day, 9:13:22] +[titan] 2025-09-09 09:38:07,597 - root - INFO - step: 21865 loss: 3.2084 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 1.0441 global_avg_top_loss: 2.1643 +[titan] 2025-09-09 09:38:07,597 - root - INFO - lr: 9.8143e-06 gnorm: 0.41 [1 day, 16:02:41<1 day, 9:12:48] +[titan] 2025-09-09 09:38:39,438 - root - INFO - step: 21870 loss: 2.7741 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9797 +[titan] 2025-09-09 09:38:39,439 - root - INFO - lr: 9.8108e-06 gnorm: 0.33 [1 day, 16:03:12<1 day, 9:12:14] +[titan] 2025-09-09 09:39:11,569 - root - INFO - step: 21875 loss: 2.7826 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.7949 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 09:39:11,570 - root - INFO - lr: 9.8072e-06 gnorm: 0.36 [1 day, 16:03:45<1 day, 9:11:40] +[titan] 2025-09-09 09:39:43,333 - root - INFO - step: 21880 loss: 2.7948 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.8035 
global_avg_top_loss: 1.9913 +[titan] 2025-09-09 09:39:43,333 - root - INFO - lr: 9.8037e-06 gnorm: 0.35 [1 day, 16:04:16<1 day, 9:11:06] +[titan] 2025-09-09 09:40:15,140 - root - INFO - step: 21885 loss: 2.8201 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0047 +[titan] 2025-09-09 09:40:15,140 - root - INFO - lr: 9.8002e-06 gnorm: 0.38 [1 day, 16:04:48<1 day, 9:10:32] +[titan] 2025-09-09 09:40:46,739 - root - INFO - step: 21890 loss: 2.7615 memory: 122.03GiB(87.57%) tps: 10,370 tflops: 494.22 mfu: 49.97% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 09:40:46,740 - root - INFO - lr: 9.7966e-06 gnorm: 0.34 [1 day, 16:05:20<1 day, 9:09:58] +[titan] 2025-09-09 09:41:18,568 - root - INFO - step: 21895 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 09:41:18,569 - root - INFO - lr: 9.7931e-06 gnorm: 0.32 [1 day, 16:05:52<1 day, 9:09:24] +[titan] 2025-09-09 09:41:43,981 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:41:50,384 - root - INFO - step: 21900 loss: 2.7632 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.87 mfu: 49.63% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 09:41:50,384 - root - INFO - lr: 9.7896e-06 gnorm: 0.36 [1 day, 16:06:23<1 day, 9:08:50] +[titan] 2025-09-09 09:42:22,285 - root - INFO - step: 21905 loss: 2.6902 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7564 global_avg_top_loss: 1.9338 +[titan] 2025-09-09 09:42:22,286 - root - INFO - lr: 9.7860e-06 gnorm: 0.34 [1 day, 16:06:55<1 day, 9:08:17] +[titan] 2025-09-09 09:42:54,214 - root - INFO - step: 21910 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 2.0017 +[titan] 2025-09-09 09:42:54,215 - root - INFO - lr: 9.7825e-06 gnorm: 0.32 [1 day, 16:07:27<1 day, 9:07:43] +[titan] 2025-09-09 09:43:26,138 - root - INFO - step: 21915 loss: 3.2417 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 1.0618 global_avg_top_loss: 2.1799 +[titan] 2025-09-09 09:43:26,138 - root - INFO - lr: 9.7789e-06 gnorm: 0.38 [1 day, 16:07:59<1 day, 9:07:09] +[titan] 2025-09-09 09:43:57,814 - root - INFO - step: 21920 loss: 2.8170 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.03 mfu: 49.85% global_avg_ntp_loss: 0.8148 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 09:43:57,814 - root - INFO - lr: 9.7754e-06 gnorm: 0.33 [1 day, 16:08:31<1 day, 9:06:35] +[titan] 2025-09-09 09:44:29,578 - root - INFO - step: 21925 loss: 2.7314 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 09:44:29,578 - root - INFO - lr: 9.7719e-06 gnorm: 0.32 [1 day, 16:09:03<1 day, 9:06:01] +[titan] 2025-09-09 09:45:01,467 - root - INFO - step: 21930 loss: 3.2231 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.74 mfu: 49.52% global_avg_ntp_loss: 1.0525 
global_avg_top_loss: 2.1706 +[titan] 2025-09-09 09:45:01,467 - root - INFO - lr: 9.7683e-06 gnorm: 0.37 [1 day, 16:09:34<1 day, 9:05:27] +[titan] 2025-09-09 09:45:33,163 - root - INFO - step: 21935 loss: 2.6797 memory: 122.03GiB(87.57%) tps: 10,338 tflops: 492.72 mfu: 49.82% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9263 +[titan] 2025-09-09 09:45:33,163 - root - INFO - lr: 9.7648e-06 gnorm: 0.32 [1 day, 16:10:06<1 day, 9:04:53] +[titan] 2025-09-09 09:46:04,881 - root - INFO - step: 21940 loss: 2.6642 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.37 mfu: 49.78% global_avg_ntp_loss: 0.7409 global_avg_top_loss: 1.9233 +[titan] 2025-09-09 09:46:04,882 - root - INFO - lr: 9.7613e-06 gnorm: 0.36 [1 day, 16:10:38<1 day, 9:04:19] +[titan] 2025-09-09 09:46:36,663 - root - INFO - step: 21945 loss: 3.2113 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.39 mfu: 49.69% global_avg_ntp_loss: 1.0472 global_avg_top_loss: 2.1641 +[titan] 2025-09-09 09:46:36,663 - root - INFO - lr: 9.7577e-06 gnorm: 0.42 [1 day, 16:11:10<1 day, 9:03:45] +[titan] 2025-09-09 09:47:02,171 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:47:08,576 - root - INFO - step: 21950 loss: 2.8186 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.8139 global_avg_top_loss: 2.0048 +[titan] 2025-09-09 09:47:08,576 - root - INFO - lr: 9.7542e-06 gnorm: 0.33 [1 day, 16:11:42<1 day, 9:03:11] +[titan] 2025-09-09 09:47:40,404 - root - INFO - step: 21955 loss: 2.8415 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.68 mfu: 49.61% global_avg_ntp_loss: 0.8217 global_avg_top_loss: 2.0198 +[titan] 2025-09-09 09:47:40,404 - root - INFO - lr: 9.7507e-06 gnorm: 0.35 [1 day, 16:12:13<1 day, 9:02:37] +[titan] 2025-09-09 09:48:12,410 - root - INFO - step: 21960 loss: 2.7756 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.94 mfu: 49.34% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9810 +[titan] 2025-09-09 09:48:12,411 - root - INFO - lr: 9.7471e-06 gnorm: 0.40 [1 day, 16:12:45<1 day, 9:02:04] +[titan] 2025-09-09 09:48:44,356 - root - INFO - step: 21965 loss: 2.7766 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9814 +[titan] 2025-09-09 09:48:44,356 - root - INFO - lr: 9.7436e-06 gnorm: 0.34 [1 day, 16:13:17<1 day, 9:01:30] +[titan] 2025-09-09 09:49:16,243 - root - INFO - step: 21970 loss: 2.8347 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.8210 global_avg_top_loss: 2.0137 +[titan] 2025-09-09 09:49:16,244 - root - INFO - lr: 9.7401e-06 gnorm: 0.33 [1 day, 16:13:49<1 day, 9:00:56] +[titan] 2025-09-09 09:49:48,135 - root - INFO - step: 21975 loss: 2.8245 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.8236 global_avg_top_loss: 2.0009 +[titan] 2025-09-09 09:49:48,135 - root - INFO - lr: 9.7365e-06 gnorm: 0.41 [1 day, 16:14:21<1 day, 9:00:22] +[titan] 2025-09-09 09:50:19,972 - root - INFO - step: 21980 loss: 2.7653 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.7893 
global_avg_top_loss: 1.9760 +[titan] 2025-09-09 09:50:19,972 - root - INFO - lr: 9.7330e-06 gnorm: 0.32 [1 day, 16:14:53<1 day, 8:59:48] +[titan] 2025-09-09 09:50:51,599 - root - INFO - step: 21985 loss: 2.7592 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.80 mfu: 49.93% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 09:50:51,599 - root - INFO - lr: 9.7294e-06 gnorm: 0.34 [1 day, 16:15:25<1 day, 8:59:14] +[titan] 2025-09-09 09:51:23,425 - root - INFO - step: 21990 loss: 2.7158 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.7660 global_avg_top_loss: 1.9497 +[titan] 2025-09-09 09:51:23,425 - root - INFO - lr: 9.7259e-06 gnorm: 0.34 [1 day, 16:15:56<1 day, 8:58:40] +[titan] 2025-09-09 09:51:55,246 - root - INFO - step: 21995 loss: 3.1904 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 1.0343 global_avg_top_loss: 2.1560 +[titan] 2025-09-09 09:51:55,246 - root - INFO - lr: 9.7224e-06 gnorm: 0.40 [1 day, 16:16:28<1 day, 8:58:07] +[titan] 2025-09-09 09:52:20,946 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:52:27,529 - root - INFO - step: 22000 loss: 2.7668 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.76 mfu: 48.91% global_avg_ntp_loss: 0.7902 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 09:52:27,529 - root - INFO - lr: 9.7188e-06 gnorm: 0.35 [1 day, 16:17:00<1 day, 8:57:33] +[titan] 2025-09-09 09:52:59,567 - root - INFO - step: 22005 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 09:52:59,568 - root - INFO - lr: 9.7153e-06 gnorm: 0.33 [1 day, 16:17:32<1 day, 8:56:59] +[titan] 2025-09-09 09:53:31,448 - root - INFO - step: 22010 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 09:53:31,448 - root - INFO - lr: 9.7118e-06 gnorm: 0.35 [1 day, 16:18:04<1 day, 8:56:25] +[titan] 2025-09-09 09:54:03,731 - root - INFO - step: 22015 loss: 2.7549 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.76 mfu: 48.91% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 09:54:03,731 - root - INFO - lr: 9.7082e-06 gnorm: 0.33 [1 day, 16:18:37<1 day, 8:55:52] +[titan] 2025-09-09 09:54:10,377 - root - INFO - Dumping profiler traces at step 22016 +[titan] 2025-09-09 09:54:10,442 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 09:54:35,687 - root - INFO - step: 22020 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9605 +[titan] 2025-09-09 09:54:35,687 - root - INFO - lr: 9.7047e-06 gnorm: 0.35 [1 day, 16:19:09<1 day, 8:55:18] +[titan] 2025-09-09 09:55:07,657 - root - INFO - step: 22025 loss: 3.2441 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 1.0641 global_avg_top_loss: 2.1800 +[titan] 2025-09-09 09:55:07,657 - root - INFO - lr: 9.7012e-06 gnorm: 0.34 [1 day, 16:19:41<1 day, 
8:54:44] +[titan] 2025-09-09 09:55:39,860 - root - INFO - step: 22030 loss: 2.7467 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.96 mfu: 49.03% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9603 +[titan] 2025-09-09 09:55:39,860 - root - INFO - lr: 9.6976e-06 gnorm: 0.33 [1 day, 16:20:13<1 day, 8:54:11] +[titan] 2025-09-09 09:56:11,600 - root - INFO - step: 22035 loss: 2.8318 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.03 mfu: 49.75% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 2.0142 +[titan] 2025-09-09 09:56:11,601 - root - INFO - lr: 9.6941e-06 gnorm: 0.33 [1 day, 16:20:45<1 day, 8:53:37] +[titan] 2025-09-09 09:56:43,555 - root - INFO - step: 22040 loss: 2.7866 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7991 global_avg_top_loss: 1.9874 +[titan] 2025-09-09 09:56:43,555 - root - INFO - lr: 9.6906e-06 gnorm: 0.34 [1 day, 16:21:16<1 day, 8:53:03] +[titan] 2025-09-09 09:57:15,359 - root - INFO - step: 22045 loss: 2.7100 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.05 mfu: 49.65% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9426 +[titan] 2025-09-09 09:57:15,359 - root - INFO - lr: 9.6871e-06 gnorm: 0.34 [1 day, 16:21:48<1 day, 8:52:29] +[titan] 2025-09-09 09:57:40,940 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:57:47,274 - root - INFO - step: 22050 loss: 2.7349 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 09:57:47,275 - root - INFO - lr: 9.6835e-06 gnorm: 0.35 [1 day, 16:22:20<1 day, 8:51:55] +[titan] 2025-09-09 09:58:19,127 - root - INFO - step: 22055 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.57% global_avg_ntp_loss: 0.7710 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 09:58:19,127 - root - INFO - lr: 9.6800e-06 gnorm: 0.37 [1 day, 16:22:52<1 day, 8:51:22] +[titan] 2025-09-09 09:58:50,997 - root - INFO - step: 22060 loss: 2.7477 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.02 mfu: 49.55% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9634 +[titan] 2025-09-09 09:58:50,997 - root - INFO - lr: 9.6765e-06 gnorm: 0.35 [1 day, 16:23:24<1 day, 8:50:48] +[titan] 2025-09-09 09:59:22,837 - root - INFO - step: 22065 loss: 2.7699 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9789 +[titan] 2025-09-09 09:59:22,838 - root - INFO - lr: 9.6729e-06 gnorm: 0.34 [1 day, 16:23:56<1 day, 8:50:14] +[titan] 2025-09-09 09:59:54,826 - root - INFO - step: 22070 loss: 2.7495 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9624 +[titan] 2025-09-09 09:59:54,827 - root - INFO - lr: 9.6694e-06 gnorm: 0.38 [1 day, 16:24:28<1 day, 8:49:40] +[titan] 2025-09-09 10:00:26,934 - root - INFO - step: 22075 loss: 3.2809 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 1.0810 global_avg_top_loss: 2.1999 +[titan] 2025-09-09 10:00:26,935 - root - INFO - lr: 9.6659e-06 gnorm: 0.33 [1 day, 16:25:00<1 day, 8:49:06] +[titan] 2025-09-09 10:00:58,595 - root - INFO - step: 22080 loss: 2.8153 memory: 122.03GiB(87.57%) tps: 10,350 tflops: 493.26 mfu: 49.87% global_avg_ntp_loss: 0.8120 
global_avg_top_loss: 2.0034 +[titan] 2025-09-09 10:00:58,596 - root - INFO - lr: 9.6623e-06 gnorm: 0.35 [1 day, 16:25:32<1 day, 8:48:32] +[titan] 2025-09-09 10:01:30,589 - root - INFO - step: 22085 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9649 +[titan] 2025-09-09 10:01:30,590 - root - INFO - lr: 9.6588e-06 gnorm: 0.33 [1 day, 16:26:04<1 day, 8:47:59] +[titan] 2025-09-09 10:02:02,361 - root - INFO - step: 22090 loss: 2.7389 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.54 mfu: 49.70% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9635 +[titan] 2025-09-09 10:02:02,361 - root - INFO - lr: 9.6553e-06 gnorm: 0.34 [1 day, 16:26:35<1 day, 8:47:25] +[titan] 2025-09-09 10:02:34,290 - root - INFO - step: 22095 loss: 2.7602 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9711 +[titan] 2025-09-09 10:02:34,290 - root - INFO - lr: 9.6517e-06 gnorm: 0.34 [1 day, 16:27:07<1 day, 8:46:51] +[titan] 2025-09-09 10:02:59,648 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:03:06,074 - root - INFO - step: 22100 loss: 2.7562 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9690 +[titan] 2025-09-09 10:03:06,074 - root - INFO - lr: 9.6482e-06 gnorm: 0.36 [1 day, 16:27:39<1 day, 8:46:17] +[titan] 2025-09-09 10:03:37,842 - root - INFO - step: 22105 loss: 3.1691 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.61 mfu: 49.71% global_avg_ntp_loss: 1.0253 global_avg_top_loss: 2.1439 +[titan] 2025-09-09 10:03:37,842 - root - INFO - lr: 9.6447e-06 gnorm: 0.37 [1 day, 16:28:11<1 day, 8:45:43] +[titan] 2025-09-09 10:04:09,641 - root - INFO - step: 22110 loss: 2.8558 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.12 mfu: 49.66% global_avg_ntp_loss: 0.8291 global_avg_top_loss: 2.0267 +[titan] 2025-09-09 10:04:09,641 - root - INFO - lr: 9.6412e-06 gnorm: 0.33 [1 day, 16:28:43<1 day, 8:45:09] +[titan] 2025-09-09 10:04:41,365 - root - INFO - step: 22115 loss: 2.6868 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.28 mfu: 49.78% global_avg_ntp_loss: 0.7539 global_avg_top_loss: 1.9328 +[titan] 2025-09-09 10:04:41,365 - root - INFO - lr: 9.6376e-06 gnorm: 0.37 [1 day, 16:29:14<1 day, 8:44:35] +[titan] 2025-09-09 10:05:13,221 - root - INFO - step: 22120 loss: 3.3268 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.24 mfu: 49.57% global_avg_ntp_loss: 1.1003 global_avg_top_loss: 2.2265 +[titan] 2025-09-09 10:05:13,221 - root - INFO - lr: 9.6341e-06 gnorm: 0.64 [1 day, 16:29:46<1 day, 8:44:01] +[titan] 2025-09-09 10:05:45,319 - root - INFO - step: 22125 loss: 2.7736 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7928 global_avg_top_loss: 1.9808 +[titan] 2025-09-09 10:05:45,320 - root - INFO - lr: 9.6306e-06 gnorm: 0.33 [1 day, 16:30:18<1 day, 8:43:28] +[titan] 2025-09-09 10:06:17,221 - root - INFO - step: 22130 loss: 2.8568 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.8254 
global_avg_top_loss: 2.0314 +[titan] 2025-09-09 10:06:17,222 - root - INFO - lr: 9.6270e-06 gnorm: 0.43 [1 day, 16:30:50<1 day, 8:42:54] +[titan] 2025-09-09 10:06:48,942 - root - INFO - step: 22135 loss: 2.7616 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.34 mfu: 49.78% global_avg_ntp_loss: 0.7892 global_avg_top_loss: 1.9724 +[titan] 2025-09-09 10:06:48,942 - root - INFO - lr: 9.6235e-06 gnorm: 0.35 [1 day, 16:31:22<1 day, 8:42:20] +[titan] 2025-09-09 10:07:20,630 - root - INFO - step: 22140 loss: 2.7316 memory: 122.03GiB(87.57%) tps: 10,341 tflops: 492.84 mfu: 49.83% global_avg_ntp_loss: 0.7714 global_avg_top_loss: 1.9602 +[titan] 2025-09-09 10:07:20,630 - root - INFO - lr: 9.6200e-06 gnorm: 0.34 [1 day, 16:31:54<1 day, 8:41:46] +[titan] 2025-09-09 10:07:52,503 - root - INFO - step: 22145 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.8221 global_avg_top_loss: 2.0190 +[titan] 2025-09-09 10:07:52,503 - root - INFO - lr: 9.6165e-06 gnorm: 0.33 [1 day, 16:32:25<1 day, 8:41:12] +[titan] 2025-09-09 10:08:18,032 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:08:24,541 - root - INFO - step: 22150 loss: 2.6662 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.7439 global_avg_top_loss: 1.9223 +[titan] 2025-09-09 10:08:24,541 - root - INFO - lr: 9.6129e-06 gnorm: 0.33 [1 day, 16:32:57<1 day, 8:40:39] +[titan] 2025-09-09 10:08:56,313 - root - INFO - step: 22155 loss: 2.7910 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.54 mfu: 49.70% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 10:08:56,313 - root - INFO - lr: 9.6094e-06 gnorm: 0.34 [1 day, 16:33:29<1 day, 8:40:05] +[titan] 2025-09-09 10:09:28,146 - root - INFO - step: 22160 loss: 2.7642 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 10:09:28,146 - root - INFO - lr: 9.6059e-06 gnorm: 0.34 [1 day, 16:34:01<1 day, 8:39:31] +[titan] 2025-09-09 10:10:00,153 - root - INFO - step: 22165 loss: 2.7905 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.8073 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 10:10:00,153 - root - INFO - lr: 9.6024e-06 gnorm: 0.33 [1 day, 16:34:33<1 day, 8:38:57] +[titan] 2025-09-09 10:10:31,835 - root - INFO - step: 22170 loss: 2.7554 memory: 122.03GiB(87.57%) tps: 10,343 tflops: 492.94 mfu: 49.84% global_avg_ntp_loss: 0.7862 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 10:10:31,835 - root - INFO - lr: 9.5988e-06 gnorm: 0.32 [1 day, 16:35:05<1 day, 8:38:23] +[titan] 2025-09-09 10:11:03,637 - root - INFO - step: 22175 loss: 2.7585 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.07 mfu: 49.65% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 10:11:03,637 - root - INFO - lr: 9.5953e-06 gnorm: 0.34 [1 day, 16:35:37<1 day, 8:37:49] +[titan] 2025-09-09 10:11:35,565 - root - INFO - step: 22180 loss: 2.8919 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.8649 
global_avg_top_loss: 2.0270 +[titan] 2025-09-09 10:11:35,566 - root - INFO - lr: 9.5918e-06 gnorm: 0.34 [1 day, 16:36:08<1 day, 8:37:16] +[titan] 2025-09-09 10:12:07,434 - root - INFO - step: 22185 loss: 2.6708 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.7526 global_avg_top_loss: 1.9182 +[titan] 2025-09-09 10:12:07,435 - root - INFO - lr: 9.5882e-06 gnorm: 0.32 [1 day, 16:36:40<1 day, 8:36:42] +[titan] 2025-09-09 10:12:39,290 - root - INFO - step: 22190 loss: 2.7907 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.8036 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 10:12:39,290 - root - INFO - lr: 9.5847e-06 gnorm: 0.36 [1 day, 16:37:12<1 day, 8:36:08] +[titan] 2025-09-09 10:13:11,255 - root - INFO - step: 22195 loss: 2.8944 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.8586 global_avg_top_loss: 2.0359 +[titan] 2025-09-09 10:13:11,256 - root - INFO - lr: 9.5812e-06 gnorm: 0.34 [1 day, 16:37:44<1 day, 8:35:34] +[titan] 2025-09-09 10:13:36,671 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:13:43,039 - root - INFO - step: 22200 loss: 2.7918 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.8020 global_avg_top_loss: 1.9898 +[titan] 2025-09-09 10:13:43,040 - root - INFO - lr: 9.5777e-06 gnorm: 0.36 [1 day, 16:38:16<1 day, 8:35:00] +[titan] 2025-09-09 10:14:14,772 - root - INFO - step: 22205 loss: 2.7796 memory: 122.03GiB(87.57%) tps: 10,326 tflops: 492.15 mfu: 49.76% global_avg_ntp_loss: 0.7992 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 10:14:14,772 - root - INFO - lr: 9.5741e-06 gnorm: 0.39 [1 day, 16:38:48<1 day, 8:34:26] +[titan] 2025-09-09 10:14:46,616 - root - INFO - step: 22210 loss: 2.7982 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.8021 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 10:14:46,616 - root - INFO - lr: 9.5706e-06 gnorm: 0.34 [1 day, 16:39:20<1 day, 8:33:52] +[titan] 2025-09-09 10:15:18,345 - root - INFO - step: 22215 loss: 2.6928 memory: 122.03GiB(87.57%) tps: 10,328 tflops: 492.21 mfu: 49.77% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9379 +[titan] 2025-09-09 10:15:18,345 - root - INFO - lr: 9.5671e-06 gnorm: 0.33 [1 day, 16:39:51<1 day, 8:33:19] +[titan] 2025-09-09 10:15:50,298 - root - INFO - step: 22220 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 1.9861 +[titan] 2025-09-09 10:15:50,299 - root - INFO - lr: 9.5636e-06 gnorm: 0.34 [1 day, 16:40:23<1 day, 8:32:45] +[titan] 2025-09-09 10:16:22,115 - root - INFO - step: 22225 loss: 2.8799 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.86 mfu: 49.63% global_avg_ntp_loss: 0.8419 global_avg_top_loss: 2.0380 +[titan] 2025-09-09 10:16:22,115 - root - INFO - lr: 9.5600e-06 gnorm: 0.86 [1 day, 16:40:55<1 day, 8:32:11] +[titan] 2025-09-09 10:16:53,931 - root - INFO - step: 22230 loss: 2.7991 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.8039 
global_avg_top_loss: 1.9952 +[titan] 2025-09-09 10:16:53,932 - root - INFO - lr: 9.5565e-06 gnorm: 0.34 [1 day, 16:41:27<1 day, 8:31:37] +[titan] 2025-09-09 10:17:25,892 - root - INFO - step: 22235 loss: 3.1268 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.9946 global_avg_top_loss: 2.1322 +[titan] 2025-09-09 10:17:25,893 - root - INFO - lr: 9.5530e-06 gnorm: 0.40 [1 day, 16:41:59<1 day, 8:31:03] +[titan] 2025-09-09 10:17:57,779 - root - INFO - step: 22240 loss: 2.7001 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 10:17:57,780 - root - INFO - lr: 9.5495e-06 gnorm: 0.40 [1 day, 16:42:31<1 day, 8:30:30] +[titan] 2025-09-09 10:18:29,766 - root - INFO - step: 22245 loss: 2.8551 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.8313 global_avg_top_loss: 2.0238 +[titan] 2025-09-09 10:18:29,766 - root - INFO - lr: 9.5460e-06 gnorm: 0.36 [1 day, 16:43:03<1 day, 8:29:56] +[titan] 2025-09-09 10:18:55,316 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:19:01,712 - root - INFO - step: 22250 loss: 2.8191 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.8132 global_avg_top_loss: 2.0059 +[titan] 2025-09-09 10:19:01,713 - root - INFO - lr: 9.5424e-06 gnorm: 0.36 [1 day, 16:43:35<1 day, 8:29:22] +[titan] 2025-09-09 10:19:33,612 - root - INFO - step: 22255 loss: 2.7885 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.8026 global_avg_top_loss: 1.9859 +[titan] 2025-09-09 10:19:33,612 - root - INFO - lr: 9.5389e-06 gnorm: 0.33 [1 day, 16:44:07<1 day, 8:28:48] +[titan] 2025-09-09 10:20:05,773 - root - INFO - step: 22260 loss: 2.7400 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.59 mfu: 49.10% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 10:20:05,773 - root - INFO - lr: 9.5354e-06 gnorm: 0.34 [1 day, 16:44:39<1 day, 8:28:15] +[titan] 2025-09-09 10:20:37,760 - root - INFO - step: 22265 loss: 2.7462 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7826 global_avg_top_loss: 1.9636 +[titan] 2025-09-09 10:20:37,760 - root - INFO - lr: 9.5319e-06 gnorm: 0.34 [1 day, 16:45:11<1 day, 8:27:41] +[titan] 2025-09-09 10:21:09,496 - root - INFO - step: 22270 loss: 2.7766 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.10 mfu: 49.76% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9823 +[titan] 2025-09-09 10:21:09,496 - root - INFO - lr: 9.5283e-06 gnorm: 0.36 [1 day, 16:45:42<1 day, 8:27:07] +[titan] 2025-09-09 10:21:41,427 - root - INFO - step: 22275 loss: 2.8120 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.8139 global_avg_top_loss: 1.9981 +[titan] 2025-09-09 10:21:41,427 - root - INFO - lr: 9.5248e-06 gnorm: 0.33 [1 day, 16:46:14<1 day, 8:26:33] +[titan] 2025-09-09 10:22:13,141 - root - INFO - step: 22280 loss: 2.7896 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.44 mfu: 49.79% global_avg_ntp_loss: 0.8030 
global_avg_top_loss: 1.9865 +[titan] 2025-09-09 10:22:13,141 - root - INFO - lr: 9.5213e-06 gnorm: 0.34 [1 day, 16:46:46<1 day, 8:25:59] +[titan] 2025-09-09 10:22:44,944 - root - INFO - step: 22285 loss: 3.1378 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.9973 global_avg_top_loss: 2.1405 +[titan] 2025-09-09 10:22:44,944 - root - INFO - lr: 9.5178e-06 gnorm: 0.34 [1 day, 16:47:18<1 day, 8:25:26] +[titan] 2025-09-09 10:23:16,856 - root - INFO - step: 22290 loss: 2.7761 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9813 +[titan] 2025-09-09 10:23:16,856 - root - INFO - lr: 9.5143e-06 gnorm: 0.37 [1 day, 16:47:50<1 day, 8:24:52] +[titan] 2025-09-09 10:23:48,837 - root - INFO - step: 22295 loss: 3.2144 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 1.0470 global_avg_top_loss: 2.1675 +[titan] 2025-09-09 10:23:48,837 - root - INFO - lr: 9.5107e-06 gnorm: 0.35 [1 day, 16:48:22<1 day, 8:24:18] +[titan] 2025-09-09 10:24:14,316 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:24:20,731 - root - INFO - step: 22300 loss: 2.7452 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 10:24:20,731 - root - INFO - lr: 9.5072e-06 gnorm: 1.10 [1 day, 16:48:54<1 day, 8:23:44] +[titan] 2025-09-09 10:24:52,941 - root - INFO - step: 22305 loss: 2.8242 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.85 mfu: 49.02% global_avg_ntp_loss: 0.8184 global_avg_top_loss: 2.0058 +[titan] 2025-09-09 10:24:52,942 - root - INFO - lr: 9.5037e-06 gnorm: 0.39 [1 day, 16:49:26<1 day, 8:23:11] +[titan] 2025-09-09 10:25:24,735 - root - INFO - step: 22310 loss: 3.0296 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.21 mfu: 49.67% global_avg_ntp_loss: 0.9360 global_avg_top_loss: 2.0937 +[titan] 2025-09-09 10:25:24,735 - root - INFO - lr: 9.5002e-06 gnorm: 0.35 [1 day, 16:49:58<1 day, 8:22:37] +[titan] 2025-09-09 10:25:56,775 - root - INFO - step: 22315 loss: 2.7146 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.29% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9417 +[titan] 2025-09-09 10:25:56,775 - root - INFO - lr: 9.4967e-06 gnorm: 0.38 [1 day, 16:50:30<1 day, 8:22:03] +[titan] 2025-09-09 10:26:28,848 - root - INFO - step: 22320 loss: 2.7762 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 10:26:28,848 - root - INFO - lr: 9.4931e-06 gnorm: 0.35 [1 day, 16:51:02<1 day, 8:21:30] +[titan] 2025-09-09 10:27:00,585 - root - INFO - step: 22325 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.76% global_avg_ntp_loss: 0.7834 global_avg_top_loss: 1.9685 +[titan] 2025-09-09 10:27:00,585 - root - INFO - lr: 9.4896e-06 gnorm: 0.35 [1 day, 16:51:33<1 day, 8:20:56] +[titan] 2025-09-09 10:27:32,399 - root - INFO - step: 22330 loss: 2.7717 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 0.7958 
global_avg_top_loss: 1.9759 +[titan] 2025-09-09 10:27:32,400 - root - INFO - lr: 9.4861e-06 gnorm: 0.35 [1 day, 16:52:05<1 day, 8:20:22] +[titan] 2025-09-09 10:28:04,253 - root - INFO - step: 22335 loss: 2.6690 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.7426 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 10:28:04,253 - root - INFO - lr: 9.4826e-06 gnorm: 1.22 [1 day, 16:52:37<1 day, 8:19:48] +[titan] 2025-09-09 10:28:36,221 - root - INFO - step: 22340 loss: 2.7836 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.8001 global_avg_top_loss: 1.9835 +[titan] 2025-09-09 10:28:36,221 - root - INFO - lr: 9.4791e-06 gnorm: 0.33 [1 day, 16:53:09<1 day, 8:19:14] +[titan] 2025-09-09 10:29:08,159 - root - INFO - step: 22345 loss: 2.9081 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.8610 global_avg_top_loss: 2.0471 +[titan] 2025-09-09 10:29:08,159 - root - INFO - lr: 9.4755e-06 gnorm: 0.34 [1 day, 16:53:41<1 day, 8:18:41] +[titan] 2025-09-09 10:29:33,594 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:29:40,005 - root - INFO - step: 22350 loss: 2.8288 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.8207 global_avg_top_loss: 2.0080 +[titan] 2025-09-09 10:29:40,005 - root - INFO - lr: 9.4720e-06 gnorm: 0.34 [1 day, 16:54:13<1 day, 8:18:07] +[titan] 2025-09-09 10:30:11,980 - root - INFO - step: 22355 loss: 2.7864 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7995 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 10:30:11,980 - root - INFO - lr: 9.4685e-06 gnorm: 0.33 [1 day, 16:54:45<1 day, 8:17:33] +[titan] 2025-09-09 10:30:44,054 - root - INFO - step: 22360 loss: 2.8197 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.8168 global_avg_top_loss: 2.0029 +[titan] 2025-09-09 10:30:44,054 - root - INFO - lr: 9.4650e-06 gnorm: 0.34 [1 day, 16:55:17<1 day, 8:17:00] +[titan] 2025-09-09 10:31:15,806 - root - INFO - step: 22365 loss: 3.1971 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 1.0403 global_avg_top_loss: 2.1568 +[titan] 2025-09-09 10:31:15,806 - root - INFO - lr: 9.4615e-06 gnorm: 0.36 [1 day, 16:55:49<1 day, 8:16:26] +[titan] 2025-09-09 10:31:47,589 - root - INFO - step: 22370 loss: 2.7473 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 10:31:47,589 - root - INFO - lr: 9.4580e-06 gnorm: 0.39 [1 day, 16:56:20<1 day, 8:15:52] +[titan] 2025-09-09 10:32:19,470 - root - INFO - step: 22375 loss: 3.7100 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.86 mfu: 49.53% global_avg_ntp_loss: 1.3271 global_avg_top_loss: 2.3829 +[titan] 2025-09-09 10:32:19,470 - root - INFO - lr: 9.4544e-06 gnorm: 0.36 [1 day, 16:56:52<1 day, 8:15:18] +[titan] 2025-09-09 10:32:51,533 - root - INFO - step: 22380 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.7923 
global_avg_top_loss: 1.9740 +[titan] 2025-09-09 10:32:51,533 - root - INFO - lr: 9.4509e-06 gnorm: 0.35 [1 day, 16:57:24<1 day, 8:14:44] +[titan] 2025-09-09 10:33:23,490 - root - INFO - step: 22385 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7823 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 10:33:23,490 - root - INFO - lr: 9.4474e-06 gnorm: 0.33 [1 day, 16:57:56<1 day, 8:14:11] +[titan] 2025-09-09 10:33:55,253 - root - INFO - step: 22390 loss: 2.8091 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.68 mfu: 49.71% global_avg_ntp_loss: 0.8087 global_avg_top_loss: 2.0004 +[titan] 2025-09-09 10:33:55,253 - root - INFO - lr: 9.4439e-06 gnorm: 0.33 [1 day, 16:58:28<1 day, 8:13:37] +[titan] 2025-09-09 10:34:27,319 - root - INFO - step: 22395 loss: 2.7495 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 10:34:27,319 - root - INFO - lr: 9.4404e-06 gnorm: 0.34 [1 day, 16:59:00<1 day, 8:13:03] +[titan] 2025-09-09 10:34:52,743 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:34:59,120 - root - INFO - step: 22400 loss: 2.7576 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.09 mfu: 49.66% global_avg_ntp_loss: 0.7869 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 10:34:59,120 - root - INFO - lr: 9.4369e-06 gnorm: 0.38 [1 day, 16:59:32<1 day, 8:12:29] +[titan] 2025-09-09 10:35:31,054 - root - INFO - step: 22405 loss: 2.6767 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9222 +[titan] 2025-09-09 10:35:31,054 - root - INFO - lr: 9.4333e-06 gnorm: 0.35 [1 day, 17:00:04<1 day, 8:11:56] +[titan] 2025-09-09 10:36:02,956 - root - INFO - step: 22410 loss: 2.8015 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.8100 global_avg_top_loss: 1.9915 +[titan] 2025-09-09 10:36:02,956 - root - INFO - lr: 9.4298e-06 gnorm: 0.34 [1 day, 17:00:36<1 day, 8:11:22] +[titan] 2025-09-09 10:36:35,047 - root - INFO - step: 22415 loss: 2.6917 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9329 +[titan] 2025-09-09 10:36:35,047 - root - INFO - lr: 9.4263e-06 gnorm: 0.45 [1 day, 17:01:08<1 day, 8:10:48] +[titan] 2025-09-09 10:37:06,842 - root - INFO - step: 22420 loss: 2.6305 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.19 mfu: 49.66% global_avg_ntp_loss: 0.7283 global_avg_top_loss: 1.9022 +[titan] 2025-09-09 10:37:06,842 - root - INFO - lr: 9.4228e-06 gnorm: 0.38 [1 day, 17:01:40<1 day, 8:10:14] +[titan] 2025-09-09 10:37:38,715 - root - INFO - step: 22425 loss: 2.7768 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7934 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 10:37:38,716 - root - INFO - lr: 9.4193e-06 gnorm: 0.35 [1 day, 17:02:12<1 day, 8:09:41] +[titan] 2025-09-09 10:38:10,781 - root - INFO - step: 22430 loss: 2.7943 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.8103 
global_avg_top_loss: 1.9840 +[titan] 2025-09-09 10:38:10,781 - root - INFO - lr: 9.4158e-06 gnorm: 0.36 [1 day, 17:02:44<1 day, 8:09:07] +[titan] 2025-09-09 10:38:42,775 - root - INFO - step: 22435 loss: 2.7842 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.7941 global_avg_top_loss: 1.9901 +[titan] 2025-09-09 10:38:42,775 - root - INFO - lr: 9.4123e-06 gnorm: 0.41 [1 day, 17:03:16<1 day, 8:08:33] +[titan] 2025-09-09 10:39:14,483 - root - INFO - step: 22440 loss: 2.7109 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.53 mfu: 49.80% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9470 +[titan] 2025-09-09 10:39:14,483 - root - INFO - lr: 9.4087e-06 gnorm: 0.39 [1 day, 17:03:47<1 day, 8:07:59] +[titan] 2025-09-09 10:39:46,414 - root - INFO - step: 22445 loss: 3.2198 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 1.0513 global_avg_top_loss: 2.1685 +[titan] 2025-09-09 10:39:46,414 - root - INFO - lr: 9.4052e-06 gnorm: 0.35 [1 day, 17:04:19<1 day, 8:07:26] +[titan] 2025-09-09 10:40:11,864 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:40:18,264 - root - INFO - step: 22450 loss: 2.7975 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 10:40:18,265 - root - INFO - lr: 9.4017e-06 gnorm: 0.33 [1 day, 17:04:51<1 day, 8:06:52] +[titan] 2025-09-09 10:40:50,368 - root - INFO - step: 22455 loss: 3.1519 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 1.0199 global_avg_top_loss: 2.1320 +[titan] 2025-09-09 10:40:50,369 - root - INFO - lr: 9.3982e-06 gnorm: 0.39 [1 day, 17:05:23<1 day, 8:06:18] +[titan] 2025-09-09 10:41:22,175 - root - INFO - step: 22460 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 10:41:22,176 - root - INFO - lr: 9.3947e-06 gnorm: 0.34 [1 day, 17:05:55<1 day, 8:05:44] +[titan] 2025-09-09 10:41:54,085 - root - INFO - step: 22465 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9951 +[titan] 2025-09-09 10:41:54,086 - root - INFO - lr: 9.3912e-06 gnorm: 0.36 [1 day, 17:06:27<1 day, 8:05:11] +[titan] 2025-09-09 10:42:25,953 - root - INFO - step: 22470 loss: 3.2035 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 1.0400 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 10:42:25,953 - root - INFO - lr: 9.3877e-06 gnorm: 0.41 [1 day, 17:06:59<1 day, 8:04:37] +[titan] 2025-09-09 10:42:57,857 - root - INFO - step: 22475 loss: 2.7818 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 10:42:57,858 - root - INFO - lr: 9.3841e-06 gnorm: 0.32 [1 day, 17:07:31<1 day, 8:04:03] +[titan] 2025-09-09 10:43:29,723 - root - INFO - step: 22480 loss: 2.7182 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7633 
global_avg_top_loss: 1.9549 +[titan] 2025-09-09 10:43:29,723 - root - INFO - lr: 9.3806e-06 gnorm: 0.87 [1 day, 17:08:03<1 day, 8:03:29] +[titan] 2025-09-09 10:44:01,625 - root - INFO - step: 22485 loss: 2.6214 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7243 global_avg_top_loss: 1.8971 +[titan] 2025-09-09 10:44:01,625 - root - INFO - lr: 9.3771e-06 gnorm: 0.38 [1 day, 17:08:34<1 day, 8:02:56] +[titan] 2025-09-09 10:44:33,371 - root - INFO - step: 22490 loss: 2.7821 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.95 mfu: 49.74% global_avg_ntp_loss: 0.7987 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 10:44:33,371 - root - INFO - lr: 9.3736e-06 gnorm: 0.34 [1 day, 17:09:06<1 day, 8:02:22] +[titan] 2025-09-09 10:45:05,612 - root - INFO - step: 22495 loss: 2.7225 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.38 mfu: 48.98% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9540 +[titan] 2025-09-09 10:45:05,612 - root - INFO - lr: 9.3701e-06 gnorm: 0.33 [1 day, 17:09:38<1 day, 8:01:48] +[titan] 2025-09-09 10:45:31,000 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 10:45:37,392 - root - INFO - step: 22500 loss: 2.7479 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7795 global_avg_top_loss: 1.9683 +[titan] 2025-09-09 10:45:37,392 - root - INFO - lr: 9.3666e-06 gnorm: 0.35 [1 day, 17:10:10<1 day, 8:01:15] +[titan] 2025-09-09 10:46:09,296 - root - INFO - step: 22505 loss: 2.8345 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.8235 global_avg_top_loss: 2.0110 +[titan] 2025-09-09 10:46:09,297 - root - INFO - lr: 9.3631e-06 gnorm: 0.34 [1 day, 17:10:42<1 day, 8:00:41] +[titan] 2025-09-09 10:46:41,250 - root - INFO - step: 22510 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 10:46:41,250 - root - INFO - lr: 9.3596e-06 gnorm: 0.45 [1 day, 17:11:14<1 day, 8:00:07] +[titan] 2025-09-09 10:47:13,222 - root - INFO - step: 22515 loss: 2.8173 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.8121 global_avg_top_loss: 2.0052 +[titan] 2025-09-09 10:47:13,222 - root - INFO - lr: 9.3561e-06 gnorm: 0.37 [1 day, 17:11:46<1 day, 7:59:33] +[titan] 2025-09-09 10:47:45,147 - root - INFO - step: 22520 loss: 2.6888 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.7589 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 10:47:45,148 - root - INFO - lr: 9.3526e-06 gnorm: 0.33 [1 day, 17:12:18<1 day, 7:59:00] +[titan] 2025-09-09 10:48:17,090 - root - INFO - step: 22525 loss: 3.2359 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 1.0577 global_avg_top_loss: 2.1781 +[titan] 2025-09-09 10:48:17,091 - root - INFO - lr: 9.3490e-06 gnorm: 0.33 [1 day, 17:12:50<1 day, 7:58:26] +[titan] 2025-09-09 10:48:36,508 - root - INFO - Dumping profiler traces at step 22528 +[titan] 2025-09-09 10:48:36,562 - root - INFO - Finished dumping profiler traces in 
0.05 seconds +[titan] 2025-09-09 10:48:49,411 - root - INFO - step: 22530 loss: 2.8530 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.20 mfu: 48.86% global_avg_ntp_loss: 0.8325 global_avg_top_loss: 2.0205 +[titan] 2025-09-09 10:48:49,411 - root - INFO - lr: 9.3455e-06 gnorm: 0.33 [1 day, 17:13:22<1 day, 7:57:53] +[titan] 2025-09-09 10:49:21,402 - root - INFO - step: 22535 loss: 3.2439 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 1.0600 global_avg_top_loss: 2.1838 +[titan] 2025-09-09 10:49:21,402 - root - INFO - lr: 9.3420e-06 gnorm: 0.33 [1 day, 17:13:54<1 day, 7:57:19] +[titan] 2025-09-09 10:49:53,343 - root - INFO - step: 22540 loss: 2.6626 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.7424 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 10:49:53,343 - root - INFO - lr: 9.3385e-06 gnorm: 0.49 [1 day, 17:14:26<1 day, 7:56:45] +[titan] 2025-09-09 10:50:25,320 - root - INFO - step: 22545 loss: 2.7366 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 10:50:25,320 - root - INFO - lr: 9.3350e-06 gnorm: 0.38 [1 day, 17:14:58<1 day, 7:56:12] +[titan] 2025-09-09 10:50:50,666 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 10:50:57,066 - root - INFO - step: 22550 loss: 2.7777 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.95 mfu: 49.74% global_avg_ntp_loss: 0.7964 global_avg_top_loss: 1.9813 +[titan] 2025-09-09 10:50:57,066 - root - INFO - lr: 9.3315e-06 gnorm: 0.34 [1 day, 17:15:30<1 day, 7:55:38] +[titan] 2025-09-09 10:51:28,927 - root - INFO - step: 22555 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.8085 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 10:51:28,927 - root - INFO - lr: 9.3280e-06 gnorm: 0.34 [1 day, 17:16:02<1 day, 7:55:04] +[titan] 2025-09-09 10:52:00,705 - root - INFO - step: 22560 loss: 2.8238 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.44 mfu: 49.69% global_avg_ntp_loss: 0.8159 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 10:52:00,706 - root - INFO - lr: 9.3245e-06 gnorm: 0.35 [1 day, 17:16:34<1 day, 7:54:30] +[titan] 2025-09-09 10:52:32,649 - root - INFO - step: 22565 loss: 2.7995 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.8084 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 10:52:32,649 - root - INFO - lr: 9.3210e-06 gnorm: 0.33 [1 day, 17:17:06<1 day, 7:53:56] +[titan] 2025-09-09 10:53:04,350 - root - INFO - step: 22570 loss: 2.7344 memory: 122.03GiB(87.57%) tps: 10,337 tflops: 492.63 mfu: 49.81% global_avg_ntp_loss: 0.7736 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 10:53:04,350 - root - INFO - lr: 9.3175e-06 gnorm: 0.33 [1 day, 17:17:37<1 day, 7:53:23] +[titan] 2025-09-09 10:53:36,468 - root - INFO - step: 22575 loss: 2.7945 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.8036 global_avg_top_loss: 1.9910 +[titan] 2025-09-09 10:53:36,468 - root - INFO - lr: 9.3140e-06 gnorm: 0.34 [1 day, 17:18:09<1 day, 7:52:49] +[titan] 2025-09-09 10:54:08,276 - root - INFO - step: 22580 loss: 2.8650 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 0.8340 
global_avg_top_loss: 2.0310 +[titan] 2025-09-09 10:54:08,276 - root - INFO - lr: 9.3105e-06 gnorm: 0.37 [1 day, 17:18:41<1 day, 7:52:15] +[titan] 2025-09-09 10:54:40,256 - root - INFO - step: 22585 loss: 2.8324 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.8186 global_avg_top_loss: 2.0138 +[titan] 2025-09-09 10:54:40,256 - root - INFO - lr: 9.3069e-06 gnorm: 0.34 [1 day, 17:19:13<1 day, 7:51:41] +[titan] 2025-09-09 10:55:12,133 - root - INFO - step: 22590 loss: 2.8197 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.8118 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 10:55:12,134 - root - INFO - lr: 9.3034e-06 gnorm: 0.36 [1 day, 17:19:45<1 day, 7:51:08] +[titan] 2025-09-09 10:55:44,076 - root - INFO - step: 22595 loss: 2.8490 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.8302 global_avg_top_loss: 2.0188 +[titan] 2025-09-09 10:55:44,076 - root - INFO - lr: 9.2999e-06 gnorm: 0.33 [1 day, 17:20:17<1 day, 7:50:34] +[titan] 2025-09-09 10:56:09,584 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:56:15,911 - root - INFO - step: 22600 loss: 2.7900 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.57 mfu: 49.60% global_avg_ntp_loss: 0.7988 global_avg_top_loss: 1.9912 +[titan] 2025-09-09 10:56:15,911 - root - INFO - lr: 9.2964e-06 gnorm: 0.33 [1 day, 17:20:49<1 day, 7:50:00] +[titan] 2025-09-09 10:56:47,763 - root - INFO - step: 22605 loss: 3.3223 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 1.1002 global_avg_top_loss: 2.2221 +[titan] 2025-09-09 10:56:47,763 - root - INFO - lr: 9.2929e-06 gnorm: 0.32 [1 day, 17:21:21<1 day, 7:49:27] +[titan] 2025-09-09 10:57:19,784 - root - INFO - step: 22610 loss: 2.6639 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7431 global_avg_top_loss: 1.9208 +[titan] 2025-09-09 10:57:19,784 - root - INFO - lr: 9.2894e-06 gnorm: 0.33 [1 day, 17:21:53<1 day, 7:48:53] +[titan] 2025-09-09 10:57:51,810 - root - INFO - step: 22615 loss: 3.2190 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 1.0484 global_avg_top_loss: 2.1705 +[titan] 2025-09-09 10:57:51,810 - root - INFO - lr: 9.2859e-06 gnorm: 0.34 [1 day, 17:22:25<1 day, 7:48:19] +[titan] 2025-09-09 10:58:23,852 - root - INFO - step: 22620 loss: 2.6631 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7442 global_avg_top_loss: 1.9189 +[titan] 2025-09-09 10:58:23,853 - root - INFO - lr: 9.2824e-06 gnorm: 0.48 [1 day, 17:22:57<1 day, 7:47:46] +[titan] 2025-09-09 10:58:55,700 - root - INFO - step: 22625 loss: 2.6856 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9318 +[titan] 2025-09-09 10:58:55,700 - root - INFO - lr: 9.2789e-06 gnorm: 0.36 [1 day, 17:23:29<1 day, 7:47:12] +[titan] 2025-09-09 10:59:27,447 - root - INFO - step: 22630 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.93 mfu: 49.74% global_avg_ntp_loss: 0.7984 
global_avg_top_loss: 1.9769 +[titan] 2025-09-09 10:59:27,447 - root - INFO - lr: 9.2754e-06 gnorm: 0.34 [1 day, 17:24:00<1 day, 7:46:38] +[titan] 2025-09-09 10:59:59,425 - root - INFO - step: 22635 loss: 2.8106 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.8130 global_avg_top_loss: 1.9975 +[titan] 2025-09-09 10:59:59,425 - root - INFO - lr: 9.2719e-06 gnorm: 0.34 [1 day, 17:24:32<1 day, 7:46:04] +[titan] 2025-09-09 11:00:31,576 - root - INFO - step: 22640 loss: 2.7696 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7899 global_avg_top_loss: 1.9797 +[titan] 2025-09-09 11:00:31,576 - root - INFO - lr: 9.2684e-06 gnorm: 0.33 [1 day, 17:25:04<1 day, 7:45:31] +[titan] 2025-09-09 11:01:03,377 - root - INFO - step: 22645 loss: 2.8397 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.8273 global_avg_top_loss: 2.0124 +[titan] 2025-09-09 11:01:03,377 - root - INFO - lr: 9.2649e-06 gnorm: 0.35 [1 day, 17:25:36<1 day, 7:44:57] +[titan] 2025-09-09 11:01:28,758 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:01:35,230 - root - INFO - step: 22650 loss: 2.8608 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.8327 global_avg_top_loss: 2.0280 +[titan] 2025-09-09 11:01:35,230 - root - INFO - lr: 9.2614e-06 gnorm: 0.35 [1 day, 17:26:08<1 day, 7:44:23] +[titan] 2025-09-09 11:02:07,069 - root - INFO - step: 22655 loss: 2.7537 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9696 +[titan] 2025-09-09 11:02:07,070 - root - INFO - lr: 9.2579e-06 gnorm: 0.35 [1 day, 17:26:40<1 day, 7:43:50] +[titan] 2025-09-09 11:02:38,890 - root - INFO - step: 22660 loss: 2.7692 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.79 mfu: 49.63% global_avg_ntp_loss: 0.7933 global_avg_top_loss: 1.9759 +[titan] 2025-09-09 11:02:38,890 - root - INFO - lr: 9.2544e-06 gnorm: 0.33 [1 day, 17:27:12<1 day, 7:43:16] +[titan] 2025-09-09 11:03:10,840 - root - INFO - step: 22665 loss: 2.8392 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.8259 global_avg_top_loss: 2.0133 +[titan] 2025-09-09 11:03:10,840 - root - INFO - lr: 9.2509e-06 gnorm: 0.36 [1 day, 17:27:44<1 day, 7:42:42] +[titan] 2025-09-09 11:03:42,657 - root - INFO - step: 22670 loss: 2.7571 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9686 +[titan] 2025-09-09 11:03:42,657 - root - INFO - lr: 9.2474e-06 gnorm: 0.35 [1 day, 17:28:16<1 day, 7:42:08] +[titan] 2025-09-09 11:04:14,527 - root - INFO - step: 22675 loss: 2.7855 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.02 mfu: 49.55% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 11:04:14,527 - root - INFO - lr: 9.2439e-06 gnorm: 0.34 [1 day, 17:28:47<1 day, 7:41:35] +[titan] 2025-09-09 11:04:46,139 - root - INFO - step: 22680 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,366 tflops: 494.03 mfu: 49.95% global_avg_ntp_loss: 0.7822 
global_avg_top_loss: 1.9685 +[titan] 2025-09-09 11:04:46,139 - root - INFO - lr: 9.2404e-06 gnorm: 0.34 [1 day, 17:29:19<1 day, 7:41:01] +[titan] 2025-09-09 11:05:18,103 - root - INFO - step: 22685 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 11:05:18,104 - root - INFO - lr: 9.2369e-06 gnorm: 0.34 [1 day, 17:29:51<1 day, 7:40:27] +[titan] 2025-09-09 11:05:49,930 - root - INFO - step: 22690 loss: 2.7801 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.69 mfu: 49.62% global_avg_ntp_loss: 0.8051 global_avg_top_loss: 1.9750 +[titan] 2025-09-09 11:05:49,930 - root - INFO - lr: 9.2334e-06 gnorm: 0.33 [1 day, 17:30:23<1 day, 7:39:53] +[titan] 2025-09-09 11:06:21,894 - root - INFO - step: 22695 loss: 2.7162 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9493 +[titan] 2025-09-09 11:06:21,895 - root - INFO - lr: 9.2299e-06 gnorm: 0.33 [1 day, 17:30:55<1 day, 7:39:20] +[titan] 2025-09-09 11:06:47,430 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:06:53,760 - root - INFO - step: 22700 loss: 2.8097 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.56% global_avg_ntp_loss: 0.8060 global_avg_top_loss: 2.0037 +[titan] 2025-09-09 11:06:53,760 - root - INFO - lr: 9.2264e-06 gnorm: 0.34 [1 day, 17:31:27<1 day, 7:38:46] +[titan] 2025-09-09 11:07:25,749 - root - INFO - step: 22705 loss: 2.7664 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 11:07:25,749 - root - INFO - lr: 9.2229e-06 gnorm: 0.33 [1 day, 17:31:59<1 day, 7:38:12] +[titan] 2025-09-09 11:07:57,582 - root - INFO - step: 22710 loss: 2.7659 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 11:07:57,582 - root - INFO - lr: 9.2194e-06 gnorm: 0.34 [1 day, 17:32:30<1 day, 7:37:38] +[titan] 2025-09-09 11:08:29,517 - root - INFO - step: 22715 loss: 2.6822 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7539 global_avg_top_loss: 1.9284 +[titan] 2025-09-09 11:08:29,517 - root - INFO - lr: 9.2159e-06 gnorm: 0.38 [1 day, 17:33:02<1 day, 7:37:05] +[titan] 2025-09-09 11:09:01,395 - root - INFO - step: 22720 loss: 2.7197 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7646 global_avg_top_loss: 1.9552 +[titan] 2025-09-09 11:09:01,395 - root - INFO - lr: 9.2124e-06 gnorm: 0.37 [1 day, 17:33:34<1 day, 7:36:31] +[titan] 2025-09-09 11:09:33,487 - root - INFO - step: 22725 loss: 2.7337 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9597 +[titan] 2025-09-09 11:09:33,487 - root - INFO - lr: 9.2089e-06 gnorm: 0.37 [1 day, 17:34:06<1 day, 7:35:57] +[titan] 2025-09-09 11:10:05,420 - root - INFO - step: 22730 loss: 2.7944 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.8004 
global_avg_top_loss: 1.9940 +[titan] 2025-09-09 11:10:05,420 - root - INFO - lr: 9.2054e-06 gnorm: 0.34 [1 day, 17:34:38<1 day, 7:35:24] +[titan] 2025-09-09 11:10:37,326 - root - INFO - step: 22735 loss: 2.8260 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.8181 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 11:10:37,326 - root - INFO - lr: 9.2019e-06 gnorm: 0.35 [1 day, 17:35:10<1 day, 7:34:50] +[titan] 2025-09-09 11:11:09,287 - root - INFO - step: 22740 loss: 2.7214 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 11:11:09,288 - root - INFO - lr: 9.1984e-06 gnorm: 0.34 [1 day, 17:35:42<1 day, 7:34:16] +[titan] 2025-09-09 11:11:41,417 - root - INFO - step: 22745 loss: 2.7742 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7975 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 11:11:41,418 - root - INFO - lr: 9.1949e-06 gnorm: 0.34 [1 day, 17:36:14<1 day, 7:33:43] +[titan] 2025-09-09 11:12:06,924 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:12:13,283 - root - INFO - step: 22750 loss: 2.8156 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.8110 global_avg_top_loss: 2.0046 +[titan] 2025-09-09 11:12:13,283 - root - INFO - lr: 9.1914e-06 gnorm: 0.34 [1 day, 17:36:46<1 day, 7:33:09] +[titan] 2025-09-09 11:12:45,201 - root - INFO - step: 22755 loss: 2.6956 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7568 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 11:12:45,201 - root - INFO - lr: 9.1879e-06 gnorm: 0.38 [1 day, 17:37:18<1 day, 7:32:35] +[titan] 2025-09-09 11:13:16,846 - root - INFO - step: 22760 loss: 2.9061 memory: 122.03GiB(87.57%) tps: 10,355 tflops: 493.51 mfu: 49.90% global_avg_ntp_loss: 0.8713 global_avg_top_loss: 2.0348 +[titan] 2025-09-09 11:13:16,846 - root - INFO - lr: 9.1844e-06 gnorm: 0.33 [1 day, 17:37:50<1 day, 7:32:02] +[titan] 2025-09-09 11:13:48,720 - root - INFO - step: 22765 loss: 2.6836 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9277 +[titan] 2025-09-09 11:13:48,720 - root - INFO - lr: 9.1809e-06 gnorm: 0.34 [1 day, 17:38:22<1 day, 7:31:28] +[titan] 2025-09-09 11:14:20,695 - root - INFO - step: 22770 loss: 2.9051 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.38% global_avg_ntp_loss: 0.8531 global_avg_top_loss: 2.0520 +[titan] 2025-09-09 11:14:20,695 - root - INFO - lr: 9.1774e-06 gnorm: 0.38 [1 day, 17:38:54<1 day, 7:30:54] +[titan] 2025-09-09 11:14:52,570 - root - INFO - step: 22775 loss: 2.7745 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7923 global_avg_top_loss: 1.9822 +[titan] 2025-09-09 11:14:52,570 - root - INFO - lr: 9.1739e-06 gnorm: 0.35 [1 day, 17:39:25<1 day, 7:30:20] +[titan] 2025-09-09 11:15:24,495 - root - INFO - step: 22780 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7901 
global_avg_top_loss: 1.9619 +[titan] 2025-09-09 11:15:24,495 - root - INFO - lr: 9.1704e-06 gnorm: 0.33 [1 day, 17:39:57<1 day, 7:29:47] +[titan] 2025-09-09 11:15:56,458 - root - INFO - step: 22785 loss: 2.7848 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9870 +[titan] 2025-09-09 11:15:56,458 - root - INFO - lr: 9.1669e-06 gnorm: 0.35 [1 day, 17:40:29<1 day, 7:29:13] +[titan] 2025-09-09 11:16:28,304 - root - INFO - step: 22790 loss: 2.8074 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.8169 global_avg_top_loss: 1.9905 +[titan] 2025-09-09 11:16:28,304 - root - INFO - lr: 9.1634e-06 gnorm: 0.36 [1 day, 17:41:01<1 day, 7:28:39] +[titan] 2025-09-09 11:17:00,082 - root - INFO - step: 22795 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.45 mfu: 49.69% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 11:17:00,082 - root - INFO - lr: 9.1599e-06 gnorm: 0.36 [1 day, 17:41:33<1 day, 7:28:06] +[titan] 2025-09-09 11:17:25,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:17:31,876 - root - INFO - step: 22800 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.20 mfu: 49.67% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 11:17:31,876 - root - INFO - lr: 9.1564e-06 gnorm: 0.34 [1 day, 17:42:05<1 day, 7:27:32] +[titan] 2025-09-09 11:18:03,734 - root - INFO - step: 22805 loss: 2.7826 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.7975 global_avg_top_loss: 1.9850 +[titan] 2025-09-09 11:18:03,734 - root - INFO - lr: 9.1529e-06 gnorm: 0.34 [1 day, 17:42:37<1 day, 7:26:58] +[titan] 2025-09-09 11:18:35,893 - root - INFO - step: 22810 loss: 2.8022 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 11:18:35,894 - root - INFO - lr: 9.1494e-06 gnorm: 0.36 [1 day, 17:43:09<1 day, 7:26:25] +[titan] 2025-09-09 11:19:07,699 - root - INFO - step: 22815 loss: 2.7565 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 11:19:07,699 - root - INFO - lr: 9.1460e-06 gnorm: 0.34 [1 day, 17:43:41<1 day, 7:25:51] +[titan] 2025-09-09 11:19:39,496 - root - INFO - step: 22820 loss: 2.7649 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.14 mfu: 49.66% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9781 +[titan] 2025-09-09 11:19:39,497 - root - INFO - lr: 9.1425e-06 gnorm: 0.34 [1 day, 17:44:12<1 day, 7:25:17] +[titan] 2025-09-09 11:20:11,554 - root - INFO - step: 22825 loss: 2.8329 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.16 mfu: 49.26% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0104 +[titan] 2025-09-09 11:20:11,554 - root - INFO - lr: 9.1390e-06 gnorm: 0.33 [1 day, 17:44:44<1 day, 7:24:44] +[titan] 2025-09-09 11:20:43,313 - root - INFO - step: 22830 loss: 2.7845 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.74 mfu: 49.72% global_avg_ntp_loss: 0.7975 
global_avg_top_loss: 1.9870 +[titan] 2025-09-09 11:20:43,313 - root - INFO - lr: 9.1355e-06 gnorm: 0.34 [1 day, 17:45:16<1 day, 7:24:10] +[titan] 2025-09-09 11:21:15,287 - root - INFO - step: 22835 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7929 global_avg_top_loss: 1.9578 +[titan] 2025-09-09 11:21:15,287 - root - INFO - lr: 9.1320e-06 gnorm: 0.33 [1 day, 17:45:48<1 day, 7:23:36] +[titan] 2025-09-09 11:21:47,194 - root - INFO - step: 22840 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 11:21:47,194 - root - INFO - lr: 9.1285e-06 gnorm: 0.36 [1 day, 17:46:20<1 day, 7:23:02] +[titan] 2025-09-09 11:22:19,155 - root - INFO - step: 22845 loss: 2.7638 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7921 global_avg_top_loss: 1.9717 +[titan] 2025-09-09 11:22:19,155 - root - INFO - lr: 9.1250e-06 gnorm: 0.37 [1 day, 17:46:52<1 day, 7:22:29] +[titan] 2025-09-09 11:22:44,617 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:22:51,039 - root - INFO - step: 22850 loss: 2.8019 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 1.9954 +[titan] 2025-09-09 11:22:51,039 - root - INFO - lr: 9.1215e-06 gnorm: 0.35 [1 day, 17:47:24<1 day, 7:21:55] +[titan] 2025-09-09 11:23:22,972 - root - INFO - step: 22855 loss: 3.1268 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 1.0072 global_avg_top_loss: 2.1197 +[titan] 2025-09-09 11:23:22,972 - root - INFO - lr: 9.1180e-06 gnorm: 0.35 [1 day, 17:47:56<1 day, 7:21:21] +[titan] 2025-09-09 11:23:54,718 - root - INFO - step: 22860 loss: 2.7160 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.94 mfu: 49.74% global_avg_ntp_loss: 0.7664 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 11:23:54,718 - root - INFO - lr: 9.1145e-06 gnorm: 0.35 [1 day, 17:48:28<1 day, 7:20:48] +[titan] 2025-09-09 11:24:26,551 - root - INFO - step: 22865 loss: 2.7752 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9805 +[titan] 2025-09-09 11:24:26,551 - root - INFO - lr: 9.1110e-06 gnorm: 0.34 [1 day, 17:48:59<1 day, 7:20:14] +[titan] 2025-09-09 11:24:58,450 - root - INFO - step: 22870 loss: 2.7930 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.8056 global_avg_top_loss: 1.9874 +[titan] 2025-09-09 11:24:58,450 - root - INFO - lr: 9.1075e-06 gnorm: 0.35 [1 day, 17:49:31<1 day, 7:19:40] +[titan] 2025-09-09 11:25:30,303 - root - INFO - step: 22875 loss: 2.5558 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.6957 global_avg_top_loss: 1.8601 +[titan] 2025-09-09 11:25:30,304 - root - INFO - lr: 9.1041e-06 gnorm: 0.39 [1 day, 17:50:03<1 day, 7:19:06] +[titan] 2025-09-09 11:26:02,104 - root - INFO - step: 22880 loss: 2.6741 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7454 
global_avg_top_loss: 1.9287 +[titan] 2025-09-09 11:26:02,104 - root - INFO - lr: 9.1006e-06 gnorm: 0.51 [1 day, 17:50:35<1 day, 7:18:33] +[titan] 2025-09-09 11:26:33,809 - root - INFO - step: 22885 loss: 2.7962 memory: 122.03GiB(87.57%) tps: 10,336 tflops: 492.58 mfu: 49.81% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 1.9891 +[titan] 2025-09-09 11:26:33,809 - root - INFO - lr: 9.0971e-06 gnorm: 0.35 [1 day, 17:51:07<1 day, 7:17:59] +[titan] 2025-09-09 11:27:05,668 - root - INFO - step: 22890 loss: 2.8700 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.8373 global_avg_top_loss: 2.0327 +[titan] 2025-09-09 11:27:05,669 - root - INFO - lr: 9.0936e-06 gnorm: 0.37 [1 day, 17:51:38<1 day, 7:17:25] +[titan] 2025-09-09 11:27:37,617 - root - INFO - step: 22895 loss: 2.8032 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.8068 global_avg_top_loss: 1.9964 +[titan] 2025-09-09 11:27:37,617 - root - INFO - lr: 9.0901e-06 gnorm: 0.35 [1 day, 17:52:10<1 day, 7:16:52] +[titan] 2025-09-09 11:28:03,042 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:28:09,363 - root - INFO - step: 22900 loss: 2.8294 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.94 mfu: 49.74% global_avg_ntp_loss: 0.8210 global_avg_top_loss: 2.0084 +[titan] 2025-09-09 11:28:09,363 - root - INFO - lr: 9.0866e-06 gnorm: 0.35 [1 day, 17:52:42<1 day, 7:16:18] +[titan] 2025-09-09 11:28:41,201 - root - INFO - step: 22905 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.52 mfu: 49.60% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9921 +[titan] 2025-09-09 11:28:41,201 - root - INFO - lr: 9.0831e-06 gnorm: 0.33 [1 day, 17:53:14<1 day, 7:15:44] +[titan] 2025-09-09 11:29:12,984 - root - INFO - step: 22910 loss: 2.7151 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.37 mfu: 49.68% global_avg_ntp_loss: 0.7683 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 11:29:12,985 - root - INFO - lr: 9.0796e-06 gnorm: 0.35 [1 day, 17:53:46<1 day, 7:15:10] +[titan] 2025-09-09 11:29:44,870 - root - INFO - step: 22915 loss: 2.8219 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.8126 global_avg_top_loss: 2.0093 +[titan] 2025-09-09 11:29:44,871 - root - INFO - lr: 9.0761e-06 gnorm: 0.36 [1 day, 17:54:18<1 day, 7:14:37] +[titan] 2025-09-09 11:30:16,700 - root - INFO - step: 22920 loss: 2.7748 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9806 +[titan] 2025-09-09 11:30:16,700 - root - INFO - lr: 9.0727e-06 gnorm: 0.37 [1 day, 17:54:50<1 day, 7:14:03] +[titan] 2025-09-09 11:30:48,436 - root - INFO - step: 22925 loss: 2.7282 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.09 mfu: 49.76% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9562 +[titan] 2025-09-09 11:30:48,436 - root - INFO - lr: 9.0692e-06 gnorm: 0.35 [1 day, 17:55:21<1 day, 7:13:29] +[titan] 2025-09-09 11:31:20,305 - root - INFO - step: 22930 loss: 2.8364 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.8199 
global_avg_top_loss: 2.0165 +[titan] 2025-09-09 11:31:20,306 - root - INFO - lr: 9.0657e-06 gnorm: 0.39 [1 day, 17:55:53<1 day, 7:12:55] +[titan] 2025-09-09 11:31:52,015 - root - INFO - step: 22935 loss: 2.6347 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.51 mfu: 49.80% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9035 +[titan] 2025-09-09 11:31:52,015 - root - INFO - lr: 9.0622e-06 gnorm: 0.33 [1 day, 17:56:25<1 day, 7:12:22] +[titan] 2025-09-09 11:32:23,867 - root - INFO - step: 22940 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.7749 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 11:32:23,867 - root - INFO - lr: 9.0587e-06 gnorm: 0.34 [1 day, 17:56:57<1 day, 7:11:48] +[titan] 2025-09-09 11:32:55,767 - root - INFO - step: 22945 loss: 2.7867 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 11:32:55,767 - root - INFO - lr: 9.0552e-06 gnorm: 0.34 [1 day, 17:57:29<1 day, 7:11:14] +[titan] 2025-09-09 11:33:21,345 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:33:27,657 - root - INFO - step: 22950 loss: 2.8209 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.8159 global_avg_top_loss: 2.0050 +[titan] 2025-09-09 11:33:27,657 - root - INFO - lr: 9.0517e-06 gnorm: 0.34 [1 day, 17:58:00<1 day, 7:10:40] +[titan] 2025-09-09 11:33:59,558 - root - INFO - step: 22955 loss: 2.7284 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7689 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 11:33:59,558 - root - INFO - lr: 9.0483e-06 gnorm: 0.33 [1 day, 17:58:32<1 day, 7:10:07] +[titan] 2025-09-09 11:34:31,523 - root - INFO - step: 22960 loss: 2.6068 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.8901 +[titan] 2025-09-09 11:34:31,523 - root - INFO - lr: 9.0448e-06 gnorm: 0.41 [1 day, 17:59:04<1 day, 7:09:33] +[titan] 2025-09-09 11:35:03,263 - root - INFO - step: 22965 loss: 2.7925 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.03 mfu: 49.75% global_avg_ntp_loss: 0.8003 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 11:35:03,263 - root - INFO - lr: 9.0413e-06 gnorm: 0.34 [1 day, 17:59:36<1 day, 7:08:59] +[titan] 2025-09-09 11:35:35,175 - root - INFO - step: 22970 loss: 2.6798 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7490 global_avg_top_loss: 1.9308 +[titan] 2025-09-09 11:35:35,175 - root - INFO - lr: 9.0378e-06 gnorm: 0.34 [1 day, 18:00:08<1 day, 7:08:26] +[titan] 2025-09-09 11:36:07,033 - root - INFO - step: 22975 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9809 +[titan] 2025-09-09 11:36:07,033 - root - INFO - lr: 9.0343e-06 gnorm: 0.34 [1 day, 18:00:40<1 day, 7:07:52] +[titan] 2025-09-09 11:36:38,804 - root - INFO - step: 22980 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.56 mfu: 49.70% global_avg_ntp_loss: 0.7825 
global_avg_top_loss: 1.9700 +[titan] 2025-09-09 11:36:38,804 - root - INFO - lr: 9.0308e-06 gnorm: 0.34 [1 day, 18:01:12<1 day, 7:07:18] +[titan] 2025-09-09 11:37:10,727 - root - INFO - step: 22985 loss: 2.7750 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9808 +[titan] 2025-09-09 11:37:10,727 - root - INFO - lr: 9.0274e-06 gnorm: 0.33 [1 day, 18:01:44<1 day, 7:06:45] +[titan] 2025-09-09 11:37:42,724 - root - INFO - step: 22990 loss: 2.7206 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7689 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 11:37:42,724 - root - INFO - lr: 9.0239e-06 gnorm: 0.34 [1 day, 18:02:16<1 day, 7:06:11] +[titan] 2025-09-09 11:38:14,615 - root - INFO - step: 22995 loss: 2.7913 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8000 global_avg_top_loss: 1.9913 +[titan] 2025-09-09 11:38:14,615 - root - INFO - lr: 9.0204e-06 gnorm: 0.34 [1 day, 18:02:47<1 day, 7:05:37] +[titan] 2025-09-09 11:38:40,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:38:46,599 - root - INFO - step: 23000 loss: 2.7613 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 11:38:46,600 - root - INFO - lr: 9.0169e-06 gnorm: 0.34 [1 day, 18:03:19<1 day, 7:05:04] +[titan] 2025-09-09 11:39:18,599 - root - INFO - step: 23005 loss: 2.8584 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.8325 global_avg_top_loss: 2.0259 +[titan] 2025-09-09 11:39:18,599 - root - INFO - lr: 9.0134e-06 gnorm: 0.36 [1 day, 18:03:51<1 day, 7:04:30] +[titan] 2025-09-09 11:39:50,502 - root - INFO - step: 23010 loss: 3.0171 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.9313 global_avg_top_loss: 2.0859 +[titan] 2025-09-09 11:39:50,502 - root - INFO - lr: 9.0099e-06 gnorm: 0.35 [1 day, 18:04:23<1 day, 7:03:57] +[titan] 2025-09-09 11:40:22,575 - root - INFO - step: 23015 loss: 2.7704 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 11:40:22,576 - root - INFO - lr: 9.0065e-06 gnorm: 0.48 [1 day, 18:04:55<1 day, 7:03:23] +[titan] 2025-09-09 11:40:54,414 - root - INFO - step: 23020 loss: 2.8252 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.52 mfu: 49.60% global_avg_ntp_loss: 0.8169 global_avg_top_loss: 2.0083 +[titan] 2025-09-09 11:40:54,414 - root - INFO - lr: 9.0030e-06 gnorm: 0.40 [1 day, 18:05:27<1 day, 7:02:49] +[titan] 2025-09-09 11:41:26,341 - root - INFO - step: 23025 loss: 2.7760 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.7956 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 11:41:26,341 - root - INFO - lr: 8.9995e-06 gnorm: 0.38 [1 day, 18:05:59<1 day, 7:02:16] +[titan] 2025-09-09 11:41:58,349 - root - INFO - step: 23030 loss: 3.0877 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.9368 
global_avg_top_loss: 2.1509 +[titan] 2025-09-09 11:41:58,349 - root - INFO - lr: 8.9960e-06 gnorm: 0.45 [1 day, 18:06:31<1 day, 7:01:42] +[titan] 2025-09-09 11:42:30,274 - root - INFO - step: 23035 loss: 2.6858 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.7464 global_avg_top_loss: 1.9394 +[titan] 2025-09-09 11:42:30,275 - root - INFO - lr: 8.9925e-06 gnorm: 0.47 [1 day, 18:07:03<1 day, 7:01:08] +[titan] 2025-09-09 11:43:02,324 - root - INFO - step: 23040 loss: 2.7471 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 11:43:02,324 - root - INFO - lr: 8.9891e-06 gnorm: 0.34 [1 day, 18:07:35<1 day, 7:00:35] +[titan] 2025-09-09 11:43:02,608 - root - INFO - Dumping profiler traces at step 23040 +[titan] 2025-09-09 11:43:02,676 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 11:43:34,417 - root - INFO - step: 23045 loss: 2.7461 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7832 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 11:43:34,417 - root - INFO - lr: 8.9856e-06 gnorm: 0.37 [1 day, 18:08:07<1 day, 7:00:01] +[titan] 2025-09-09 11:43:59,967 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:44:06,339 - root - INFO - step: 23050 loss: 2.7869 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 11:44:06,339 - root - INFO - lr: 8.9821e-06 gnorm: 0.36 [1 day, 18:08:39<1 day, 6:59:28] +[titan] 2025-09-09 11:44:38,083 - root - INFO - step: 23055 loss: 2.6814 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.97 mfu: 49.74% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 11:44:38,083 - root - INFO - lr: 8.9786e-06 gnorm: 0.35 [1 day, 18:09:11<1 day, 6:58:54] +[titan] 2025-09-09 11:45:10,114 - root - INFO - step: 23060 loss: 2.8875 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.8427 global_avg_top_loss: 2.0448 +[titan] 2025-09-09 11:45:10,114 - root - INFO - lr: 8.9751e-06 gnorm: 0.35 [1 day, 18:09:43<1 day, 6:58:20] +[titan] 2025-09-09 11:45:41,907 - root - INFO - step: 23065 loss: 2.7740 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.20 mfu: 49.67% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 11:45:41,908 - root - INFO - lr: 8.9717e-06 gnorm: 0.38 [1 day, 18:10:15<1 day, 6:57:47] +[titan] 2025-09-09 11:46:13,768 - root - INFO - step: 23070 loss: 2.8134 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.8113 global_avg_top_loss: 2.0021 +[titan] 2025-09-09 11:46:13,769 - root - INFO - lr: 8.9682e-06 gnorm: 0.34 [1 day, 18:10:47<1 day, 6:57:13] +[titan] 2025-09-09 11:46:45,656 - root - INFO - step: 23075 loss: 2.8088 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.8091 global_avg_top_loss: 1.9996 +[titan] 2025-09-09 11:46:45,656 - root - INFO - lr: 8.9647e-06 gnorm: 0.37 [1 day, 18:11:18<1 day, 6:56:39] +[titan] 2025-09-09 11:47:17,643 - root - INFO - step: 23080 loss: 2.7354 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7726 
global_avg_top_loss: 1.9629 +[titan] 2025-09-09 11:47:17,644 - root - INFO - lr: 8.9612e-06 gnorm: 0.35 [1 day, 18:11:50<1 day, 6:56:06] +[titan] 2025-09-09 11:47:49,421 - root - INFO - step: 23085 loss: 2.8107 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.45 mfu: 49.69% global_avg_ntp_loss: 0.8102 global_avg_top_loss: 2.0005 +[titan] 2025-09-09 11:47:49,421 - root - INFO - lr: 8.9578e-06 gnorm: 0.35 [1 day, 18:12:22<1 day, 6:55:32] +[titan] 2025-09-09 11:48:21,076 - root - INFO - step: 23090 loss: 2.7969 memory: 122.03GiB(87.57%) tps: 10,352 tflops: 493.36 mfu: 49.88% global_avg_ntp_loss: 0.8008 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 11:48:21,076 - root - INFO - lr: 8.9543e-06 gnorm: 0.35 [1 day, 18:12:54<1 day, 6:54:58] +[titan] 2025-09-09 11:48:52,999 - root - INFO - step: 23095 loss: 2.7270 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 0.7717 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 11:48:53,000 - root - INFO - lr: 8.9508e-06 gnorm: 0.35 [1 day, 18:13:26<1 day, 6:54:25] +[titan] 2025-09-09 11:49:18,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:49:24,953 - root - INFO - step: 23100 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7865 global_avg_top_loss: 1.9731 +[titan] 2025-09-09 11:49:24,954 - root - INFO - lr: 8.9473e-06 gnorm: 0.36 [1 day, 18:13:58<1 day, 6:53:51] +[titan] 2025-09-09 11:49:56,770 - root - INFO - step: 23105 loss: 2.6881 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9278 +[titan] 2025-09-09 11:49:56,770 - root - INFO - lr: 8.9439e-06 gnorm: 0.43 [1 day, 18:14:30<1 day, 6:53:17] +[titan] 2025-09-09 11:50:28,604 - root - INFO - step: 23110 loss: 2.7222 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.7715 global_avg_top_loss: 1.9506 +[titan] 2025-09-09 11:50:28,605 - root - INFO - lr: 8.9404e-06 gnorm: 0.38 [1 day, 18:15:01<1 day, 6:52:44] +[titan] 2025-09-09 11:51:00,852 - root - INFO - step: 23115 loss: 3.1842 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.29 mfu: 48.97% global_avg_ntp_loss: 1.0304 global_avg_top_loss: 2.1538 +[titan] 2025-09-09 11:51:00,852 - root - INFO - lr: 8.9369e-06 gnorm: 0.34 [1 day, 18:15:34<1 day, 6:52:10] +[titan] 2025-09-09 11:51:32,818 - root - INFO - step: 23120 loss: 2.7098 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.7635 global_avg_top_loss: 1.9463 +[titan] 2025-09-09 11:51:32,818 - root - INFO - lr: 8.9334e-06 gnorm: 0.34 [1 day, 18:16:06<1 day, 6:51:37] +[titan] 2025-09-09 11:52:04,703 - root - INFO - step: 23125 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.7820 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 11:52:04,703 - root - INFO - lr: 8.9300e-06 gnorm: 0.39 [1 day, 18:16:38<1 day, 6:51:03] +[titan] 2025-09-09 11:52:36,806 - root - INFO - step: 23130 loss: 2.7554 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7868 
global_avg_top_loss: 1.9686 +[titan] 2025-09-09 11:52:36,806 - root - INFO - lr: 8.9265e-06 gnorm: 0.37 [1 day, 18:17:10<1 day, 6:50:29] +[titan] 2025-09-09 11:53:08,936 - root - INFO - step: 23135 loss: 2.8058 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.8041 global_avg_top_loss: 2.0017 +[titan] 2025-09-09 11:53:08,936 - root - INFO - lr: 8.9230e-06 gnorm: 0.35 [1 day, 18:17:42<1 day, 6:49:56] +[titan] 2025-09-09 11:53:40,833 - root - INFO - step: 23140 loss: 2.7882 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9897 +[titan] 2025-09-09 11:53:40,833 - root - INFO - lr: 8.9195e-06 gnorm: 0.38 [1 day, 18:18:14<1 day, 6:49:22] +[titan] 2025-09-09 11:54:12,844 - root - INFO - step: 23145 loss: 2.6345 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7333 global_avg_top_loss: 1.9012 +[titan] 2025-09-09 11:54:12,844 - root - INFO - lr: 8.9161e-06 gnorm: 0.35 [1 day, 18:18:46<1 day, 6:48:49] +[titan] 2025-09-09 11:54:38,349 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:54:44,752 - root - INFO - step: 23150 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9675 +[titan] 2025-09-09 11:54:44,752 - root - INFO - lr: 8.9126e-06 gnorm: 0.34 [1 day, 18:19:18<1 day, 6:48:15] +[titan] 2025-09-09 11:55:16,689 - root - INFO - step: 23155 loss: 2.7659 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7974 global_avg_top_loss: 1.9685 +[titan] 2025-09-09 11:55:16,689 - root - INFO - lr: 8.9091e-06 gnorm: 0.36 [1 day, 18:19:49<1 day, 6:47:41] +[titan] 2025-09-09 11:55:48,589 - root - INFO - step: 23160 loss: 2.7921 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.8010 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 11:55:48,589 - root - INFO - lr: 8.9056e-06 gnorm: 0.35 [1 day, 18:20:21<1 day, 6:47:08] +[titan] 2025-09-09 11:56:20,500 - root - INFO - step: 23165 loss: 2.8496 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.8365 global_avg_top_loss: 2.0131 +[titan] 2025-09-09 11:56:20,500 - root - INFO - lr: 8.9022e-06 gnorm: 0.34 [1 day, 18:20:53<1 day, 6:46:34] +[titan] 2025-09-09 11:56:52,479 - root - INFO - step: 23170 loss: 2.7570 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 11:56:52,479 - root - INFO - lr: 8.8987e-06 gnorm: 0.36 [1 day, 18:21:25<1 day, 6:46:01] +[titan] 2025-09-09 11:57:24,375 - root - INFO - step: 23175 loss: 2.7902 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9879 +[titan] 2025-09-09 11:57:24,375 - root - INFO - lr: 8.8952e-06 gnorm: 0.37 [1 day, 18:21:57<1 day, 6:45:27] +[titan] 2025-09-09 11:57:56,643 - root - INFO - step: 23180 loss: 2.6998 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.98 mfu: 48.94% global_avg_ntp_loss: 0.7586 
global_avg_top_loss: 1.9412 +[titan] 2025-09-09 11:57:56,643 - root - INFO - lr: 8.8918e-06 gnorm: 0.36 [1 day, 18:22:29<1 day, 6:44:54] +[titan] 2025-09-09 11:58:28,425 - root - INFO - step: 23185 loss: 2.7027 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.39 mfu: 49.69% global_avg_ntp_loss: 0.7590 global_avg_top_loss: 1.9437 +[titan] 2025-09-09 11:58:28,425 - root - INFO - lr: 8.8883e-06 gnorm: 0.34 [1 day, 18:23:01<1 day, 6:44:20] +[titan] 2025-09-09 11:59:00,240 - root - INFO - step: 23190 loss: 2.7335 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9642 +[titan] 2025-09-09 11:59:00,240 - root - INFO - lr: 8.8848e-06 gnorm: 0.35 [1 day, 18:23:33<1 day, 6:43:46] +[titan] 2025-09-09 11:59:32,236 - root - INFO - step: 23195 loss: 3.1848 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 1.0295 global_avg_top_loss: 2.1552 +[titan] 2025-09-09 11:59:32,236 - root - INFO - lr: 8.8813e-06 gnorm: 0.34 [1 day, 18:24:05<1 day, 6:43:13] +[titan] 2025-09-09 11:59:57,813 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:00:04,191 - root - INFO - step: 23200 loss: 2.8072 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.8083 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 12:00:04,191 - root - INFO - lr: 8.8779e-06 gnorm: 0.37 [1 day, 18:24:37<1 day, 6:42:39] +[titan] 2025-09-09 12:00:36,129 - root - INFO - step: 23205 loss: 2.8785 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.8585 global_avg_top_loss: 2.0200 +[titan] 2025-09-09 12:00:36,129 - root - INFO - lr: 8.8744e-06 gnorm: 1.67 [1 day, 18:25:09<1 day, 6:42:05] +[titan] 2025-09-09 12:01:08,076 - root - INFO - step: 23210 loss: 2.8774 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.8501 global_avg_top_loss: 2.0273 +[titan] 2025-09-09 12:01:08,076 - root - INFO - lr: 8.8709e-06 gnorm: 0.56 [1 day, 18:25:41<1 day, 6:41:32] +[titan] 2025-09-09 12:01:39,999 - root - INFO - step: 23215 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9541 +[titan] 2025-09-09 12:01:39,999 - root - INFO - lr: 8.8675e-06 gnorm: 0.33 [1 day, 18:26:13<1 day, 6:40:58] +[titan] 2025-09-09 12:02:11,927 - root - INFO - step: 23220 loss: 2.7426 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9631 +[titan] 2025-09-09 12:02:11,927 - root - INFO - lr: 8.8640e-06 gnorm: 0.39 [1 day, 18:26:45<1 day, 6:40:25] +[titan] 2025-09-09 12:02:43,813 - root - INFO - step: 23225 loss: 2.7552 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9696 +[titan] 2025-09-09 12:02:43,813 - root - INFO - lr: 8.8605e-06 gnorm: 0.38 [1 day, 18:27:17<1 day, 6:39:51] +[titan] 2025-09-09 12:03:15,806 - root - INFO - step: 23230 loss: 2.8304 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.8263 
global_avg_top_loss: 2.0041 +[titan] 2025-09-09 12:03:15,806 - root - INFO - lr: 8.8571e-06 gnorm: 0.42 [1 day, 18:27:49<1 day, 6:39:17] +[titan] 2025-09-09 12:03:47,624 - root - INFO - step: 23235 loss: 2.8164 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.83 mfu: 49.63% global_avg_ntp_loss: 0.8141 global_avg_top_loss: 2.0023 +[titan] 2025-09-09 12:03:47,624 - root - INFO - lr: 8.8536e-06 gnorm: 0.35 [1 day, 18:28:20<1 day, 6:38:44] +[titan] 2025-09-09 12:04:19,566 - root - INFO - step: 23240 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.7795 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 12:04:19,567 - root - INFO - lr: 8.8501e-06 gnorm: 0.38 [1 day, 18:28:52<1 day, 6:38:10] +[titan] 2025-09-09 12:04:51,346 - root - INFO - step: 23245 loss: 2.6992 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 12:04:51,346 - root - INFO - lr: 8.8467e-06 gnorm: 0.42 [1 day, 18:29:24<1 day, 6:37:36] +[titan] 2025-09-09 12:05:17,084 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:05:23,482 - root - INFO - step: 23250 loss: 2.7266 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9554 +[titan] 2025-09-09 12:05:23,483 - root - INFO - lr: 8.8432e-06 gnorm: 0.36 [1 day, 18:29:56<1 day, 6:37:03] +[titan] 2025-09-09 12:05:55,347 - root - INFO - step: 23255 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 12:05:55,347 - root - INFO - lr: 8.8397e-06 gnorm: 0.37 [1 day, 18:30:28<1 day, 6:36:29] +[titan] 2025-09-09 12:06:27,341 - root - INFO - step: 23260 loss: 2.7848 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9756 +[titan] 2025-09-09 12:06:27,341 - root - INFO - lr: 8.8363e-06 gnorm: 0.34 [1 day, 18:31:00<1 day, 6:35:56] +[titan] 2025-09-09 12:06:59,310 - root - INFO - step: 23265 loss: 2.6614 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9199 +[titan] 2025-09-09 12:06:59,311 - root - INFO - lr: 8.8328e-06 gnorm: 0.36 [1 day, 18:31:32<1 day, 6:35:22] +[titan] 2025-09-09 12:07:31,117 - root - INFO - step: 23270 loss: 2.7784 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.7939 global_avg_top_loss: 1.9845 +[titan] 2025-09-09 12:07:31,117 - root - INFO - lr: 8.8293e-06 gnorm: 0.37 [1 day, 18:32:04<1 day, 6:34:49] +[titan] 2025-09-09 12:08:03,210 - root - INFO - step: 23275 loss: 3.1576 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.62 mfu: 49.20% global_avg_ntp_loss: 1.0188 global_avg_top_loss: 2.1388 +[titan] 2025-09-09 12:08:03,210 - root - INFO - lr: 8.8259e-06 gnorm: 0.33 [1 day, 18:32:36<1 day, 6:34:15] +[titan] 2025-09-09 12:08:35,038 - root - INFO - step: 23280 loss: 2.6968 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.68 mfu: 49.61% global_avg_ntp_loss: 0.7566 
global_avg_top_loss: 1.9402 +[titan] 2025-09-09 12:08:35,038 - root - INFO - lr: 8.8224e-06 gnorm: 0.32 [1 day, 18:33:08<1 day, 6:33:41] +[titan] 2025-09-09 12:09:06,991 - root - INFO - step: 23285 loss: 2.7542 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 12:09:06,992 - root - INFO - lr: 8.8189e-06 gnorm: 0.34 [1 day, 18:33:40<1 day, 6:33:08] +[titan] 2025-09-09 12:09:39,068 - root - INFO - step: 23290 loss: 2.8147 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.8125 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 12:09:39,068 - root - INFO - lr: 8.8155e-06 gnorm: 0.35 [1 day, 18:34:12<1 day, 6:32:34] +[titan] 2025-09-09 12:10:11,005 - root - INFO - step: 23295 loss: 2.7415 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7804 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 12:10:11,005 - root - INFO - lr: 8.8120e-06 gnorm: 0.35 [1 day, 18:34:44<1 day, 6:32:01] +[titan] 2025-09-09 12:10:36,636 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:10:43,076 - root - INFO - step: 23300 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9650 +[titan] 2025-09-09 12:10:43,076 - root - INFO - lr: 8.8086e-06 gnorm: 0.35 [1 day, 18:35:16<1 day, 6:31:27] +[titan] 2025-09-09 12:11:15,031 - root - INFO - step: 23305 loss: 2.7628 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9760 +[titan] 2025-09-09 12:11:15,031 - root - INFO - lr: 8.8051e-06 gnorm: 0.35 [1 day, 18:35:48<1 day, 6:30:54] +[titan] 2025-09-09 12:11:47,043 - root - INFO - step: 23310 loss: 2.7991 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9968 +[titan] 2025-09-09 12:11:47,044 - root - INFO - lr: 8.8016e-06 gnorm: 0.53 [1 day, 18:36:20<1 day, 6:30:20] +[titan] 2025-09-09 12:12:19,089 - root - INFO - step: 23315 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.8075 global_avg_top_loss: 1.9987 +[titan] 2025-09-09 12:12:19,090 - root - INFO - lr: 8.7982e-06 gnorm: 0.38 [1 day, 18:36:52<1 day, 6:29:47] +[titan] 2025-09-09 12:12:50,889 - root - INFO - step: 23320 loss: 2.7351 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7736 global_avg_top_loss: 1.9614 +[titan] 2025-09-09 12:12:50,890 - root - INFO - lr: 8.7947e-06 gnorm: 0.34 [1 day, 18:37:24<1 day, 6:29:13] +[titan] 2025-09-09 12:13:22,907 - root - INFO - step: 23325 loss: 2.6636 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.7399 global_avg_top_loss: 1.9236 +[titan] 2025-09-09 12:13:22,908 - root - INFO - lr: 8.7912e-06 gnorm: 0.36 [1 day, 18:37:56<1 day, 6:28:39] +[titan] 2025-09-09 12:13:54,931 - root - INFO - step: 23330 loss: 2.6635 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.7441 
global_avg_top_loss: 1.9194 +[titan] 2025-09-09 12:13:54,931 - root - INFO - lr: 8.7878e-06 gnorm: 0.33 [1 day, 18:38:28<1 day, 6:28:06] +[titan] 2025-09-09 12:14:26,891 - root - INFO - step: 23335 loss: 2.5600 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.6974 global_avg_top_loss: 1.8626 +[titan] 2025-09-09 12:14:26,891 - root - INFO - lr: 8.7843e-06 gnorm: 0.41 [1 day, 18:39:00<1 day, 6:27:32] +[titan] 2025-09-09 12:14:58,919 - root - INFO - step: 23340 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.8424 global_avg_top_loss: 1.9986 +[titan] 2025-09-09 12:14:58,919 - root - INFO - lr: 8.7809e-06 gnorm: 0.34 [1 day, 18:39:32<1 day, 6:26:59] +[titan] 2025-09-09 12:15:30,777 - root - INFO - step: 23345 loss: 2.7061 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9427 +[titan] 2025-09-09 12:15:30,777 - root - INFO - lr: 8.7774e-06 gnorm: 0.35 [1 day, 18:40:04<1 day, 6:26:25] +[titan] 2025-09-09 12:15:56,333 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:16:02,741 - root - INFO - step: 23350 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7796 global_avg_top_loss: 1.9662 +[titan] 2025-09-09 12:16:02,741 - root - INFO - lr: 8.7740e-06 gnorm: 0.38 [1 day, 18:40:36<1 day, 6:25:52] +[titan] 2025-09-09 12:16:34,601 - root - INFO - step: 23355 loss: 3.2101 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 1.0401 global_avg_top_loss: 2.1699 +[titan] 2025-09-09 12:16:34,601 - root - INFO - lr: 8.7705e-06 gnorm: 0.60 [1 day, 18:41:07<1 day, 6:25:18] +[titan] 2025-09-09 12:17:06,548 - root - INFO - step: 23360 loss: 2.7253 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9574 +[titan] 2025-09-09 12:17:06,548 - root - INFO - lr: 8.7670e-06 gnorm: 0.35 [1 day, 18:41:39<1 day, 6:24:44] +[titan] 2025-09-09 12:17:38,393 - root - INFO - step: 23365 loss: 2.7058 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9453 +[titan] 2025-09-09 12:17:38,393 - root - INFO - lr: 8.7636e-06 gnorm: 0.37 [1 day, 18:42:11<1 day, 6:24:11] +[titan] 2025-09-09 12:18:10,211 - root - INFO - step: 23370 loss: 2.7983 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.83 mfu: 49.63% global_avg_ntp_loss: 0.8055 global_avg_top_loss: 1.9927 +[titan] 2025-09-09 12:18:10,211 - root - INFO - lr: 8.7601e-06 gnorm: 0.42 [1 day, 18:42:43<1 day, 6:23:37] +[titan] 2025-09-09 12:18:42,398 - root - INFO - step: 23375 loss: 2.7652 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.20 mfu: 49.06% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 12:18:42,398 - root - INFO - lr: 8.7567e-06 gnorm: 0.35 [1 day, 18:43:15<1 day, 6:23:04] +[titan] 2025-09-09 12:19:14,226 - root - INFO - step: 23380 loss: 2.7757 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7929 
global_avg_top_loss: 1.9828 +[titan] 2025-09-09 12:19:14,226 - root - INFO - lr: 8.7532e-06 gnorm: 0.37 [1 day, 18:43:47<1 day, 6:22:30] +[titan] 2025-09-09 12:19:46,052 - root - INFO - step: 23385 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.7895 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 12:19:46,052 - root - INFO - lr: 8.7497e-06 gnorm: 0.40 [1 day, 18:44:19<1 day, 6:21:56] +[titan] 2025-09-09 12:20:17,962 - root - INFO - step: 23390 loss: 3.0177 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.9169 global_avg_top_loss: 2.1008 +[titan] 2025-09-09 12:20:17,962 - root - INFO - lr: 8.7463e-06 gnorm: 0.46 [1 day, 18:44:51<1 day, 6:21:23] +[titan] 2025-09-09 12:20:50,114 - root - INFO - step: 23395 loss: 2.8390 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0165 +[titan] 2025-09-09 12:20:50,114 - root - INFO - lr: 8.7428e-06 gnorm: 0.35 [1 day, 18:45:23<1 day, 6:20:49] +[titan] 2025-09-09 12:21:15,917 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:21:22,196 - root - INFO - step: 23400 loss: 2.7674 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7905 global_avg_top_loss: 1.9769 +[titan] 2025-09-09 12:21:22,196 - root - INFO - lr: 8.7394e-06 gnorm: 0.34 [1 day, 18:45:55<1 day, 6:20:16] +[titan] 2025-09-09 12:21:54,058 - root - INFO - step: 23405 loss: 2.7830 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9882 +[titan] 2025-09-09 12:21:54,058 - root - INFO - lr: 8.7359e-06 gnorm: 0.40 [1 day, 18:46:27<1 day, 6:19:42] +[titan] 2025-09-09 12:22:25,949 - root - INFO - step: 23410 loss: 2.8381 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.8232 global_avg_top_loss: 2.0149 +[titan] 2025-09-09 12:22:25,950 - root - INFO - lr: 8.7325e-06 gnorm: 0.38 [1 day, 18:46:59<1 day, 6:19:09] +[titan] 2025-09-09 12:22:58,010 - root - INFO - step: 23415 loss: 2.6745 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7517 global_avg_top_loss: 1.9228 +[titan] 2025-09-09 12:22:58,010 - root - INFO - lr: 8.7290e-06 gnorm: 0.35 [1 day, 18:47:31<1 day, 6:18:35] +[titan] 2025-09-09 12:23:29,890 - root - INFO - step: 23420 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.8049 global_avg_top_loss: 1.9914 +[titan] 2025-09-09 12:23:29,890 - root - INFO - lr: 8.7256e-06 gnorm: 0.39 [1 day, 18:48:03<1 day, 6:18:01] +[titan] 2025-09-09 12:24:01,955 - root - INFO - step: 23425 loss: 2.6885 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9272 +[titan] 2025-09-09 12:24:01,956 - root - INFO - lr: 8.7221e-06 gnorm: 0.39 [1 day, 18:48:35<1 day, 6:17:28] +[titan] 2025-09-09 12:24:33,892 - root - INFO - step: 23430 loss: 2.7514 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7817 
global_avg_top_loss: 1.9698 +[titan] 2025-09-09 12:24:33,892 - root - INFO - lr: 8.7187e-06 gnorm: 0.36 [1 day, 18:49:07<1 day, 6:16:54] +[titan] 2025-09-09 12:25:05,823 - root - INFO - step: 23435 loss: 3.1636 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 1.0187 global_avg_top_loss: 2.1450 +[titan] 2025-09-09 12:25:05,823 - root - INFO - lr: 8.7152e-06 gnorm: 0.43 [1 day, 18:49:39<1 day, 6:16:21] +[titan] 2025-09-09 12:25:37,660 - root - INFO - step: 23440 loss: 2.7340 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9614 +[titan] 2025-09-09 12:25:37,661 - root - INFO - lr: 8.7117e-06 gnorm: 0.36 [1 day, 18:50:10<1 day, 6:15:47] +[titan] 2025-09-09 12:26:09,802 - root - INFO - step: 23445 loss: 2.7629 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.89 mfu: 49.13% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9759 +[titan] 2025-09-09 12:26:09,802 - root - INFO - lr: 8.7083e-06 gnorm: 0.37 [1 day, 18:50:43<1 day, 6:15:14] +[titan] 2025-09-09 12:26:35,445 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:26:41,841 - root - INFO - step: 23450 loss: 2.7548 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 12:26:41,841 - root - INFO - lr: 8.7048e-06 gnorm: 0.41 [1 day, 18:51:15<1 day, 6:14:40] +[titan] 2025-09-09 12:27:13,893 - root - INFO - step: 23455 loss: 2.7984 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.8033 global_avg_top_loss: 1.9950 +[titan] 2025-09-09 12:27:13,893 - root - INFO - lr: 8.7014e-06 gnorm: 0.35 [1 day, 18:51:47<1 day, 6:14:07] +[titan] 2025-09-09 12:27:45,931 - root - INFO - step: 23460 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9520 +[titan] 2025-09-09 12:27:45,931 - root - INFO - lr: 8.6979e-06 gnorm: 0.34 [1 day, 18:52:19<1 day, 6:13:33] +[titan] 2025-09-09 12:28:17,891 - root - INFO - step: 23465 loss: 2.8195 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.8143 global_avg_top_loss: 2.0052 +[titan] 2025-09-09 12:28:17,892 - root - INFO - lr: 8.6945e-06 gnorm: 0.37 [1 day, 18:52:51<1 day, 6:13:00] +[titan] 2025-09-09 12:28:49,979 - root - INFO - step: 23470 loss: 3.1037 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.9470 global_avg_top_loss: 2.1566 +[titan] 2025-09-09 12:28:49,979 - root - INFO - lr: 8.6910e-06 gnorm: 0.39 [1 day, 18:53:23<1 day, 6:12:26] +[titan] 2025-09-09 12:29:21,795 - root - INFO - step: 23475 loss: 2.6998 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7643 global_avg_top_loss: 1.9354 +[titan] 2025-09-09 12:29:21,796 - root - INFO - lr: 8.6876e-06 gnorm: 0.35 [1 day, 18:53:55<1 day, 6:11:53] +[titan] 2025-09-09 12:29:54,059 - root - INFO - step: 23480 loss: 2.7357 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.04 mfu: 48.94% global_avg_ntp_loss: 0.7758 
global_avg_top_loss: 1.9598 +[titan] 2025-09-09 12:29:54,060 - root - INFO - lr: 8.6841e-06 gnorm: 0.34 [1 day, 18:54:27<1 day, 6:11:19] +[titan] 2025-09-09 12:30:26,233 - root - INFO - step: 23485 loss: 2.8005 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.41 mfu: 49.08% global_avg_ntp_loss: 0.8039 global_avg_top_loss: 1.9966 +[titan] 2025-09-09 12:30:26,233 - root - INFO - lr: 8.6807e-06 gnorm: 0.34 [1 day, 18:54:59<1 day, 6:10:46] +[titan] 2025-09-09 12:30:58,284 - root - INFO - step: 23490 loss: 2.7755 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 12:30:58,284 - root - INFO - lr: 8.6772e-06 gnorm: 0.34 [1 day, 18:55:31<1 day, 6:10:12] +[titan] 2025-09-09 12:31:30,358 - root - INFO - step: 23495 loss: 2.6809 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.7512 global_avg_top_loss: 1.9297 +[titan] 2025-09-09 12:31:30,358 - root - INFO - lr: 8.6738e-06 gnorm: 0.35 [1 day, 18:56:03<1 day, 6:09:39] +[titan] 2025-09-09 12:31:56,191 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:32:02,546 - root - INFO - step: 23500 loss: 2.7743 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.19 mfu: 49.06% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9817 +[titan] 2025-09-09 12:32:02,547 - root - INFO - lr: 8.6703e-06 gnorm: 0.37 [1 day, 18:56:35<1 day, 6:09:05] +[titan] 2025-09-09 12:32:34,615 - root - INFO - step: 23505 loss: 2.6703 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9149 +[titan] 2025-09-09 12:32:34,616 - root - INFO - lr: 8.6669e-06 gnorm: 0.35 [1 day, 18:57:07<1 day, 6:08:32] +[titan] 2025-09-09 12:33:06,979 - root - INFO - step: 23510 loss: 2.7271 memory: 122.03GiB(87.57%) tps: 10,125 tflops: 482.55 mfu: 48.79% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9545 +[titan] 2025-09-09 12:33:06,980 - root - INFO - lr: 8.6634e-06 gnorm: 0.34 [1 day, 18:57:40<1 day, 6:07:59] +[titan] 2025-09-09 12:33:39,117 - root - INFO - step: 23515 loss: 3.1399 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.95 mfu: 49.14% global_avg_ntp_loss: 0.9570 global_avg_top_loss: 2.1829 +[titan] 2025-09-09 12:33:39,117 - root - INFO - lr: 8.6600e-06 gnorm: 0.39 [1 day, 18:58:12<1 day, 6:07:25] +[titan] 2025-09-09 12:34:11,249 - root - INFO - step: 23520 loss: 2.7517 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.04 mfu: 49.14% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9640 +[titan] 2025-09-09 12:34:11,249 - root - INFO - lr: 8.6566e-06 gnorm: 0.36 [1 day, 18:58:44<1 day, 6:06:52] +[titan] 2025-09-09 12:34:43,177 - root - INFO - step: 23525 loss: 2.6464 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 12:34:43,177 - root - INFO - lr: 8.6531e-06 gnorm: 0.36 [1 day, 18:59:16<1 day, 6:06:18] +[titan] 2025-09-09 12:35:15,249 - root - INFO - step: 23530 loss: 2.8099 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.8116 
global_avg_top_loss: 1.9983 +[titan] 2025-09-09 12:35:15,250 - root - INFO - lr: 8.6497e-06 gnorm: 0.35 [1 day, 18:59:48<1 day, 6:05:45] +[titan] 2025-09-09 12:35:47,336 - root - INFO - step: 23535 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7391 global_avg_top_loss: 1.9121 +[titan] 2025-09-09 12:35:47,336 - root - INFO - lr: 8.6462e-06 gnorm: 0.36 [1 day, 19:00:20<1 day, 6:05:11] +[titan] 2025-09-09 12:36:19,207 - root - INFO - step: 23540 loss: 2.7836 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9876 +[titan] 2025-09-09 12:36:19,208 - root - INFO - lr: 8.6428e-06 gnorm: 0.35 [1 day, 19:00:52<1 day, 6:04:38] +[titan] 2025-09-09 12:36:51,143 - root - INFO - step: 23545 loss: 2.8085 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 2.0018 +[titan] 2025-09-09 12:36:51,143 - root - INFO - lr: 8.6393e-06 gnorm: 0.34 [1 day, 19:01:24<1 day, 6:04:04] +[titan] 2025-09-09 12:37:16,854 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:37:23,332 - root - INFO - step: 23550 loss: 2.7229 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 12:37:23,333 - root - INFO - lr: 8.6359e-06 gnorm: 0.37 [1 day, 19:01:56<1 day, 6:03:31] +[titan] 2025-09-09 12:37:36,257 - root - INFO - Dumping profiler traces at step 23552 +[titan] 2025-09-09 12:37:36,315 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 12:37:55,411 - root - INFO - step: 23555 loss: 2.7637 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.84 mfu: 49.23% global_avg_ntp_loss: 0.7858 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 12:37:55,411 - root - INFO - lr: 8.6324e-06 gnorm: 0.40 [1 day, 19:02:28<1 day, 6:02:57] +[titan] 2025-09-09 12:38:27,564 - root - INFO - step: 23560 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.7723 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 12:38:27,564 - root - INFO - lr: 8.6290e-06 gnorm: 0.34 [1 day, 19:03:00<1 day, 6:02:24] +[titan] 2025-09-09 12:38:59,577 - root - INFO - step: 23565 loss: 2.7718 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7922 global_avg_top_loss: 1.9796 +[titan] 2025-09-09 12:38:59,577 - root - INFO - lr: 8.6255e-06 gnorm: 0.36 [1 day, 19:03:32<1 day, 6:01:51] +[titan] 2025-09-09 12:39:31,578 - root - INFO - step: 23570 loss: 3.0739 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.35% global_avg_ntp_loss: 0.9525 global_avg_top_loss: 2.1214 +[titan] 2025-09-09 12:39:31,578 - root - INFO - lr: 8.6221e-06 gnorm: 0.35 [1 day, 19:04:04<1 day, 6:01:17] +[titan] 2025-09-09 12:40:03,797 - root - INFO - step: 23575 loss: 2.6237 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.72 mfu: 49.01% global_avg_ntp_loss: 0.7232 global_avg_top_loss: 1.9004 +[titan] 2025-09-09 12:40:03,797 - root - INFO - lr: 8.6187e-06 gnorm: 0.34 [1 day, 19:04:37<1 day, 
6:00:44] +[titan] 2025-09-09 12:40:35,544 - root - INFO - step: 23580 loss: 2.7272 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.92 mfu: 49.74% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 12:40:35,545 - root - INFO - lr: 8.6152e-06 gnorm: 0.34 [1 day, 19:05:08<1 day, 6:00:10] +[titan] 2025-09-09 12:41:07,453 - root - INFO - step: 23585 loss: 2.6902 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7624 global_avg_top_loss: 1.9278 +[titan] 2025-09-09 12:41:07,454 - root - INFO - lr: 8.6118e-06 gnorm: 0.35 [1 day, 19:05:40<1 day, 5:59:36] +[titan] 2025-09-09 12:41:39,552 - root - INFO - step: 23590 loss: 2.7357 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7731 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 12:41:39,552 - root - INFO - lr: 8.6083e-06 gnorm: 0.34 [1 day, 19:06:12<1 day, 5:59:03] +[titan] 2025-09-09 12:42:11,437 - root - INFO - step: 23595 loss: 2.6897 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 12:42:11,437 - root - INFO - lr: 8.6049e-06 gnorm: 0.37 [1 day, 19:06:44<1 day, 5:58:29] +[titan] 2025-09-09 12:42:36,968 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:42:43,327 - root - INFO - step: 23600 loss: 2.7191 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 12:42:43,327 - root - INFO - lr: 8.6015e-06 gnorm: 0.35 [1 day, 19:07:16<1 day, 5:57:56] +[titan] 2025-09-09 12:43:15,414 - root - INFO - step: 23605 loss: 2.9160 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.8723 global_avg_top_loss: 2.0437 +[titan] 2025-09-09 12:43:15,415 - root - INFO - lr: 8.5980e-06 gnorm: 0.37 [1 day, 19:07:48<1 day, 5:57:22] +[titan] 2025-09-09 12:43:47,502 - root - INFO - step: 23610 loss: 2.6779 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9303 +[titan] 2025-09-09 12:43:47,503 - root - INFO - lr: 8.5946e-06 gnorm: 0.34 [1 day, 19:08:20<1 day, 5:56:49] +[titan] 2025-09-09 12:44:19,582 - root - INFO - step: 23615 loss: 2.7448 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7784 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 12:44:19,582 - root - INFO - lr: 8.5911e-06 gnorm: 0.35 [1 day, 19:08:52<1 day, 5:56:15] +[titan] 2025-09-09 12:44:51,378 - root - INFO - step: 23620 loss: 2.7088 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.16 mfu: 49.66% global_avg_ntp_loss: 0.7653 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 12:44:51,379 - root - INFO - lr: 8.5877e-06 gnorm: 0.37 [1 day, 19:09:24<1 day, 5:55:42] +[titan] 2025-09-09 12:45:23,511 - root - INFO - step: 23625 loss: 2.8010 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9945 +[titan] 2025-09-09 12:45:23,511 - root - INFO - lr: 8.5843e-06 gnorm: 0.35 [1 day, 19:09:56<1 day, 5:55:08] +[titan] 2025-09-09 12:45:55,505 - root - INFO - step: 23630 loss: 2.9796 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.8844 
global_avg_top_loss: 2.0952 +[titan] 2025-09-09 12:45:55,505 - root - INFO - lr: 8.5808e-06 gnorm: 0.36 [1 day, 19:10:28<1 day, 5:54:35] +[titan] 2025-09-09 12:46:27,671 - root - INFO - step: 23635 loss: 2.7300 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.51 mfu: 49.09% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 12:46:27,671 - root - INFO - lr: 8.5774e-06 gnorm: 0.36 [1 day, 19:11:00<1 day, 5:54:01] +[titan] 2025-09-09 12:46:59,664 - root - INFO - step: 23640 loss: 2.7168 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7695 global_avg_top_loss: 1.9473 +[titan] 2025-09-09 12:46:59,664 - root - INFO - lr: 8.5739e-06 gnorm: 0.36 [1 day, 19:11:32<1 day, 5:53:28] +[titan] 2025-09-09 12:47:31,749 - root - INFO - step: 23645 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.22% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9900 +[titan] 2025-09-09 12:47:31,750 - root - INFO - lr: 8.5705e-06 gnorm: 0.35 [1 day, 19:12:04<1 day, 5:52:55] +[titan] 2025-09-09 12:47:57,383 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:48:03,771 - root - INFO - step: 23650 loss: 2.8259 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.8204 global_avg_top_loss: 2.0055 +[titan] 2025-09-09 12:48:03,772 - root - INFO - lr: 8.5671e-06 gnorm: 0.41 [1 day, 19:12:37<1 day, 5:52:21] +[titan] 2025-09-09 12:48:36,003 - root - INFO - step: 23655 loss: 2.7630 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.54 mfu: 48.99% global_avg_ntp_loss: 0.7897 global_avg_top_loss: 1.9733 +[titan] 2025-09-09 12:48:36,003 - root - INFO - lr: 8.5636e-06 gnorm: 0.41 [1 day, 19:13:09<1 day, 5:51:48] +[titan] 2025-09-09 12:49:07,820 - root - INFO - step: 23660 loss: 2.7133 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 12:49:07,821 - root - INFO - lr: 8.5602e-06 gnorm: 0.38 [1 day, 19:13:41<1 day, 5:51:14] +[titan] 2025-09-09 12:49:39,931 - root - INFO - step: 23665 loss: 2.7182 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 12:49:39,931 - root - INFO - lr: 8.5568e-06 gnorm: 0.34 [1 day, 19:14:13<1 day, 5:50:41] +[titan] 2025-09-09 12:50:11,941 - root - INFO - step: 23670 loss: 2.7891 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7985 global_avg_top_loss: 1.9905 +[titan] 2025-09-09 12:50:11,941 - root - INFO - lr: 8.5533e-06 gnorm: 0.36 [1 day, 19:14:45<1 day, 5:50:07] +[titan] 2025-09-09 12:50:43,859 - root - INFO - step: 23675 loss: 2.7823 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.8097 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 12:50:43,859 - root - INFO - lr: 8.5499e-06 gnorm: 0.34 [1 day, 19:15:17<1 day, 5:49:34] +[titan] 2025-09-09 12:51:15,789 - root - INFO - step: 23680 loss: 2.8048 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.8042 
global_avg_top_loss: 2.0006 +[titan] 2025-09-09 12:51:15,789 - root - INFO - lr: 8.5464e-06 gnorm: 0.36 [1 day, 19:15:49<1 day, 5:49:00] +[titan] 2025-09-09 12:51:47,680 - root - INFO - step: 23685 loss: 3.0752 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.9326 global_avg_top_loss: 2.1426 +[titan] 2025-09-09 12:51:47,681 - root - INFO - lr: 8.5430e-06 gnorm: 0.52 [1 day, 19:16:20<1 day, 5:48:26] +[titan] 2025-09-09 12:52:19,455 - root - INFO - step: 23690 loss: 2.7414 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.50 mfu: 49.70% global_avg_ntp_loss: 0.7755 global_avg_top_loss: 1.9659 +[titan] 2025-09-09 12:52:19,455 - root - INFO - lr: 8.5396e-06 gnorm: 0.35 [1 day, 19:16:52<1 day, 5:47:53] +[titan] 2025-09-09 12:52:51,258 - root - INFO - step: 23695 loss: 2.8033 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9955 +[titan] 2025-09-09 12:52:51,258 - root - INFO - lr: 8.5361e-06 gnorm: 0.35 [1 day, 19:17:24<1 day, 5:47:19] +[titan] 2025-09-09 12:53:17,026 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:53:23,372 - root - INFO - step: 23700 loss: 2.7465 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 0.7785 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 12:53:23,372 - root - INFO - lr: 8.5327e-06 gnorm: 0.36 [1 day, 19:17:56<1 day, 5:46:46] +[titan] 2025-09-09 12:53:55,414 - root - INFO - step: 23705 loss: 2.7954 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.8048 global_avg_top_loss: 1.9906 +[titan] 2025-09-09 12:53:55,414 - root - INFO - lr: 8.5293e-06 gnorm: 0.35 [1 day, 19:18:28<1 day, 5:46:12] +[titan] 2025-09-09 12:54:27,389 - root - INFO - step: 23710 loss: 2.8390 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.8229 global_avg_top_loss: 2.0161 +[titan] 2025-09-09 12:54:27,389 - root - INFO - lr: 8.5258e-06 gnorm: 0.38 [1 day, 19:19:00<1 day, 5:45:39] +[titan] 2025-09-09 12:54:59,612 - root - INFO - step: 23715 loss: 2.7635 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.66 mfu: 49.01% global_avg_ntp_loss: 0.7885 global_avg_top_loss: 1.9750 +[titan] 2025-09-09 12:54:59,612 - root - INFO - lr: 8.5224e-06 gnorm: 0.35 [1 day, 19:19:32<1 day, 5:45:05] +[titan] 2025-09-09 12:55:31,684 - root - INFO - step: 23720 loss: 2.7533 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 12:55:31,684 - root - INFO - lr: 8.5190e-06 gnorm: 0.37 [1 day, 19:20:04<1 day, 5:44:32] +[titan] 2025-09-09 12:56:03,626 - root - INFO - step: 23725 loss: 2.7679 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7896 global_avg_top_loss: 1.9782 +[titan] 2025-09-09 12:56:03,627 - root - INFO - lr: 8.5156e-06 gnorm: 0.35 [1 day, 19:20:36<1 day, 5:43:58] +[titan] 2025-09-09 12:56:35,610 - root - INFO - step: 23730 loss: 2.7012 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.7586 
global_avg_top_loss: 1.9426 +[titan] 2025-09-09 12:56:35,610 - root - INFO - lr: 8.5121e-06 gnorm: 0.33 [1 day, 19:21:08<1 day, 5:43:25] +[titan] 2025-09-09 12:57:07,595 - root - INFO - step: 23735 loss: 2.6789 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 12:57:07,595 - root - INFO - lr: 8.5087e-06 gnorm: 0.34 [1 day, 19:21:40<1 day, 5:42:51] +[titan] 2025-09-09 12:57:39,488 - root - INFO - step: 23740 loss: 2.7761 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9809 +[titan] 2025-09-09 12:57:39,489 - root - INFO - lr: 8.5053e-06 gnorm: 0.34 [1 day, 19:22:12<1 day, 5:42:18] +[titan] 2025-09-09 12:58:11,608 - root - INFO - step: 23745 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.7937 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 12:58:11,608 - root - INFO - lr: 8.5018e-06 gnorm: 0.35 [1 day, 19:22:44<1 day, 5:41:45] +[titan] 2025-09-09 12:58:37,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:58:43,797 - root - INFO - step: 23750 loss: 2.7206 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.7687 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 12:58:43,798 - root - INFO - lr: 8.4984e-06 gnorm: 0.35 [1 day, 19:23:17<1 day, 5:41:11] +[titan] 2025-09-09 12:59:15,640 - root - INFO - step: 23755 loss: 2.7171 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 12:59:15,640 - root - INFO - lr: 8.4950e-06 gnorm: 0.34 [1 day, 19:23:48<1 day, 5:40:38] +[titan] 2025-09-09 12:59:47,612 - root - INFO - step: 23760 loss: 2.7101 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.7638 global_avg_top_loss: 1.9463 +[titan] 2025-09-09 12:59:47,612 - root - INFO - lr: 8.4915e-06 gnorm: 0.37 [1 day, 19:24:20<1 day, 5:40:04] +[titan] 2025-09-09 13:00:20,050 - root - INFO - step: 23765 loss: 2.7622 memory: 122.03GiB(87.57%) tps: 10,102 tflops: 481.44 mfu: 48.68% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 13:00:20,051 - root - INFO - lr: 8.4881e-06 gnorm: 0.35 [1 day, 19:24:53<1 day, 5:39:31] +[titan] 2025-09-09 13:00:51,953 - root - INFO - step: 23770 loss: 2.6923 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9362 +[titan] 2025-09-09 13:00:51,953 - root - INFO - lr: 8.4847e-06 gnorm: 0.39 [1 day, 19:25:25<1 day, 5:38:57] +[titan] 2025-09-09 13:01:24,149 - root - INFO - step: 23775 loss: 2.7301 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.06 mfu: 49.05% global_avg_ntp_loss: 0.7714 global_avg_top_loss: 1.9587 +[titan] 2025-09-09 13:01:24,150 - root - INFO - lr: 8.4813e-06 gnorm: 0.44 [1 day, 19:25:57<1 day, 5:38:24] +[titan] 2025-09-09 13:01:56,233 - root - INFO - step: 23780 loss: 2.7156 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.7650 
global_avg_top_loss: 1.9506 +[titan] 2025-09-09 13:01:56,234 - root - INFO - lr: 8.4778e-06 gnorm: 0.34 [1 day, 19:26:29<1 day, 5:37:51] +[titan] 2025-09-09 13:02:28,320 - root - INFO - step: 23785 loss: 2.7678 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9791 +[titan] 2025-09-09 13:02:28,321 - root - INFO - lr: 8.4744e-06 gnorm: 0.36 [1 day, 19:27:01<1 day, 5:37:17] +[titan] 2025-09-09 13:03:00,254 - root - INFO - step: 23790 loss: 2.7838 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.8106 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 13:03:00,254 - root - INFO - lr: 8.4710e-06 gnorm: 0.35 [1 day, 19:27:33<1 day, 5:36:44] +[titan] 2025-09-09 13:03:32,362 - root - INFO - step: 23795 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.7861 global_avg_top_loss: 1.9717 +[titan] 2025-09-09 13:03:32,362 - root - INFO - lr: 8.4676e-06 gnorm: 0.41 [1 day, 19:28:05<1 day, 5:36:10] +[titan] 2025-09-09 13:03:58,011 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:04:04,398 - root - INFO - step: 23800 loss: 2.6799 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7520 global_avg_top_loss: 1.9279 +[titan] 2025-09-09 13:04:04,398 - root - INFO - lr: 8.4641e-06 gnorm: 0.37 [1 day, 19:28:37<1 day, 5:35:37] +[titan] 2025-09-09 13:04:36,295 - root - INFO - step: 23805 loss: 2.7871 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9899 +[titan] 2025-09-09 13:04:36,295 - root - INFO - lr: 8.4607e-06 gnorm: 0.35 [1 day, 19:29:09<1 day, 5:35:03] +[titan] 2025-09-09 13:05:08,114 - root - INFO - step: 23810 loss: 2.7643 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 13:05:08,114 - root - INFO - lr: 8.4573e-06 gnorm: 0.37 [1 day, 19:29:41<1 day, 5:34:30] +[titan] 2025-09-09 13:05:40,071 - root - INFO - step: 23815 loss: 2.8376 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.8248 global_avg_top_loss: 2.0128 +[titan] 2025-09-09 13:05:40,071 - root - INFO - lr: 8.4539e-06 gnorm: 0.39 [1 day, 19:30:13<1 day, 5:33:56] +[titan] 2025-09-09 13:06:12,175 - root - INFO - step: 23820 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 13:06:12,175 - root - INFO - lr: 8.4504e-06 gnorm: 0.38 [1 day, 19:30:45<1 day, 5:33:23] +[titan] 2025-09-09 13:06:44,115 - root - INFO - step: 23825 loss: 2.7898 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.8021 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 13:06:44,115 - root - INFO - lr: 8.4470e-06 gnorm: 0.36 [1 day, 19:31:17<1 day, 5:32:49] +[titan] 2025-09-09 13:07:15,889 - root - INFO - step: 23830 loss: 3.1567 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.50 mfu: 49.70% global_avg_ntp_loss: 1.0199 
global_avg_top_loss: 2.1368 +[titan] 2025-09-09 13:07:15,889 - root - INFO - lr: 8.4436e-06 gnorm: 0.44 [1 day, 19:31:49<1 day, 5:32:15] +[titan] 2025-09-09 13:07:47,767 - root - INFO - step: 23835 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9939 +[titan] 2025-09-09 13:07:47,767 - root - INFO - lr: 8.4402e-06 gnorm: 0.35 [1 day, 19:32:20<1 day, 5:31:42] +[titan] 2025-09-09 13:08:19,662 - root - INFO - step: 23840 loss: 2.6894 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9338 +[titan] 2025-09-09 13:08:19,663 - root - INFO - lr: 8.4367e-06 gnorm: 0.36 [1 day, 19:32:52<1 day, 5:31:08] +[titan] 2025-09-09 13:08:51,714 - root - INFO - step: 23845 loss: 2.7534 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9676 +[titan] 2025-09-09 13:08:51,714 - root - INFO - lr: 8.4333e-06 gnorm: 0.35 [1 day, 19:33:24<1 day, 5:30:35] +[titan] 2025-09-09 13:09:17,453 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:09:23,828 - root - INFO - step: 23850 loss: 2.7687 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9710 +[titan] 2025-09-09 13:09:23,828 - root - INFO - lr: 8.4299e-06 gnorm: 0.38 [1 day, 19:33:57<1 day, 5:30:02] +[titan] 2025-09-09 13:09:55,719 - root - INFO - step: 23855 loss: 2.7126 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.7691 global_avg_top_loss: 1.9434 +[titan] 2025-09-09 13:09:55,719 - root - INFO - lr: 8.4265e-06 gnorm: 0.34 [1 day, 19:34:28<1 day, 5:29:28] +[titan] 2025-09-09 13:10:27,633 - root - INFO - step: 23860 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.36 mfu: 49.48% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9454 +[titan] 2025-09-09 13:10:27,633 - root - INFO - lr: 8.4231e-06 gnorm: 0.78 [1 day, 19:35:00<1 day, 5:28:54] +[titan] 2025-09-09 13:10:59,580 - root - INFO - step: 23865 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9539 +[titan] 2025-09-09 13:10:59,580 - root - INFO - lr: 8.4196e-06 gnorm: 0.37 [1 day, 19:35:32<1 day, 5:28:21] +[titan] 2025-09-09 13:11:31,518 - root - INFO - step: 23870 loss: 2.7358 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 13:11:31,518 - root - INFO - lr: 8.4162e-06 gnorm: 0.35 [1 day, 19:36:04<1 day, 5:27:47] +[titan] 2025-09-09 13:12:03,634 - root - INFO - step: 23875 loss: 2.7294 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.27 mfu: 49.17% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 13:12:03,634 - root - INFO - lr: 8.4128e-06 gnorm: 0.35 [1 day, 19:36:36<1 day, 5:27:14] +[titan] 2025-09-09 13:12:35,542 - root - INFO - step: 23880 loss: 2.6747 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7496 
global_avg_top_loss: 1.9251 +[titan] 2025-09-09 13:12:35,543 - root - INFO - lr: 8.4094e-06 gnorm: 0.36 [1 day, 19:37:08<1 day, 5:26:40] +[titan] 2025-09-09 13:13:07,652 - root - INFO - step: 23885 loss: 2.7978 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.38 mfu: 49.18% global_avg_ntp_loss: 0.8052 global_avg_top_loss: 1.9926 +[titan] 2025-09-09 13:13:07,652 - root - INFO - lr: 8.4060e-06 gnorm: 0.35 [1 day, 19:37:40<1 day, 5:26:07] +[titan] 2025-09-09 13:13:39,698 - root - INFO - step: 23890 loss: 2.6990 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 13:13:39,698 - root - INFO - lr: 8.4025e-06 gnorm: 0.36 [1 day, 19:38:12<1 day, 5:25:34] +[titan] 2025-09-09 13:14:11,783 - root - INFO - step: 23895 loss: 2.7727 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 2.0007 +[titan] 2025-09-09 13:14:11,783 - root - INFO - lr: 8.3991e-06 gnorm: 1.27 [1 day, 19:38:45<1 day, 5:25:00] +[titan] 2025-09-09 13:14:37,525 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:14:43,968 - root - INFO - step: 23900 loss: 2.7869 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.7982 global_avg_top_loss: 1.9887 +[titan] 2025-09-09 13:14:43,968 - root - INFO - lr: 8.3957e-06 gnorm: 0.36 [1 day, 19:39:17<1 day, 5:24:27] +[titan] 2025-09-09 13:15:15,839 - root - INFO - step: 23905 loss: 2.7571 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 13:15:15,840 - root - INFO - lr: 8.3923e-06 gnorm: 0.79 [1 day, 19:39:49<1 day, 5:23:53] +[titan] 2025-09-09 13:15:47,681 - root - INFO - step: 23910 loss: 3.0514 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.9716 global_avg_top_loss: 2.0798 +[titan] 2025-09-09 13:15:47,681 - root - INFO - lr: 8.3889e-06 gnorm: 0.35 [1 day, 19:40:20<1 day, 5:23:20] +[titan] 2025-09-09 13:16:19,629 - root - INFO - step: 23915 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7733 global_avg_top_loss: 1.9630 +[titan] 2025-09-09 13:16:19,629 - root - INFO - lr: 8.3855e-06 gnorm: 0.35 [1 day, 19:40:52<1 day, 5:22:46] +[titan] 2025-09-09 13:16:51,674 - root - INFO - step: 23920 loss: 2.6696 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7477 global_avg_top_loss: 1.9219 +[titan] 2025-09-09 13:16:51,675 - root - INFO - lr: 8.3820e-06 gnorm: 0.35 [1 day, 19:41:24<1 day, 5:22:13] +[titan] 2025-09-09 13:17:23,604 - root - INFO - step: 23925 loss: 2.6664 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9242 +[titan] 2025-09-09 13:17:23,605 - root - INFO - lr: 8.3786e-06 gnorm: 0.36 [1 day, 19:41:56<1 day, 5:21:39] +[titan] 2025-09-09 13:17:55,553 - root - INFO - step: 23930 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7781 
global_avg_top_loss: 1.9611 +[titan] 2025-09-09 13:17:55,554 - root - INFO - lr: 8.3752e-06 gnorm: 0.36 [1 day, 19:42:28<1 day, 5:21:06] +[titan] 2025-09-09 13:18:27,600 - root - INFO - step: 23935 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7804 global_avg_top_loss: 1.9654 +[titan] 2025-09-09 13:18:27,600 - root - INFO - lr: 8.3718e-06 gnorm: 0.44 [1 day, 19:43:00<1 day, 5:20:32] +[titan] 2025-09-09 13:18:59,407 - root - INFO - step: 23940 loss: 2.6329 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7268 global_avg_top_loss: 1.9061 +[titan] 2025-09-09 13:18:59,407 - root - INFO - lr: 8.3684e-06 gnorm: 0.60 [1 day, 19:43:32<1 day, 5:19:59] +[titan] 2025-09-09 13:19:31,349 - root - INFO - step: 23945 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.8344 global_avg_top_loss: 2.0067 +[titan] 2025-09-09 13:19:31,349 - root - INFO - lr: 8.3650e-06 gnorm: 0.36 [1 day, 19:44:04<1 day, 5:19:25] +[titan] 2025-09-09 13:19:56,912 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:20:03,313 - root - INFO - step: 23950 loss: 2.7655 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 13:20:03,314 - root - INFO - lr: 8.3616e-06 gnorm: 0.35 [1 day, 19:44:36<1 day, 5:18:52] +[titan] 2025-09-09 13:20:35,440 - root - INFO - step: 23955 loss: 2.7691 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.11 mfu: 49.15% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9782 +[titan] 2025-09-09 13:20:35,440 - root - INFO - lr: 8.3581e-06 gnorm: 0.35 [1 day, 19:45:08<1 day, 5:18:18] +[titan] 2025-09-09 13:21:07,623 - root - INFO - step: 23960 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.26 mfu: 49.07% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 13:21:07,624 - root - INFO - lr: 8.3547e-06 gnorm: 0.36 [1 day, 19:45:40<1 day, 5:17:45] +[titan] 2025-09-09 13:21:39,702 - root - INFO - step: 23965 loss: 2.7192 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7663 global_avg_top_loss: 1.9529 +[titan] 2025-09-09 13:21:39,703 - root - INFO - lr: 8.3513e-06 gnorm: 0.36 [1 day, 19:46:12<1 day, 5:17:12] +[titan] 2025-09-09 13:22:11,721 - root - INFO - step: 23970 loss: 2.7345 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 13:22:11,722 - root - INFO - lr: 8.3479e-06 gnorm: 0.37 [1 day, 19:46:44<1 day, 5:16:38] +[titan] 2025-09-09 13:22:43,657 - root - INFO - step: 23975 loss: 3.5589 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 1.2390 global_avg_top_loss: 2.3199 +[titan] 2025-09-09 13:22:43,657 - root - INFO - lr: 8.3445e-06 gnorm: 0.36 [1 day, 19:47:16<1 day, 5:16:05] +[titan] 2025-09-09 13:23:15,706 - root - INFO - step: 23980 loss: 2.8295 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.8171 
global_avg_top_loss: 2.0124 +[titan] 2025-09-09 13:23:15,707 - root - INFO - lr: 8.3411e-06 gnorm: 0.37 [1 day, 19:47:48<1 day, 5:15:31] +[titan] 2025-09-09 13:23:47,768 - root - INFO - step: 23985 loss: 2.6171 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.7251 global_avg_top_loss: 1.8921 +[titan] 2025-09-09 13:23:47,768 - root - INFO - lr: 8.3377e-06 gnorm: 0.44 [1 day, 19:48:20<1 day, 5:14:58] +[titan] 2025-09-09 13:24:19,781 - root - INFO - step: 23990 loss: 3.2770 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 1.0733 global_avg_top_loss: 2.2038 +[titan] 2025-09-09 13:24:19,781 - root - INFO - lr: 8.3343e-06 gnorm: 0.38 [1 day, 19:48:52<1 day, 5:14:24] +[titan] 2025-09-09 13:24:52,007 - root - INFO - step: 23995 loss: 2.7645 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.61 mfu: 49.00% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9764 +[titan] 2025-09-09 13:24:52,008 - root - INFO - lr: 8.3309e-06 gnorm: 0.35 [1 day, 19:49:25<1 day, 5:13:51] +[titan] 2025-09-09 13:25:17,571 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:25:23,948 - root - INFO - step: 24000 loss: 2.6465 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 13:25:23,948 - root - INFO - lr: 8.3274e-06 gnorm: 0.35 [1 day, 19:49:57<1 day, 5:13:18] +[titan] 2025-09-09 13:25:56,152 - root - INFO - step: 24005 loss: 2.7422 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9672 +[titan] 2025-09-09 13:25:56,153 - root - INFO - lr: 8.3240e-06 gnorm: 0.48 [1 day, 19:50:29<1 day, 5:12:44] +[titan] 2025-09-09 13:26:27,923 - root - INFO - step: 24010 loss: 2.7806 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.56 mfu: 49.70% global_avg_ntp_loss: 0.7938 global_avg_top_loss: 1.9868 +[titan] 2025-09-09 13:26:27,923 - root - INFO - lr: 8.3206e-06 gnorm: 0.39 [1 day, 19:51:01<1 day, 5:12:11] +[titan] 2025-09-09 13:26:59,995 - root - INFO - step: 24015 loss: 2.9127 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.8678 global_avg_top_loss: 2.0449 +[titan] 2025-09-09 13:26:59,996 - root - INFO - lr: 8.3172e-06 gnorm: 0.40 [1 day, 19:51:33<1 day, 5:11:37] +[titan] 2025-09-09 13:27:31,819 - root - INFO - step: 24020 loss: 2.6760 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.75 mfu: 49.62% global_avg_ntp_loss: 0.7498 global_avg_top_loss: 1.9262 +[titan] 2025-09-09 13:27:31,819 - root - INFO - lr: 8.3138e-06 gnorm: 0.54 [1 day, 19:52:05<1 day, 5:11:04] +[titan] 2025-09-09 13:28:03,919 - root - INFO - step: 24025 loss: 2.6722 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.52 mfu: 49.19% global_avg_ntp_loss: 0.7470 global_avg_top_loss: 1.9252 +[titan] 2025-09-09 13:28:03,919 - root - INFO - lr: 8.3104e-06 gnorm: 0.34 [1 day, 19:52:37<1 day, 5:10:30] +[titan] 2025-09-09 13:28:35,728 - root - INFO - step: 24030 loss: 2.8901 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.8443 
global_avg_top_loss: 2.0458 +[titan] 2025-09-09 13:28:35,728 - root - INFO - lr: 8.3070e-06 gnorm: 0.40 [1 day, 19:53:08<1 day, 5:09:57] +[titan] 2025-09-09 13:29:07,702 - root - INFO - step: 24035 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 13:29:07,702 - root - INFO - lr: 8.3036e-06 gnorm: 0.38 [1 day, 19:53:40<1 day, 5:09:23] +[titan] 2025-09-09 13:29:39,792 - root - INFO - step: 24040 loss: 2.6985 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7585 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 13:29:39,793 - root - INFO - lr: 8.3002e-06 gnorm: 0.40 [1 day, 19:54:12<1 day, 5:08:50] +[titan] 2025-09-09 13:30:11,645 - root - INFO - step: 24045 loss: 3.0818 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.9355 global_avg_top_loss: 2.1463 +[titan] 2025-09-09 13:30:11,646 - root - INFO - lr: 8.2968e-06 gnorm: 0.43 [1 day, 19:54:44<1 day, 5:08:16] +[titan] 2025-09-09 13:30:37,178 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:30:43,693 - root - INFO - step: 24050 loss: 2.7219 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7698 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 13:30:43,693 - root - INFO - lr: 8.2934e-06 gnorm: 0.35 [1 day, 19:55:16<1 day, 5:07:43] +[titan] 2025-09-09 13:31:15,797 - root - INFO - step: 24055 loss: 3.1788 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 1.0313 global_avg_top_loss: 2.1476 +[titan] 2025-09-09 13:31:15,798 - root - INFO - lr: 8.2900e-06 gnorm: 0.37 [1 day, 19:55:49<1 day, 5:07:10] +[titan] 2025-09-09 13:31:47,561 - root - INFO - step: 24060 loss: 2.6929 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9374 +[titan] 2025-09-09 13:31:47,561 - root - INFO - lr: 8.2866e-06 gnorm: 0.38 [1 day, 19:56:20<1 day, 5:06:36] +[titan] 2025-09-09 13:32:13,646 - root - INFO - Dumping profiler traces at step 24064 +[titan] 2025-09-09 13:32:13,703 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 13:32:19,995 - root - INFO - step: 24065 loss: 2.6309 memory: 122.03GiB(87.57%) tps: 10,103 tflops: 481.51 mfu: 48.69% global_avg_ntp_loss: 0.7302 global_avg_top_loss: 1.9008 +[titan] 2025-09-09 13:32:19,995 - root - INFO - lr: 8.2832e-06 gnorm: 0.33 [1 day, 19:56:53<1 day, 5:06:03] +[titan] 2025-09-09 13:32:51,952 - root - INFO - step: 24070 loss: 2.6834 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7503 global_avg_top_loss: 1.9331 +[titan] 2025-09-09 13:32:51,952 - root - INFO - lr: 8.2798e-06 gnorm: 0.35 [1 day, 19:57:25<1 day, 5:05:29] +[titan] 2025-09-09 13:33:23,868 - root - INFO - step: 24075 loss: 2.7221 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9567 +[titan] 2025-09-09 13:33:23,868 - root - INFO - lr: 8.2764e-06 gnorm: 0.35 [1 day, 19:57:57<1 day, 
5:04:56] +[titan] 2025-09-09 13:33:55,860 - root - INFO - step: 24080 loss: 2.7347 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9455 +[titan] 2025-09-09 13:33:55,860 - root - INFO - lr: 8.2730e-06 gnorm: 0.52 [1 day, 19:58:29<1 day, 5:04:22] +[titan] 2025-09-09 13:34:27,919 - root - INFO - step: 24085 loss: 2.7412 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.7809 global_avg_top_loss: 1.9603 +[titan] 2025-09-09 13:34:27,919 - root - INFO - lr: 8.2696e-06 gnorm: 0.36 [1 day, 19:59:01<1 day, 5:03:49] +[titan] 2025-09-09 13:35:00,212 - root - INFO - step: 24090 loss: 2.7386 memory: 122.03GiB(87.57%) tps: 10,147 tflops: 483.61 mfu: 48.90% global_avg_ntp_loss: 0.7762 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 13:35:00,212 - root - INFO - lr: 8.2662e-06 gnorm: 0.35 [1 day, 19:59:33<1 day, 5:03:16] +[titan] 2025-09-09 13:35:32,409 - root - INFO - step: 24095 loss: 2.8068 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.05 mfu: 49.04% global_avg_ntp_loss: 0.8017 global_avg_top_loss: 2.0051 +[titan] 2025-09-09 13:35:32,409 - root - INFO - lr: 8.2627e-06 gnorm: 0.68 [1 day, 20:00:05<1 day, 5:02:42] +[titan] 2025-09-09 13:35:58,028 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:36:04,421 - root - INFO - step: 24100 loss: 2.7521 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 13:36:04,421 - root - INFO - lr: 8.2593e-06 gnorm: 0.78 [1 day, 20:00:37<1 day, 5:02:09] +[titan] 2025-09-09 13:36:36,523 - root - INFO - step: 24105 loss: 2.7440 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9669 +[titan] 2025-09-09 13:36:36,524 - root - INFO - lr: 8.2559e-06 gnorm: 0.35 [1 day, 20:01:09<1 day, 5:01:36] +[titan] 2025-09-09 13:37:08,694 - root - INFO - step: 24110 loss: 2.7829 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.45 mfu: 49.08% global_avg_ntp_loss: 0.7966 global_avg_top_loss: 1.9863 +[titan] 2025-09-09 13:37:08,694 - root - INFO - lr: 8.2525e-06 gnorm: 0.35 [1 day, 20:01:41<1 day, 5:01:02] +[titan] 2025-09-09 13:37:40,675 - root - INFO - step: 24115 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9722 +[titan] 2025-09-09 13:37:40,675 - root - INFO - lr: 8.2491e-06 gnorm: 0.35 [1 day, 20:02:13<1 day, 5:00:29] +[titan] 2025-09-09 13:38:12,695 - root - INFO - step: 24120 loss: 2.7801 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9882 +[titan] 2025-09-09 13:38:12,695 - root - INFO - lr: 8.2458e-06 gnorm: 0.36 [1 day, 20:02:45<1 day, 4:59:55] +[titan] 2025-09-09 13:38:44,627 - root - INFO - step: 24125 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.7799 global_avg_top_loss: 1.9661 +[titan] 2025-09-09 13:38:44,627 - root - INFO - lr: 8.2424e-06 gnorm: 0.35 [1 day, 20:03:17<1 day, 4:59:22] +[titan] 2025-09-09 13:39:16,379 - root - INFO - step: 24130 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.85 mfu: 49.73% global_avg_ntp_loss: 0.7611 
global_avg_top_loss: 1.9450 +[titan] 2025-09-09 13:39:16,379 - root - INFO - lr: 8.2390e-06 gnorm: 0.36 [1 day, 20:03:49<1 day, 4:58:48] +[titan] 2025-09-09 13:39:48,502 - root - INFO - step: 24135 loss: 3.1977 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1590 +[titan] 2025-09-09 13:39:48,502 - root - INFO - lr: 8.2356e-06 gnorm: 0.37 [1 day, 20:04:21<1 day, 4:58:15] +[titan] 2025-09-09 13:40:20,685 - root - INFO - step: 24140 loss: 2.8257 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.25 mfu: 49.06% global_avg_ntp_loss: 0.8179 global_avg_top_loss: 2.0078 +[titan] 2025-09-09 13:40:20,686 - root - INFO - lr: 8.2322e-06 gnorm: 0.37 [1 day, 20:04:53<1 day, 4:57:42] +[titan] 2025-09-09 13:40:52,634 - root - INFO - step: 24145 loss: 2.7141 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9499 +[titan] 2025-09-09 13:40:52,635 - root - INFO - lr: 8.2288e-06 gnorm: 0.37 [1 day, 20:05:25<1 day, 4:57:08] +[titan] 2025-09-09 13:41:18,160 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:41:24,562 - root - INFO - step: 24150 loss: 3.0353 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.9486 global_avg_top_loss: 2.0866 +[titan] 2025-09-09 13:41:24,562 - root - INFO - lr: 8.2254e-06 gnorm: 0.39 [1 day, 20:05:57<1 day, 4:56:35] +[titan] 2025-09-09 13:41:56,895 - root - INFO - step: 24155 loss: 3.0151 memory: 122.03GiB(87.57%) tps: 10,135 tflops: 483.01 mfu: 48.84% global_avg_ntp_loss: 0.9217 global_avg_top_loss: 2.0934 +[titan] 2025-09-09 13:41:56,895 - root - INFO - lr: 8.2220e-06 gnorm: 0.35 [1 day, 20:06:30<1 day, 4:56:01] +[titan] 2025-09-09 13:42:28,888 - root - INFO - step: 24160 loss: 2.7939 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9904 +[titan] 2025-09-09 13:42:28,888 - root - INFO - lr: 8.2186e-06 gnorm: 0.37 [1 day, 20:07:02<1 day, 4:55:28] +[titan] 2025-09-09 13:43:00,975 - root - INFO - step: 24165 loss: 2.6934 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7547 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 13:43:00,976 - root - INFO - lr: 8.2152e-06 gnorm: 0.35 [1 day, 20:07:34<1 day, 4:54:55] +[titan] 2025-09-09 13:43:32,860 - root - INFO - step: 24170 loss: 2.6097 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 0.7178 global_avg_top_loss: 1.8919 +[titan] 2025-09-09 13:43:32,860 - root - INFO - lr: 8.2118e-06 gnorm: 0.35 [1 day, 20:08:06<1 day, 4:54:21] +[titan] 2025-09-09 13:44:04,718 - root - INFO - step: 24175 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.8012 global_avg_top_loss: 1.9913 +[titan] 2025-09-09 13:44:04,718 - root - INFO - lr: 8.2084e-06 gnorm: 0.53 [1 day, 20:08:37<1 day, 4:53:47] +[titan] 2025-09-09 13:44:36,913 - root - INFO - step: 24180 loss: 2.7876 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.7980 
global_avg_top_loss: 1.9896 +[titan] 2025-09-09 13:44:36,913 - root - INFO - lr: 8.2050e-06 gnorm: 0.37 [1 day, 20:09:10<1 day, 4:53:14] +[titan] 2025-09-09 13:45:08,972 - root - INFO - step: 24185 loss: 2.7621 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9720 +[titan] 2025-09-09 13:45:08,972 - root - INFO - lr: 8.2016e-06 gnorm: 0.35 [1 day, 20:09:42<1 day, 4:52:41] +[titan] 2025-09-09 13:45:40,839 - root - INFO - step: 24190 loss: 2.8281 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.8131 global_avg_top_loss: 2.0150 +[titan] 2025-09-09 13:45:40,839 - root - INFO - lr: 8.1982e-06 gnorm: 0.34 [1 day, 20:10:14<1 day, 4:52:07] +[titan] 2025-09-09 13:46:12,871 - root - INFO - step: 24195 loss: 2.7647 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.56 mfu: 49.30% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 13:46:12,871 - root - INFO - lr: 8.1948e-06 gnorm: 0.36 [1 day, 20:10:46<1 day, 4:51:34] +[titan] 2025-09-09 13:46:38,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:46:44,925 - root - INFO - step: 24200 loss: 2.7885 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.21 mfu: 49.26% global_avg_ntp_loss: 0.8007 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 13:46:44,925 - root - INFO - lr: 8.1914e-06 gnorm: 0.42 [1 day, 20:11:18<1 day, 4:51:00] +[titan] 2025-09-09 13:47:16,947 - root - INFO - step: 24205 loss: 2.7038 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9458 +[titan] 2025-09-09 13:47:16,947 - root - INFO - lr: 8.1880e-06 gnorm: 0.46 [1 day, 20:11:50<1 day, 4:50:27] +[titan] 2025-09-09 13:47:49,053 - root - INFO - step: 24210 loss: 2.8731 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.8388 global_avg_top_loss: 2.0343 +[titan] 2025-09-09 13:47:49,054 - root - INFO - lr: 8.1846e-06 gnorm: 0.55 [1 day, 20:12:22<1 day, 4:49:54] +[titan] 2025-09-09 13:48:21,088 - root - INFO - step: 24215 loss: 3.4447 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 1.1791 global_avg_top_loss: 2.2656 +[titan] 2025-09-09 13:48:21,089 - root - INFO - lr: 8.1813e-06 gnorm: 0.41 [1 day, 20:12:54<1 day, 4:49:20] +[titan] 2025-09-09 13:48:52,860 - root - INFO - step: 24220 loss: 2.7942 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.55 mfu: 49.70% global_avg_ntp_loss: 0.8011 global_avg_top_loss: 1.9931 +[titan] 2025-09-09 13:48:52,860 - root - INFO - lr: 8.1779e-06 gnorm: 0.38 [1 day, 20:13:26<1 day, 4:48:47] +[titan] 2025-09-09 13:49:24,901 - root - INFO - step: 24225 loss: 2.7667 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7918 global_avg_top_loss: 1.9749 +[titan] 2025-09-09 13:49:24,901 - root - INFO - lr: 8.1745e-06 gnorm: 0.35 [1 day, 20:13:58<1 day, 4:48:13] +[titan] 2025-09-09 13:49:57,046 - root - INFO - step: 24230 loss: 2.7424 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7777 
global_avg_top_loss: 1.9647 +[titan] 2025-09-09 13:49:57,046 - root - INFO - lr: 8.1711e-06 gnorm: 0.40 [1 day, 20:14:30<1 day, 4:47:40] +[titan] 2025-09-09 13:50:29,068 - root - INFO - step: 24235 loss: 2.7678 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7885 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 13:50:29,068 - root - INFO - lr: 8.1677e-06 gnorm: 0.35 [1 day, 20:15:02<1 day, 4:47:06] +[titan] 2025-09-09 13:51:00,968 - root - INFO - step: 24240 loss: 2.8187 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 0.8246 global_avg_top_loss: 1.9941 +[titan] 2025-09-09 13:51:00,968 - root - INFO - lr: 8.1643e-06 gnorm: 0.41 [1 day, 20:15:34<1 day, 4:46:33] +[titan] 2025-09-09 13:51:33,059 - root - INFO - step: 24245 loss: 2.6846 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7499 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 13:51:33,059 - root - INFO - lr: 8.1609e-06 gnorm: 0.38 [1 day, 20:16:06<1 day, 4:46:00] +[titan] 2025-09-09 13:51:58,559 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:52:04,983 - root - INFO - step: 24250 loss: 2.7423 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.7762 global_avg_top_loss: 1.9661 +[titan] 2025-09-09 13:52:04,983 - root - INFO - lr: 8.1575e-06 gnorm: 0.40 [1 day, 20:16:38<1 day, 4:45:26] +[titan] 2025-09-09 13:52:36,875 - root - INFO - step: 24255 loss: 2.7311 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9550 +[titan] 2025-09-09 13:52:36,875 - root - INFO - lr: 8.1541e-06 gnorm: 0.53 [1 day, 20:17:10<1 day, 4:44:53] +[titan] 2025-09-09 13:53:09,121 - root - INFO - step: 24260 loss: 2.7513 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.7812 global_avg_top_loss: 1.9701 +[titan] 2025-09-09 13:53:09,121 - root - INFO - lr: 8.1508e-06 gnorm: 0.35 [1 day, 20:17:42<1 day, 4:44:19] +[titan] 2025-09-09 13:53:41,069 - root - INFO - step: 24265 loss: 2.5740 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7042 global_avg_top_loss: 1.8698 +[titan] 2025-09-09 13:53:41,070 - root - INFO - lr: 8.1474e-06 gnorm: 0.36 [1 day, 20:18:14<1 day, 4:43:46] +[titan] 2025-09-09 13:54:13,045 - root - INFO - step: 24270 loss: 2.7462 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7769 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 13:54:13,046 - root - INFO - lr: 8.1440e-06 gnorm: 0.36 [1 day, 20:18:46<1 day, 4:43:12] +[titan] 2025-09-09 13:54:44,868 - root - INFO - step: 24275 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.76 mfu: 49.62% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9684 +[titan] 2025-09-09 13:54:44,868 - root - INFO - lr: 8.1406e-06 gnorm: 0.37 [1 day, 20:19:18<1 day, 4:42:39] +[titan] 2025-09-09 13:55:16,882 - root - INFO - step: 24280 loss: 2.7304 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7685 
global_avg_top_loss: 1.9619 +[titan] 2025-09-09 13:55:16,883 - root - INFO - lr: 8.1372e-06 gnorm: 0.35 [1 day, 20:19:50<1 day, 4:42:06] +[titan] 2025-09-09 13:55:48,743 - root - INFO - step: 24285 loss: 2.7590 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9713 +[titan] 2025-09-09 13:55:48,743 - root - INFO - lr: 8.1338e-06 gnorm: 0.35 [1 day, 20:20:21<1 day, 4:41:32] +[titan] 2025-09-09 13:56:20,548 - root - INFO - step: 24290 loss: 2.7280 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 13:56:20,548 - root - INFO - lr: 8.1305e-06 gnorm: 0.46 [1 day, 20:20:53<1 day, 4:40:58] +[titan] 2025-09-09 13:56:52,585 - root - INFO - step: 24295 loss: 3.7614 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 1.3523 global_avg_top_loss: 2.4091 +[titan] 2025-09-09 13:56:52,585 - root - INFO - lr: 8.1271e-06 gnorm: 0.44 [1 day, 20:21:25<1 day, 4:40:25] +[titan] 2025-09-09 13:57:18,112 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:57:24,604 - root - INFO - step: 24300 loss: 2.7631 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 13:57:24,604 - root - INFO - lr: 8.1237e-06 gnorm: 0.35 [1 day, 20:21:57<1 day, 4:39:52] +[titan] 2025-09-09 13:57:56,422 - root - INFO - step: 24305 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.82 mfu: 49.63% global_avg_ntp_loss: 0.7999 global_avg_top_loss: 1.9929 +[titan] 2025-09-09 13:57:56,423 - root - INFO - lr: 8.1203e-06 gnorm: 0.35 [1 day, 20:22:29<1 day, 4:39:18] +[titan] 2025-09-09 13:58:28,384 - root - INFO - step: 24310 loss: 3.2507 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 1.0666 global_avg_top_loss: 2.1841 +[titan] 2025-09-09 13:58:28,384 - root - INFO - lr: 8.1169e-06 gnorm: 0.42 [1 day, 20:23:01<1 day, 4:38:45] +[titan] 2025-09-09 13:59:00,441 - root - INFO - step: 24315 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.17 mfu: 49.26% global_avg_ntp_loss: 0.8062 global_avg_top_loss: 2.0000 +[titan] 2025-09-09 13:59:00,442 - root - INFO - lr: 8.1135e-06 gnorm: 0.35 [1 day, 20:23:33<1 day, 4:38:11] +[titan] 2025-09-09 13:59:32,432 - root - INFO - step: 24320 loss: 2.6860 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9306 +[titan] 2025-09-09 13:59:32,433 - root - INFO - lr: 8.1102e-06 gnorm: 0.35 [1 day, 20:24:05<1 day, 4:37:38] +[titan] 2025-09-09 14:00:04,473 - root - INFO - step: 24325 loss: 2.7479 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7839 global_avg_top_loss: 1.9641 +[titan] 2025-09-09 14:00:04,473 - root - INFO - lr: 8.1068e-06 gnorm: 0.35 [1 day, 20:24:37<1 day, 4:37:04] +[titan] 2025-09-09 14:00:36,320 - root - INFO - step: 24330 loss: 2.7769 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7931 
global_avg_top_loss: 1.9839 +[titan] 2025-09-09 14:00:36,320 - root - INFO - lr: 8.1034e-06 gnorm: 0.59 [1 day, 20:25:09<1 day, 4:36:31] +[titan] 2025-09-09 14:01:08,166 - root - INFO - step: 24335 loss: 2.6969 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.7575 global_avg_top_loss: 1.9394 +[titan] 2025-09-09 14:01:08,166 - root - INFO - lr: 8.1000e-06 gnorm: 0.46 [1 day, 20:25:41<1 day, 4:35:57] +[titan] 2025-09-09 14:01:40,148 - root - INFO - step: 24340 loss: 2.6555 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 0.7437 global_avg_top_loss: 1.9118 +[titan] 2025-09-09 14:01:40,149 - root - INFO - lr: 8.0966e-06 gnorm: 0.35 [1 day, 20:26:13<1 day, 4:35:24] +[titan] 2025-09-09 14:02:12,035 - root - INFO - step: 24345 loss: 3.1250 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 1.0039 global_avg_top_loss: 2.1211 +[titan] 2025-09-09 14:02:12,035 - root - INFO - lr: 8.0933e-06 gnorm: 0.41 [1 day, 20:26:45<1 day, 4:34:51] +[titan] 2025-09-09 14:02:37,511 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:02:43,884 - root - INFO - step: 24350 loss: 2.7159 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.7667 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 14:02:43,884 - root - INFO - lr: 8.0899e-06 gnorm: 0.35 [1 day, 20:27:17<1 day, 4:34:17] +[titan] 2025-09-09 14:03:15,713 - root - INFO - step: 24355 loss: 2.6675 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7449 global_avg_top_loss: 1.9226 +[titan] 2025-09-09 14:03:15,714 - root - INFO - lr: 8.0865e-06 gnorm: 0.35 [1 day, 20:27:48<1 day, 4:33:43] +[titan] 2025-09-09 14:03:47,787 - root - INFO - step: 24360 loss: 2.7716 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9810 +[titan] 2025-09-09 14:03:47,788 - root - INFO - lr: 8.0831e-06 gnorm: 0.37 [1 day, 20:28:20<1 day, 4:33:10] +[titan] 2025-09-09 14:04:19,865 - root - INFO - step: 24365 loss: 2.8802 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.8467 global_avg_top_loss: 2.0336 +[titan] 2025-09-09 14:04:19,866 - root - INFO - lr: 8.0797e-06 gnorm: 0.36 [1 day, 20:28:53<1 day, 4:32:37] +[titan] 2025-09-09 14:04:51,742 - root - INFO - step: 24370 loss: 2.7575 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.7834 global_avg_top_loss: 1.9741 +[titan] 2025-09-09 14:04:51,742 - root - INFO - lr: 8.0764e-06 gnorm: 0.37 [1 day, 20:29:24<1 day, 4:32:03] +[titan] 2025-09-09 14:05:23,957 - root - INFO - step: 24375 loss: 3.2172 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.77 mfu: 49.02% global_avg_ntp_loss: 1.0461 global_avg_top_loss: 2.1712 +[titan] 2025-09-09 14:05:23,958 - root - INFO - lr: 8.0730e-06 gnorm: 0.37 [1 day, 20:29:57<1 day, 4:31:30] +[titan] 2025-09-09 14:05:55,934 - root - INFO - step: 24380 loss: 2.8040 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.8065 
global_avg_top_loss: 1.9975 +[titan] 2025-09-09 14:05:55,934 - root - INFO - lr: 8.0696e-06 gnorm: 0.34 [1 day, 20:30:29<1 day, 4:30:57] +[titan] 2025-09-09 14:06:27,730 - root - INFO - step: 24385 loss: 2.7604 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9748 +[titan] 2025-09-09 14:06:27,730 - root - INFO - lr: 8.0662e-06 gnorm: 0.36 [1 day, 20:31:00<1 day, 4:30:23] +[titan] 2025-09-09 14:06:59,607 - root - INFO - step: 24390 loss: 3.1832 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 1.0331 global_avg_top_loss: 2.1501 +[titan] 2025-09-09 14:06:59,607 - root - INFO - lr: 8.0629e-06 gnorm: 0.37 [1 day, 20:31:32<1 day, 4:29:50] +[titan] 2025-09-09 14:07:31,570 - root - INFO - step: 24395 loss: 3.1253 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.9935 global_avg_top_loss: 2.1319 +[titan] 2025-09-09 14:07:31,570 - root - INFO - lr: 8.0595e-06 gnorm: 0.34 [1 day, 20:32:04<1 day, 4:29:16] +[titan] 2025-09-09 14:07:57,159 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:08:03,581 - root - INFO - step: 24400 loss: 2.7249 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9550 +[titan] 2025-09-09 14:08:03,581 - root - INFO - lr: 8.0561e-06 gnorm: 0.35 [1 day, 20:32:36<1 day, 4:28:43] +[titan] 2025-09-09 14:08:35,429 - root - INFO - step: 24405 loss: 2.7365 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:08:35,429 - root - INFO - lr: 8.0527e-06 gnorm: 0.36 [1 day, 20:33:08<1 day, 4:28:09] +[titan] 2025-09-09 14:09:07,321 - root - INFO - step: 24410 loss: 2.5786 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8746 +[titan] 2025-09-09 14:09:07,322 - root - INFO - lr: 8.0494e-06 gnorm: 0.34 [1 day, 20:33:40<1 day, 4:27:36] +[titan] 2025-09-09 14:09:38,999 - root - INFO - step: 24415 loss: 2.7584 memory: 122.03GiB(87.57%) tps: 10,344 tflops: 493.01 mfu: 49.85% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9708 +[titan] 2025-09-09 14:09:38,999 - root - INFO - lr: 8.0460e-06 gnorm: 0.36 [1 day, 20:34:12<1 day, 4:27:02] +[titan] 2025-09-09 14:10:10,906 - root - INFO - step: 24420 loss: 2.7637 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9729 +[titan] 2025-09-09 14:10:10,907 - root - INFO - lr: 8.0426e-06 gnorm: 0.35 [1 day, 20:34:44<1 day, 4:26:29] +[titan] 2025-09-09 14:10:42,812 - root - INFO - step: 24425 loss: 3.0768 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.9865 global_avg_top_loss: 2.0903 +[titan] 2025-09-09 14:10:42,813 - root - INFO - lr: 8.0393e-06 gnorm: 0.38 [1 day, 20:35:15<1 day, 4:25:55] +[titan] 2025-09-09 14:11:14,959 - root - INFO - step: 24430 loss: 2.6405 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.81 mfu: 49.12% global_avg_ntp_loss: 0.7345 
global_avg_top_loss: 1.9060 +[titan] 2025-09-09 14:11:14,959 - root - INFO - lr: 8.0359e-06 gnorm: 0.37 [1 day, 20:35:48<1 day, 4:25:22] +[titan] 2025-09-09 14:11:46,819 - root - INFO - step: 24435 loss: 2.7144 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7668 global_avg_top_loss: 1.9476 +[titan] 2025-09-09 14:11:46,819 - root - INFO - lr: 8.0325e-06 gnorm: 0.36 [1 day, 20:36:19<1 day, 4:24:48] +[titan] 2025-09-09 14:12:18,694 - root - INFO - step: 24440 loss: 2.7474 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9674 +[titan] 2025-09-09 14:12:18,694 - root - INFO - lr: 8.0291e-06 gnorm: 0.38 [1 day, 20:36:51<1 day, 4:24:15] +[titan] 2025-09-09 14:12:50,660 - root - INFO - step: 24445 loss: 2.7675 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.7923 global_avg_top_loss: 1.9751 +[titan] 2025-09-09 14:12:50,660 - root - INFO - lr: 8.0258e-06 gnorm: 0.35 [1 day, 20:37:23<1 day, 4:23:41] +[titan] 2025-09-09 14:13:16,179 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:13:22,524 - root - INFO - step: 24450 loss: 2.6339 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.7252 global_avg_top_loss: 1.9088 +[titan] 2025-09-09 14:13:22,524 - root - INFO - lr: 8.0224e-06 gnorm: 0.53 [1 day, 20:37:55<1 day, 4:23:08] +[titan] 2025-09-09 14:13:54,340 - root - INFO - step: 24455 loss: 3.2545 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.87 mfu: 49.63% global_avg_ntp_loss: 1.0613 global_avg_top_loss: 2.1931 +[titan] 2025-09-09 14:13:54,340 - root - INFO - lr: 8.0190e-06 gnorm: 0.39 [1 day, 20:38:27<1 day, 4:22:34] +[titan] 2025-09-09 14:14:26,271 - root - INFO - step: 24460 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9539 +[titan] 2025-09-09 14:14:26,271 - root - INFO - lr: 8.0157e-06 gnorm: 0.36 [1 day, 20:38:59<1 day, 4:22:01] +[titan] 2025-09-09 14:14:58,325 - root - INFO - step: 24465 loss: 2.7286 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 14:14:58,325 - root - INFO - lr: 8.0123e-06 gnorm: 0.36 [1 day, 20:39:31<1 day, 4:21:28] +[titan] 2025-09-09 14:15:30,261 - root - INFO - step: 24470 loss: 3.2392 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 1.0598 global_avg_top_loss: 2.1794 +[titan] 2025-09-09 14:15:30,261 - root - INFO - lr: 8.0089e-06 gnorm: 0.39 [1 day, 20:40:03<1 day, 4:20:54] +[titan] 2025-09-09 14:16:02,029 - root - INFO - step: 24475 loss: 2.7670 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.61 mfu: 49.71% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9792 +[titan] 2025-09-09 14:16:02,029 - root - INFO - lr: 8.0056e-06 gnorm: 0.36 [1 day, 20:40:35<1 day, 4:20:21] +[titan] 2025-09-09 14:16:33,793 - root - INFO - step: 24480 loss: 2.7746 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.66 mfu: 49.71% global_avg_ntp_loss: 0.7928 
global_avg_top_loss: 1.9818 +[titan] 2025-09-09 14:16:33,793 - root - INFO - lr: 8.0022e-06 gnorm: 0.35 [1 day, 20:41:06<1 day, 4:19:47] +[titan] 2025-09-09 14:17:05,709 - root - INFO - step: 24485 loss: 2.8442 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.8201 global_avg_top_loss: 2.0241 +[titan] 2025-09-09 14:17:05,709 - root - INFO - lr: 7.9988e-06 gnorm: 0.37 [1 day, 20:41:38<1 day, 4:19:14] +[titan] 2025-09-09 14:17:37,522 - root - INFO - step: 24490 loss: 2.7197 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9532 +[titan] 2025-09-09 14:17:37,523 - root - INFO - lr: 7.9955e-06 gnorm: 0.35 [1 day, 20:42:10<1 day, 4:18:40] +[titan] 2025-09-09 14:18:09,649 - root - INFO - step: 24495 loss: 2.8103 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.10 mfu: 49.15% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 2.0011 +[titan] 2025-09-09 14:18:09,650 - root - INFO - lr: 7.9921e-06 gnorm: 0.34 [1 day, 20:42:42<1 day, 4:18:07] +[titan] 2025-09-09 14:18:35,132 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:18:41,666 - root - INFO - step: 24500 loss: 2.7095 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.79 mfu: 49.32% global_avg_ntp_loss: 0.7635 global_avg_top_loss: 1.9460 +[titan] 2025-09-09 14:18:41,666 - root - INFO - lr: 7.9887e-06 gnorm: 0.35 [1 day, 20:43:14<1 day, 4:17:33] +[titan] 2025-09-09 14:19:13,423 - root - INFO - step: 24505 loss: 3.1674 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.77 mfu: 49.72% global_avg_ntp_loss: 1.0246 global_avg_top_loss: 2.1428 +[titan] 2025-09-09 14:19:13,423 - root - INFO - lr: 7.9854e-06 gnorm: 0.36 [1 day, 20:43:46<1 day, 4:17:00] +[titan] 2025-09-09 14:19:45,316 - root - INFO - step: 24510 loss: 2.7075 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9453 +[titan] 2025-09-09 14:19:45,316 - root - INFO - lr: 7.9820e-06 gnorm: 0.34 [1 day, 20:44:18<1 day, 4:16:26] +[titan] 2025-09-09 14:20:17,116 - root - INFO - step: 24515 loss: 2.6781 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7481 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 14:20:17,117 - root - INFO - lr: 7.9787e-06 gnorm: 0.38 [1 day, 20:44:50<1 day, 4:15:53] +[titan] 2025-09-09 14:20:49,105 - root - INFO - step: 24520 loss: 2.5948 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7119 global_avg_top_loss: 1.8829 +[titan] 2025-09-09 14:20:49,105 - root - INFO - lr: 7.9753e-06 gnorm: 0.41 [1 day, 20:45:22<1 day, 4:15:19] +[titan] 2025-09-09 14:21:21,055 - root - INFO - step: 24525 loss: 2.8212 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.8133 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 14:21:21,055 - root - INFO - lr: 7.9719e-06 gnorm: 0.38 [1 day, 20:45:54<1 day, 4:14:46] +[titan] 2025-09-09 14:21:53,159 - root - INFO - step: 24530 loss: 2.7070 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 0.7627 
global_avg_top_loss: 1.9444 +[titan] 2025-09-09 14:21:53,160 - root - INFO - lr: 7.9686e-06 gnorm: 0.38 [1 day, 20:46:26<1 day, 4:14:13] +[titan] 2025-09-09 14:22:25,241 - root - INFO - step: 24535 loss: 2.7028 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7609 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 14:22:25,241 - root - INFO - lr: 7.9652e-06 gnorm: 0.43 [1 day, 20:46:58<1 day, 4:13:39] +[titan] 2025-09-09 14:22:57,345 - root - INFO - step: 24540 loss: 2.7080 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7649 global_avg_top_loss: 1.9431 +[titan] 2025-09-09 14:22:57,345 - root - INFO - lr: 7.9618e-06 gnorm: 0.34 [1 day, 20:47:30<1 day, 4:13:06] +[titan] 2025-09-09 14:23:29,243 - root - INFO - step: 24545 loss: 2.7602 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 14:23:29,243 - root - INFO - lr: 7.9585e-06 gnorm: 0.36 [1 day, 20:48:02<1 day, 4:12:33] +[titan] 2025-09-09 14:23:54,693 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:24:01,077 - root - INFO - step: 24550 loss: 2.7308 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 14:24:01,077 - root - INFO - lr: 7.9551e-06 gnorm: 0.42 [1 day, 20:48:34<1 day, 4:11:59] +[titan] 2025-09-09 14:24:33,036 - root - INFO - step: 24555 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 14:24:33,037 - root - INFO - lr: 7.9518e-06 gnorm: 0.36 [1 day, 20:49:06<1 day, 4:11:26] +[titan] 2025-09-09 14:25:04,856 - root - INFO - step: 24560 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.80 mfu: 49.63% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 14:25:04,856 - root - INFO - lr: 7.9484e-06 gnorm: 0.38 [1 day, 20:49:38<1 day, 4:10:52] +[titan] 2025-09-09 14:25:36,808 - root - INFO - step: 24565 loss: 2.7147 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.7641 global_avg_top_loss: 1.9505 +[titan] 2025-09-09 14:25:36,808 - root - INFO - lr: 7.9451e-06 gnorm: 0.35 [1 day, 20:50:09<1 day, 4:10:19] +[titan] 2025-09-09 14:26:08,795 - root - INFO - step: 24570 loss: 2.6328 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7274 global_avg_top_loss: 1.9054 +[titan] 2025-09-09 14:26:08,795 - root - INFO - lr: 7.9417e-06 gnorm: 0.37 [1 day, 20:50:41<1 day, 4:09:45] +[titan] 2025-09-09 14:26:40,892 - root - INFO - step: 24575 loss: 2.8019 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.8043 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 14:26:40,892 - root - INFO - lr: 7.9383e-06 gnorm: 0.36 [1 day, 20:51:14<1 day, 4:09:12] +[titan] 2025-09-09 14:26:47,472 - root - INFO - Dumping profiler traces at step 24576 +[titan] 2025-09-09 14:26:47,540 - root - INFO - Finished dumping profiler traces in 
0.07 seconds +[titan] 2025-09-09 14:27:12,818 - root - INFO - step: 24580 loss: 2.7852 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9898 +[titan] 2025-09-09 14:27:12,818 - root - INFO - lr: 7.9350e-06 gnorm: 0.38 [1 day, 20:51:45<1 day, 4:08:39] +[titan] 2025-09-09 14:27:44,783 - root - INFO - step: 24585 loss: 3.1717 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 1.0256 global_avg_top_loss: 2.1461 +[titan] 2025-09-09 14:27:44,783 - root - INFO - lr: 7.9316e-06 gnorm: 0.34 [1 day, 20:52:17<1 day, 4:08:05] +[titan] 2025-09-09 14:28:16,876 - root - INFO - step: 24590 loss: 2.6412 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9068 +[titan] 2025-09-09 14:28:16,876 - root - INFO - lr: 7.9283e-06 gnorm: 0.34 [1 day, 20:52:50<1 day, 4:07:32] +[titan] 2025-09-09 14:28:48,798 - root - INFO - step: 24595 loss: 2.6406 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7305 global_avg_top_loss: 1.9101 +[titan] 2025-09-09 14:28:48,799 - root - INFO - lr: 7.9249e-06 gnorm: 0.36 [1 day, 20:53:21<1 day, 4:06:58] +[titan] 2025-09-09 14:29:14,289 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:29:20,650 - root - INFO - step: 24600 loss: 2.6756 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9271 +[titan] 2025-09-09 14:29:20,650 - root - INFO - lr: 7.9216e-06 gnorm: 0.35 [1 day, 20:53:53<1 day, 4:06:25] +[titan] 2025-09-09 14:29:52,464 - root - INFO - step: 24605 loss: 2.7393 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.63% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 14:29:52,464 - root - INFO - lr: 7.9182e-06 gnorm: 0.36 [1 day, 20:54:25<1 day, 4:05:51] +[titan] 2025-09-09 14:30:24,386 - root - INFO - step: 24610 loss: 2.7336 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 14:30:24,386 - root - INFO - lr: 7.9148e-06 gnorm: 0.36 [1 day, 20:54:57<1 day, 4:05:18] +[titan] 2025-09-09 14:30:56,468 - root - INFO - step: 24615 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 14:30:56,469 - root - INFO - lr: 7.9115e-06 gnorm: 0.35 [1 day, 20:55:29<1 day, 4:04:45] +[titan] 2025-09-09 14:31:28,484 - root - INFO - step: 24620 loss: 2.7094 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.79 mfu: 49.32% global_avg_ntp_loss: 0.7571 global_avg_top_loss: 1.9523 +[titan] 2025-09-09 14:31:28,485 - root - INFO - lr: 7.9081e-06 gnorm: 0.54 [1 day, 20:56:01<1 day, 4:04:11] +[titan] 2025-09-09 14:32:00,387 - root - INFO - step: 24625 loss: 2.6534 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7373 global_avg_top_loss: 1.9160 +[titan] 2025-09-09 14:32:00,388 - root - INFO - lr: 7.9048e-06 gnorm: 0.37 [1 day, 20:56:33<1 day, 4:03:38] +[titan] 2025-09-09 14:32:32,442 - root - INFO - step: 24630 loss: 2.7817 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.21 mfu: 49.26% global_avg_ntp_loss: 0.7958 
global_avg_top_loss: 1.9859 +[titan] 2025-09-09 14:32:32,442 - root - INFO - lr: 7.9014e-06 gnorm: 0.39 [1 day, 20:57:05<1 day, 4:03:04] +[titan] 2025-09-09 14:33:04,238 - root - INFO - step: 24635 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7818 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 14:33:04,238 - root - INFO - lr: 7.8981e-06 gnorm: 0.34 [1 day, 20:57:37<1 day, 4:02:31] +[titan] 2025-09-09 14:33:36,037 - root - INFO - step: 24640 loss: 2.7131 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.12 mfu: 49.66% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9488 +[titan] 2025-09-09 14:33:36,037 - root - INFO - lr: 7.8947e-06 gnorm: 0.35 [1 day, 20:58:09<1 day, 4:01:57] +[titan] 2025-09-09 14:34:07,906 - root - INFO - step: 24645 loss: 2.7299 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9570 +[titan] 2025-09-09 14:34:07,907 - root - INFO - lr: 7.8914e-06 gnorm: 0.34 [1 day, 20:58:41<1 day, 4:01:24] +[titan] 2025-09-09 14:34:33,348 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:34:39,733 - root - INFO - step: 24650 loss: 2.7870 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.70 mfu: 49.62% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9893 +[titan] 2025-09-09 14:34:39,733 - root - INFO - lr: 7.8880e-06 gnorm: 0.36 [1 day, 20:59:12<1 day, 4:00:50] +[titan] 2025-09-09 14:35:11,660 - root - INFO - step: 24655 loss: 2.7815 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.7969 global_avg_top_loss: 1.9845 +[titan] 2025-09-09 14:35:11,660 - root - INFO - lr: 7.8847e-06 gnorm: 0.34 [1 day, 20:59:44<1 day, 4:00:17] +[titan] 2025-09-09 14:35:43,537 - root - INFO - step: 24660 loss: 2.7108 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9439 +[titan] 2025-09-09 14:35:43,537 - root - INFO - lr: 7.8813e-06 gnorm: 0.34 [1 day, 21:00:16<1 day, 3:59:44] +[titan] 2025-09-09 14:36:15,608 - root - INFO - step: 24665 loss: 2.7022 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9389 +[titan] 2025-09-09 14:36:15,608 - root - INFO - lr: 7.8780e-06 gnorm: 0.36 [1 day, 21:00:48<1 day, 3:59:10] +[titan] 2025-09-09 14:36:47,445 - root - INFO - step: 24670 loss: 2.6612 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.54 mfu: 49.60% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9175 +[titan] 2025-09-09 14:36:47,445 - root - INFO - lr: 7.8746e-06 gnorm: 0.33 [1 day, 21:01:20<1 day, 3:58:37] +[titan] 2025-09-09 14:37:19,288 - root - INFO - step: 24675 loss: 2.7244 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9526 +[titan] 2025-09-09 14:37:19,288 - root - INFO - lr: 7.8713e-06 gnorm: 0.37 [1 day, 21:01:52<1 day, 3:58:03] +[titan] 2025-09-09 14:37:51,363 - root - INFO - step: 24680 loss: 2.7588 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7815 
global_avg_top_loss: 1.9773 +[titan] 2025-09-09 14:37:51,363 - root - INFO - lr: 7.8679e-06 gnorm: 0.35 [1 day, 21:02:24<1 day, 3:57:30] +[titan] 2025-09-09 14:38:23,298 - root - INFO - step: 24685 loss: 2.7681 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9790 +[titan] 2025-09-09 14:38:23,298 - root - INFO - lr: 7.8646e-06 gnorm: 0.39 [1 day, 21:02:56<1 day, 3:56:57] +[titan] 2025-09-09 14:38:55,253 - root - INFO - step: 24690 loss: 2.7401 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7722 global_avg_top_loss: 1.9679 +[titan] 2025-09-09 14:38:55,253 - root - INFO - lr: 7.8613e-06 gnorm: 0.37 [1 day, 21:03:28<1 day, 3:56:23] +[titan] 2025-09-09 14:39:26,993 - root - INFO - step: 24695 loss: 2.8241 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.04 mfu: 49.75% global_avg_ntp_loss: 0.8151 global_avg_top_loss: 2.0090 +[titan] 2025-09-09 14:39:26,993 - root - INFO - lr: 7.8579e-06 gnorm: 0.35 [1 day, 21:04:00<1 day, 3:55:50] +[titan] 2025-09-09 14:39:52,704 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:39:59,087 - root - INFO - step: 24700 loss: 2.8121 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7910 global_avg_top_loss: 2.0212 +[titan] 2025-09-09 14:39:59,087 - root - INFO - lr: 7.8546e-06 gnorm: 3.70 [1 day, 21:04:32<1 day, 3:55:16] +[titan] 2025-09-09 14:40:30,950 - root - INFO - step: 24705 loss: 2.7303 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.14 mfu: 49.56% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 14:40:30,950 - root - INFO - lr: 7.8512e-06 gnorm: 0.35 [1 day, 21:05:04<1 day, 3:54:43] +[titan] 2025-09-09 14:41:03,059 - root - INFO - step: 24710 loss: 2.6409 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.38 mfu: 49.18% global_avg_ntp_loss: 0.7367 global_avg_top_loss: 1.9042 +[titan] 2025-09-09 14:41:03,059 - root - INFO - lr: 7.8479e-06 gnorm: 0.35 [1 day, 21:05:36<1 day, 3:54:10] +[titan] 2025-09-09 14:41:34,968 - root - INFO - step: 24715 loss: 2.7260 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 14:41:34,968 - root - INFO - lr: 7.8445e-06 gnorm: 0.35 [1 day, 21:06:08<1 day, 3:53:36] +[titan] 2025-09-09 14:42:06,966 - root - INFO - step: 24720 loss: 2.7557 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7831 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 14:42:06,966 - root - INFO - lr: 7.8412e-06 gnorm: 0.36 [1 day, 21:06:40<1 day, 3:53:03] +[titan] 2025-09-09 14:42:38,908 - root - INFO - step: 24725 loss: 2.6834 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9313 +[titan] 2025-09-09 14:42:38,908 - root - INFO - lr: 7.8378e-06 gnorm: 0.34 [1 day, 21:07:12<1 day, 3:52:29] +[titan] 2025-09-09 14:43:10,814 - root - INFO - step: 24730 loss: 2.7838 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7999 
global_avg_top_loss: 1.9839 +[titan] 2025-09-09 14:43:10,814 - root - INFO - lr: 7.8345e-06 gnorm: 0.35 [1 day, 21:07:43<1 day, 3:51:56] +[titan] 2025-09-09 14:43:42,704 - root - INFO - step: 24735 loss: 2.7348 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9608 +[titan] 2025-09-09 14:43:42,705 - root - INFO - lr: 7.8312e-06 gnorm: 0.35 [1 day, 21:08:15<1 day, 3:51:22] +[titan] 2025-09-09 14:44:14,644 - root - INFO - step: 24740 loss: 2.7169 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9499 +[titan] 2025-09-09 14:44:14,645 - root - INFO - lr: 7.8278e-06 gnorm: 0.34 [1 day, 21:08:47<1 day, 3:50:49] +[titan] 2025-09-09 14:44:46,722 - root - INFO - step: 24745 loss: 2.6963 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 14:44:46,722 - root - INFO - lr: 7.8245e-06 gnorm: 0.36 [1 day, 21:09:19<1 day, 3:50:16] +[titan] 2025-09-09 14:45:12,225 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:45:18,632 - root - INFO - step: 24750 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.7766 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 14:45:18,632 - root - INFO - lr: 7.8211e-06 gnorm: 0.36 [1 day, 21:09:51<1 day, 3:49:42] +[titan] 2025-09-09 14:45:50,412 - root - INFO - step: 24755 loss: 2.6954 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.41 mfu: 49.69% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 14:45:50,412 - root - INFO - lr: 7.8178e-06 gnorm: 0.35 [1 day, 21:10:23<1 day, 3:49:09] +[titan] 2025-09-09 14:46:22,365 - root - INFO - step: 24760 loss: 3.7430 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 1.3471 global_avg_top_loss: 2.3960 +[titan] 2025-09-09 14:46:22,365 - root - INFO - lr: 7.8145e-06 gnorm: 0.39 [1 day, 21:10:55<1 day, 3:48:35] +[titan] 2025-09-09 14:46:54,411 - root - INFO - step: 24765 loss: 2.7549 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7850 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 14:46:54,412 - root - INFO - lr: 7.8111e-06 gnorm: 0.34 [1 day, 21:11:27<1 day, 3:48:02] +[titan] 2025-09-09 14:47:26,371 - root - INFO - step: 24770 loss: 2.8128 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.8189 global_avg_top_loss: 1.9939 +[titan] 2025-09-09 14:47:26,371 - root - INFO - lr: 7.8078e-06 gnorm: 0.34 [1 day, 21:11:59<1 day, 3:47:29] +[titan] 2025-09-09 14:47:58,294 - root - INFO - step: 24775 loss: 2.7395 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:47:58,294 - root - INFO - lr: 7.8044e-06 gnorm: 0.37 [1 day, 21:12:31<1 day, 3:46:55] +[titan] 2025-09-09 14:48:30,389 - root - INFO - step: 24780 loss: 2.7780 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7968 
global_avg_top_loss: 1.9812 +[titan] 2025-09-09 14:48:30,389 - root - INFO - lr: 7.8011e-06 gnorm: 0.35 [1 day, 21:13:03<1 day, 3:46:22] +[titan] 2025-09-09 14:49:02,456 - root - INFO - step: 24785 loss: 2.7093 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.7584 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 14:49:02,456 - root - INFO - lr: 7.7978e-06 gnorm: 0.41 [1 day, 21:13:35<1 day, 3:45:49] +[titan] 2025-09-09 14:49:34,286 - root - INFO - step: 24790 loss: 2.6898 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7607 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 14:49:34,286 - root - INFO - lr: 7.7944e-06 gnorm: 0.34 [1 day, 21:14:07<1 day, 3:45:15] +[titan] 2025-09-09 14:50:06,094 - root - INFO - step: 24795 loss: 3.1757 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 1.0245 global_avg_top_loss: 2.1511 +[titan] 2025-09-09 14:50:06,094 - root - INFO - lr: 7.7911e-06 gnorm: 0.36 [1 day, 21:14:39<1 day, 3:44:42] +[titan] 2025-09-09 14:50:31,674 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:50:38,103 - root - INFO - step: 24800 loss: 2.7427 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 14:50:38,103 - root - INFO - lr: 7.7878e-06 gnorm: 0.36 [1 day, 21:15:11<1 day, 3:44:08] +[titan] 2025-09-09 14:51:10,096 - root - INFO - step: 24805 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 14:51:10,096 - root - INFO - lr: 7.7844e-06 gnorm: 0.37 [1 day, 21:15:43<1 day, 3:43:35] +[titan] 2025-09-09 14:51:42,191 - root - INFO - step: 24810 loss: 2.7922 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9936 +[titan] 2025-09-09 14:51:42,191 - root - INFO - lr: 7.7811e-06 gnorm: 0.36 [1 day, 21:16:15<1 day, 3:43:02] +[titan] 2025-09-09 14:52:14,027 - root - INFO - step: 24815 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.7776 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:52:14,027 - root - INFO - lr: 7.7778e-06 gnorm: 0.34 [1 day, 21:16:47<1 day, 3:42:28] +[titan] 2025-09-09 14:52:45,978 - root - INFO - step: 24820 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.8083 global_avg_top_loss: 1.9964 +[titan] 2025-09-09 14:52:45,978 - root - INFO - lr: 7.7744e-06 gnorm: 0.37 [1 day, 21:17:19<1 day, 3:41:55] +[titan] 2025-09-09 14:53:18,076 - root - INFO - step: 24825 loss: 2.7670 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.20% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 14:53:18,076 - root - INFO - lr: 7.7711e-06 gnorm: 0.35 [1 day, 21:17:51<1 day, 3:41:21] +[titan] 2025-09-09 14:53:50,117 - root - INFO - step: 24830 loss: 2.5914 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7095 
global_avg_top_loss: 1.8820 +[titan] 2025-09-09 14:53:50,118 - root - INFO - lr: 7.7678e-06 gnorm: 0.33 [1 day, 21:18:23<1 day, 3:40:48] +[titan] 2025-09-09 14:54:22,208 - root - INFO - step: 24835 loss: 2.7687 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9811 +[titan] 2025-09-09 14:54:22,208 - root - INFO - lr: 7.7644e-06 gnorm: 0.36 [1 day, 21:18:55<1 day, 3:40:15] +[titan] 2025-09-09 14:54:54,168 - root - INFO - step: 24840 loss: 3.1903 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 1.0353 global_avg_top_loss: 2.1550 +[titan] 2025-09-09 14:54:54,169 - root - INFO - lr: 7.7611e-06 gnorm: 0.36 [1 day, 21:19:27<1 day, 3:39:41] +[titan] 2025-09-09 14:55:26,267 - root - INFO - step: 24845 loss: 2.7710 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.19% global_avg_ntp_loss: 0.7889 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 14:55:26,267 - root - INFO - lr: 7.7578e-06 gnorm: 0.45 [1 day, 21:19:59<1 day, 3:39:08] +[titan] 2025-09-09 14:55:51,734 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:55:58,123 - root - INFO - step: 24850 loss: 2.6850 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 14:55:58,123 - root - INFO - lr: 7.7544e-06 gnorm: 0.38 [1 day, 21:20:31<1 day, 3:38:35] +[titan] 2025-09-09 14:56:30,001 - root - INFO - step: 24855 loss: 2.6466 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.7419 global_avg_top_loss: 1.9047 +[titan] 2025-09-09 14:56:30,001 - root - INFO - lr: 7.7511e-06 gnorm: 0.35 [1 day, 21:21:03<1 day, 3:38:01] +[titan] 2025-09-09 14:57:01,912 - root - INFO - step: 24860 loss: 2.8290 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.8163 global_avg_top_loss: 2.0127 +[titan] 2025-09-09 14:57:01,912 - root - INFO - lr: 7.7478e-06 gnorm: 0.39 [1 day, 21:21:35<1 day, 3:37:28] +[titan] 2025-09-09 14:57:33,747 - root - INFO - step: 24865 loss: 2.7765 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.56 mfu: 49.60% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9806 +[titan] 2025-09-09 14:57:33,748 - root - INFO - lr: 7.7445e-06 gnorm: 0.39 [1 day, 21:22:06<1 day, 3:36:54] +[titan] 2025-09-09 14:58:05,708 - root - INFO - step: 24870 loss: 2.5812 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7076 global_avg_top_loss: 1.8736 +[titan] 2025-09-09 14:58:05,708 - root - INFO - lr: 7.7411e-06 gnorm: 0.38 [1 day, 21:22:38<1 day, 3:36:21] +[titan] 2025-09-09 14:58:37,697 - root - INFO - step: 24875 loss: 2.7751 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7920 global_avg_top_loss: 1.9831 +[titan] 2025-09-09 14:58:37,697 - root - INFO - lr: 7.7378e-06 gnorm: 0.35 [1 day, 21:23:10<1 day, 3:35:48] +[titan] 2025-09-09 14:59:09,602 - root - INFO - step: 24880 loss: 2.8366 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.8213 
global_avg_top_loss: 2.0153 +[titan] 2025-09-09 14:59:09,602 - root - INFO - lr: 7.7345e-06 gnorm: 0.37 [1 day, 21:23:42<1 day, 3:35:14] +[titan] 2025-09-09 14:59:41,690 - root - INFO - step: 24885 loss: 2.7641 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9783 +[titan] 2025-09-09 14:59:41,690 - root - INFO - lr: 7.7312e-06 gnorm: 0.36 [1 day, 21:24:14<1 day, 3:34:41] +[titan] 2025-09-09 15:00:13,594 - root - INFO - step: 24890 loss: 2.7429 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.49% global_avg_ntp_loss: 0.7790 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 15:00:13,594 - root - INFO - lr: 7.7278e-06 gnorm: 0.36 [1 day, 21:24:46<1 day, 3:34:08] +[titan] 2025-09-09 15:00:45,716 - root - INFO - step: 24895 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.19 mfu: 49.16% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 15:00:45,716 - root - INFO - lr: 7.7245e-06 gnorm: 0.36 [1 day, 21:25:18<1 day, 3:33:34] +[titan] 2025-09-09 15:01:11,130 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:01:17,532 - root - INFO - step: 24900 loss: 3.1743 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 1.0251 global_avg_top_loss: 2.1492 +[titan] 2025-09-09 15:01:17,532 - root - INFO - lr: 7.7212e-06 gnorm: 0.51 [1 day, 21:25:50<1 day, 3:33:01] +[titan] 2025-09-09 15:01:49,285 - root - INFO - step: 24905 loss: 2.7488 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.84 mfu: 49.73% global_avg_ntp_loss: 0.7819 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 15:01:49,285 - root - INFO - lr: 7.7178e-06 gnorm: 0.36 [1 day, 21:26:22<1 day, 3:32:27] +[titan] 2025-09-09 15:02:21,257 - root - INFO - step: 24910 loss: 2.7033 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.7582 global_avg_top_loss: 1.9451 +[titan] 2025-09-09 15:02:21,257 - root - INFO - lr: 7.7145e-06 gnorm: 0.36 [1 day, 21:26:54<1 day, 3:31:54] +[titan] 2025-09-09 15:02:53,178 - root - INFO - step: 24915 loss: 2.7503 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7838 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 15:02:53,178 - root - INFO - lr: 7.7112e-06 gnorm: 0.35 [1 day, 21:27:26<1 day, 3:31:21] +[titan] 2025-09-09 15:03:24,949 - root - INFO - step: 24920 loss: 3.1861 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.56 mfu: 49.70% global_avg_ntp_loss: 1.0351 global_avg_top_loss: 2.1510 +[titan] 2025-09-09 15:03:24,949 - root - INFO - lr: 7.7079e-06 gnorm: 0.36 [1 day, 21:27:58<1 day, 3:30:47] +[titan] 2025-09-09 15:03:56,819 - root - INFO - step: 24925 loss: 2.7772 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9817 +[titan] 2025-09-09 15:03:56,819 - root - INFO - lr: 7.7046e-06 gnorm: 0.37 [1 day, 21:28:29<1 day, 3:30:14] +[titan] 2025-09-09 15:04:28,916 - root - INFO - step: 24930 loss: 2.7256 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.7724 
global_avg_top_loss: 1.9532 +[titan] 2025-09-09 15:04:28,916 - root - INFO - lr: 7.7012e-06 gnorm: 0.36 [1 day, 21:29:02<1 day, 3:29:40] +[titan] 2025-09-09 15:05:00,579 - root - INFO - step: 24935 loss: 2.6950 memory: 122.03GiB(87.57%) tps: 10,349 tflops: 493.23 mfu: 49.87% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 15:05:00,579 - root - INFO - lr: 7.6979e-06 gnorm: 0.36 [1 day, 21:29:33<1 day, 3:29:07] +[titan] 2025-09-09 15:05:32,534 - root - INFO - step: 24940 loss: 2.6885 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 15:05:32,534 - root - INFO - lr: 7.6946e-06 gnorm: 0.34 [1 day, 21:30:05<1 day, 3:28:33] +[titan] 2025-09-09 15:06:04,529 - root - INFO - step: 24945 loss: 2.6884 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9305 +[titan] 2025-09-09 15:06:04,529 - root - INFO - lr: 7.6913e-06 gnorm: 0.36 [1 day, 21:30:37<1 day, 3:28:00] +[titan] 2025-09-09 15:06:30,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:06:36,469 - root - INFO - step: 24950 loss: 2.7473 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7793 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 15:06:36,469 - root - INFO - lr: 7.6880e-06 gnorm: 0.34 [1 day, 21:31:09<1 day, 3:27:27] +[titan] 2025-09-09 15:07:08,506 - root - INFO - step: 24955 loss: 2.9943 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.9099 global_avg_top_loss: 2.0844 +[titan] 2025-09-09 15:07:08,507 - root - INFO - lr: 7.6846e-06 gnorm: 0.35 [1 day, 21:31:41<1 day, 3:26:53] +[titan] 2025-09-09 15:07:40,317 - root - INFO - step: 24960 loss: 2.7376 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.94 mfu: 49.64% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 15:07:40,317 - root - INFO - lr: 7.6813e-06 gnorm: 0.35 [1 day, 21:32:13<1 day, 3:26:20] +[titan] 2025-09-09 15:08:12,212 - root - INFO - step: 24965 loss: 2.7561 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 15:08:12,213 - root - INFO - lr: 7.6780e-06 gnorm: 0.35 [1 day, 21:32:45<1 day, 3:25:46] +[titan] 2025-09-09 15:08:44,173 - root - INFO - step: 24970 loss: 3.2382 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 1.0568 global_avg_top_loss: 2.1814 +[titan] 2025-09-09 15:08:44,173 - root - INFO - lr: 7.6747e-06 gnorm: 0.36 [1 day, 21:33:17<1 day, 3:25:13] +[titan] 2025-09-09 15:09:16,220 - root - INFO - step: 24975 loss: 2.7016 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7614 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 15:09:16,220 - root - INFO - lr: 7.6714e-06 gnorm: 0.35 [1 day, 21:33:49<1 day, 3:24:40] +[titan] 2025-09-09 15:09:48,044 - root - INFO - step: 24980 loss: 2.8174 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.73 mfu: 49.62% global_avg_ntp_loss: 0.8100 
global_avg_top_loss: 2.0074 +[titan] 2025-09-09 15:09:48,045 - root - INFO - lr: 7.6680e-06 gnorm: 0.36 [1 day, 21:34:21<1 day, 3:24:06] +[titan] 2025-09-09 15:10:19,921 - root - INFO - step: 24985 loss: 2.7007 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7611 global_avg_top_loss: 1.9396 +[titan] 2025-09-09 15:10:19,922 - root - INFO - lr: 7.6647e-06 gnorm: 0.38 [1 day, 21:34:53<1 day, 3:23:33] +[titan] 2025-09-09 15:10:51,842 - root - INFO - step: 24990 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9525 +[titan] 2025-09-09 15:10:51,843 - root - INFO - lr: 7.6614e-06 gnorm: 0.34 [1 day, 21:35:24<1 day, 3:23:00] +[titan] 2025-09-09 15:11:23,640 - root - INFO - step: 24995 loss: 2.7406 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.14 mfu: 49.66% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 15:11:23,641 - root - INFO - lr: 7.6581e-06 gnorm: 0.35 [1 day, 21:35:56<1 day, 3:22:26] +[titan] 2025-09-09 15:11:49,082 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:11:55,514 - root - INFO - step: 25000 loss: 3.2441 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 1.0620 global_avg_top_loss: 2.1820 +[titan] 2025-09-09 15:11:55,515 - root - INFO - lr: 7.6548e-06 gnorm: 0.34 [1 day, 21:36:28<1 day, 3:21:53] +[titan] 2025-09-09 15:11:55,515 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-09 15:12:27,731 - root - INFO - [GC] GC collection invoked by checkpointer. 0.02 seconds. +[titan] 2025-09-09 15:12:27,731 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 32.22 seconds. 
+[titan] 2025-09-09 15:30:51,806 - root - INFO - step: 25005 loss: 2.6619 memory: 122.03GiB(87.57%) tps: 288 tflops: 13.74 mfu: 1.39% global_avg_ntp_loss: 0.7419 global_avg_top_loss: 1.9200 +[titan] 2025-09-09 15:30:51,806 - root - INFO - lr: 7.6515e-06 gnorm: 0.34 [1 day, 21:55:24<1 day, 3:32:22] +[titan] 2025-09-09 15:31:21,790 - root - INFO - step: 25010 loss: 2.6994 memory: 122.03GiB(87.57%) tps: 10,928 tflops: 520.84 mfu: 52.66% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 15:31:21,791 - root - INFO - lr: 7.6482e-06 gnorm: 0.34 [1 day, 21:55:54<1 day, 3:31:47] +[titan] 2025-09-09 15:31:51,816 - root - INFO - step: 25015 loss: 2.7680 memory: 122.03GiB(87.57%) tps: 10,914 tflops: 520.13 mfu: 52.59% global_avg_ntp_loss: 0.7980 global_avg_top_loss: 1.9700 +[titan] 2025-09-09 15:31:51,816 - root - INFO - lr: 7.6448e-06 gnorm: 0.39 [1 day, 21:56:24<1 day, 3:31:12] +[titan] 2025-09-09 15:32:22,100 - root - INFO - step: 25020 loss: 2.8093 memory: 122.03GiB(87.57%) tps: 10,820 tflops: 515.69 mfu: 52.14% global_avg_ntp_loss: 0.8147 global_avg_top_loss: 1.9946 +[titan] 2025-09-09 15:32:22,100 - root - INFO - lr: 7.6415e-06 gnorm: 0.35 [1 day, 21:56:55<1 day, 3:30:37] +[titan] 2025-09-09 15:32:52,563 - root - INFO - step: 25025 loss: 2.7062 memory: 122.03GiB(87.57%) tps: 10,757 tflops: 512.66 mfu: 51.84% global_avg_ntp_loss: 0.7619 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 15:32:52,563 - root - INFO - lr: 7.6382e-06 gnorm: 0.34 [1 day, 21:57:25<1 day, 3:30:02] +[titan] 2025-09-09 15:33:23,013 - root - INFO - step: 25030 loss: 2.7342 memory: 122.03GiB(87.57%) tps: 10,761 tflops: 512.88 mfu: 51.86% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 15:33:23,013 - root - INFO - lr: 7.6349e-06 gnorm: 0.35 [1 day, 21:57:56<1 day, 3:29:28] +[titan] 2025-09-09 15:33:53,496 - root - INFO - step: 25035 loss: 2.7302 memory: 122.03GiB(87.57%) tps: 10,750 tflops: 512.33 mfu: 51.80% global_avg_ntp_loss: 0.7741 
global_avg_top_loss: 1.9561 +[titan] 2025-09-09 15:33:53,496 - root - INFO - lr: 7.6316e-06 gnorm: 0.36 [1 day, 21:58:26<1 day, 3:28:53] +[titan] 2025-09-09 15:34:24,349 - root - INFO - step: 25040 loss: 2.6863 memory: 122.03GiB(87.57%) tps: 10,621 tflops: 506.18 mfu: 51.18% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9313 +[titan] 2025-09-09 15:34:24,349 - root - INFO - lr: 7.6283e-06 gnorm: 0.35 [1 day, 21:58:57<1 day, 3:28:19] +[titan] 2025-09-09 15:34:55,417 - root - INFO - step: 25045 loss: 2.7444 memory: 122.03GiB(87.57%) tps: 10,547 tflops: 502.68 mfu: 50.83% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 15:34:55,417 - root - INFO - lr: 7.6250e-06 gnorm: 0.34 [1 day, 21:59:28<1 day, 3:27:45] +[titan] 2025-09-09 15:35:20,334 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:35:26,657 - root - INFO - step: 25050 loss: 3.2923 memory: 122.03GiB(87.57%) tps: 10,489 tflops: 499.91 mfu: 50.55% global_avg_ntp_loss: 1.0811 global_avg_top_loss: 2.2112 +[titan] 2025-09-09 15:35:26,657 - root - INFO - lr: 7.6217e-06 gnorm: 0.37 [1 day, 21:59:59<1 day, 3:27:10] +[titan] 2025-09-09 15:35:58,018 - root - INFO - step: 25055 loss: 2.7621 memory: 122.03GiB(87.57%) tps: 10,449 tflops: 497.98 mfu: 50.35% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 15:35:58,018 - root - INFO - lr: 7.6184e-06 gnorm: 0.34 [1 day, 22:00:31<1 day, 3:26:36] +[titan] 2025-09-09 15:36:29,514 - root - INFO - step: 25060 loss: 2.6972 memory: 122.03GiB(87.57%) tps: 10,404 tflops: 495.84 mfu: 50.14% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9317 +[titan] 2025-09-09 15:36:29,514 - root - INFO - lr: 7.6151e-06 gnorm: 0.36 [1 day, 22:01:02<1 day, 3:26:02] +[titan] 2025-09-09 15:37:01,088 - root - INFO - step: 25065 loss: 2.7690 memory: 122.03GiB(87.57%) tps: 10,378 tflops: 494.63 mfu: 50.01% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9790 +[titan] 2025-09-09 15:37:01,088 - root - 
INFO - lr: 7.6117e-06 gnorm: 0.36 [1 day, 22:01:34<1 day, 3:25:29] +[titan] 2025-09-09 15:37:32,800 - root - INFO - step: 25070 loss: 2.6993 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.47 mfu: 49.79% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9415 +[titan] 2025-09-09 15:37:32,800 - root - INFO - lr: 7.6084e-06 gnorm: 0.34 [1 day, 22:02:05<1 day, 3:24:55] +[titan] 2025-09-09 15:38:04,486 - root - INFO - step: 25075 loss: 2.7475 memory: 122.03GiB(87.57%) tps: 10,342 tflops: 492.88 mfu: 49.84% global_avg_ntp_loss: 0.7803 global_avg_top_loss: 1.9672 +[titan] 2025-09-09 15:38:04,486 - root - INFO - lr: 7.6051e-06 gnorm: 0.36 [1 day, 22:02:37<1 day, 3:24:21] +[titan] 2025-09-09 15:38:36,006 - root - INFO - step: 25080 loss: 3.2259 memory: 122.03GiB(87.57%) tps: 10,396 tflops: 495.46 mfu: 50.10% global_avg_ntp_loss: 1.0516 global_avg_top_loss: 2.1744 +[titan] 2025-09-09 15:38:36,006 - root - INFO - lr: 7.6018e-06 gnorm: 0.37 [1 day, 22:03:09<1 day, 3:23:47] +[titan] 2025-09-09 15:39:07,903 - root - INFO - step: 25085 loss: 2.8085 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 1.9908 +[titan] 2025-09-09 15:39:07,903 - root - INFO - lr: 7.5985e-06 gnorm: 0.36 [1 day, 22:03:40<1 day, 3:23:13] +[titan] 2025-09-09 15:39:27,229 - root - INFO - Dumping profiler traces at step 25088 +[titan] 2025-09-09 15:39:27,286 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 15:39:39,964 - root - INFO - step: 25090 loss: 2.7552 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7837 global_avg_top_loss: 1.9715 +[titan] 2025-09-09 15:39:39,964 - root - INFO - lr: 7.5952e-06 gnorm: 0.35 [1 day, 22:04:13<1 day, 3:22:39] +[titan] 2025-09-09 15:40:11,633 - root - INFO - step: 25095 loss: 2.6368 memory: 122.03GiB(87.57%) tps: 10,347 tflops: 493.13 mfu: 49.86% global_avg_ntp_loss: 0.7343 global_avg_top_loss: 1.9025 +[titan] 2025-09-09 15:40:11,634 
- root - INFO - lr: 7.5919e-06 gnorm: 0.35 [1 day, 22:04:44<1 day, 3:22:06] +[titan] 2025-09-09 15:40:37,140 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:40:43,505 - root - INFO - step: 25100 loss: 2.7106 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7623 global_avg_top_loss: 1.9483 +[titan] 2025-09-09 15:40:43,505 - root - INFO - lr: 7.5886e-06 gnorm: 0.36 [1 day, 22:05:16<1 day, 3:21:32] +[titan] 2025-09-09 15:41:15,403 - root - INFO - step: 25105 loss: 2.8491 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.8388 global_avg_top_loss: 2.0103 +[titan] 2025-09-09 15:41:15,403 - root - INFO - lr: 7.5853e-06 gnorm: 0.36 [1 day, 22:05:48<1 day, 3:20:58] +[titan] 2025-09-09 15:41:47,216 - root - INFO - step: 25110 loss: 3.1185 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.9999 global_avg_top_loss: 2.1186 +[titan] 2025-09-09 15:41:47,217 - root - INFO - lr: 7.5820e-06 gnorm: 0.36 [1 day, 22:06:20<1 day, 3:20:24] +[titan] 2025-09-09 15:42:19,236 - root - INFO - step: 25115 loss: 2.7550 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7815 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 15:42:19,236 - root - INFO - lr: 7.5787e-06 gnorm: 0.37 [1 day, 22:06:52<1 day, 3:19:51] +[titan] 2025-09-09 15:42:51,091 - root - INFO - step: 25120 loss: 2.6678 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.26 mfu: 49.57% global_avg_ntp_loss: 0.7426 global_avg_top_loss: 1.9252 +[titan] 2025-09-09 15:42:51,092 - root - INFO - lr: 7.5754e-06 gnorm: 0.35 [1 day, 22:07:24<1 day, 3:19:17] +[titan] 2025-09-09 15:43:22,872 - root - INFO - step: 25125 loss: 2.7577 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.40 mfu: 49.69% global_avg_ntp_loss: 0.7896 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 15:43:22,873 - root - INFO - lr: 7.5721e-06 gnorm: 0.35 [1 day, 22:07:55<1 day, 
3:18:43] +[titan] 2025-09-09 15:43:55,009 - root - INFO - step: 25130 loss: 3.8224 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 1.3779 global_avg_top_loss: 2.4445 +[titan] 2025-09-09 15:43:55,009 - root - INFO - lr: 7.5688e-06 gnorm: 0.37 [1 day, 22:08:28<1 day, 3:18:09] +[titan] 2025-09-09 15:44:26,867 - root - INFO - step: 25135 loss: 2.7911 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.20 mfu: 49.57% global_avg_ntp_loss: 0.8004 global_avg_top_loss: 1.9907 +[titan] 2025-09-09 15:44:26,868 - root - INFO - lr: 7.5655e-06 gnorm: 0.36 [1 day, 22:08:59<1 day, 3:17:36] +[titan] 2025-09-09 15:44:58,746 - root - INFO - step: 25140 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.54% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 15:44:58,746 - root - INFO - lr: 7.5622e-06 gnorm: 0.37 [1 day, 22:09:31<1 day, 3:17:02] +[titan] 2025-09-09 15:45:30,719 - root - INFO - step: 25145 loss: 2.6947 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7596 global_avg_top_loss: 1.9351 +[titan] 2025-09-09 15:45:30,719 - root - INFO - lr: 7.5589e-06 gnorm: 0.39 [1 day, 22:10:03<1 day, 3:16:28] +[titan] 2025-09-09 15:45:56,230 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:46:02,680 - root - INFO - step: 25150 loss: 2.7373 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7793 global_avg_top_loss: 1.9580 +[titan] 2025-09-09 15:46:02,680 - root - INFO - lr: 7.5556e-06 gnorm: 0.37 [1 day, 22:10:35<1 day, 3:15:55] +[titan] 2025-09-09 15:46:34,514 - root - INFO - step: 25155 loss: 2.8187 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.8108 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 15:46:34,515 - root - INFO - lr: 7.5523e-06 gnorm: 0.36 [1 day, 22:11:07<1 day, 3:15:21] +[titan] 2025-09-09 15:47:06,565 - root - INFO - step: 25160 loss: 3.1470 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 1.0122 global_avg_top_loss: 2.1347 +[titan] 2025-09-09 15:47:06,566 - root - INFO - lr: 7.5490e-06 gnorm: 0.53 [1 day, 22:11:39<1 day, 3:14:47] +[titan] 2025-09-09 15:47:38,505 - root - INFO - step: 25165 loss: 2.7516 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9719 +[titan] 2025-09-09 15:47:38,505 - root - INFO - lr: 7.5457e-06 gnorm: 0.34 [1 day, 22:12:11<1 day, 3:14:14] +[titan] 2025-09-09 15:48:10,738 - root - INFO - step: 25170 loss: 2.6569 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.7375 global_avg_top_loss: 1.9194 +[titan] 2025-09-09 15:48:10,738 - root - INFO - lr: 7.5424e-06 gnorm: 0.39 [1 day, 22:12:43<1 day, 3:13:40] +[titan] 2025-09-09 15:48:43,103 - root - INFO - step: 25175 loss: 2.7905 memory: 122.03GiB(87.57%) tps: 10,125 tflops: 482.53 mfu: 48.79% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 15:48:43,103 - root - INFO - lr: 7.5391e-06 gnorm: 0.38 [1 day, 22:13:16<1 day, 3:13:07] +[titan] 2025-09-09 15:49:15,524 - root - INFO - step: 25180 loss: 2.6982 memory: 122.03GiB(87.57%) tps: 10,107 tflops: 481.70 mfu: 48.71% global_avg_ntp_loss: 0.7601 
global_avg_top_loss: 1.9381 +[titan] 2025-09-09 15:49:15,524 - root - INFO - lr: 7.5358e-06 gnorm: 0.35 [1 day, 22:13:48<1 day, 3:12:33] +[titan] 2025-09-09 15:49:47,604 - root - INFO - step: 25185 loss: 2.7802 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.82 mfu: 49.22% global_avg_ntp_loss: 0.7961 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 15:49:47,604 - root - INFO - lr: 7.5325e-06 gnorm: 0.35 [1 day, 22:14:20<1 day, 3:11:59] +[titan] 2025-09-09 15:50:19,580 - root - INFO - step: 25190 loss: 2.6150 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7175 global_avg_top_loss: 1.8975 +[titan] 2025-09-09 15:50:19,580 - root - INFO - lr: 7.5292e-06 gnorm: 0.35 [1 day, 22:14:52<1 day, 3:11:26] +[titan] 2025-09-09 15:50:51,496 - root - INFO - step: 25195 loss: 2.7843 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9866 +[titan] 2025-09-09 15:50:51,496 - root - INFO - lr: 7.5259e-06 gnorm: 0.36 [1 day, 22:15:24<1 day, 3:10:52] +[titan] 2025-09-09 15:51:17,077 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:51:23,453 - root - INFO - step: 25200 loss: 2.8446 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.8239 global_avg_top_loss: 2.0207 +[titan] 2025-09-09 15:51:23,453 - root - INFO - lr: 7.5226e-06 gnorm: 0.35 [1 day, 22:15:56<1 day, 3:10:18] +[titan] 2025-09-09 15:51:55,521 - root - INFO - step: 25205 loss: 2.7168 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9514 +[titan] 2025-09-09 15:51:55,522 - root - INFO - lr: 7.5194e-06 gnorm: 0.34 [1 day, 22:16:28<1 day, 3:09:45] +[titan] 2025-09-09 15:52:27,545 - root - INFO - step: 25210 loss: 3.2243 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 1.0547 global_avg_top_loss: 2.1697 +[titan] 2025-09-09 15:52:27,545 - root - INFO - lr: 7.5161e-06 gnorm: 0.36 [1 day, 22:17:00<1 day, 3:09:11] +[titan] 2025-09-09 15:52:59,589 - root - INFO - step: 25215 loss: 2.7330 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9592 +[titan] 2025-09-09 15:52:59,589 - root - INFO - lr: 7.5128e-06 gnorm: 0.35 [1 day, 22:17:32<1 day, 3:08:38] +[titan] 2025-09-09 15:53:31,498 - root - INFO - step: 25220 loss: 2.7844 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 15:53:31,498 - root - INFO - lr: 7.5095e-06 gnorm: 0.36 [1 day, 22:18:04<1 day, 3:08:04] +[titan] 2025-09-09 15:54:03,680 - root - INFO - step: 25225 loss: 3.2497 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 1.0622 global_avg_top_loss: 2.1876 +[titan] 2025-09-09 15:54:03,681 - root - INFO - lr: 7.5062e-06 gnorm: 0.38 [1 day, 22:18:36<1 day, 3:07:30] +[titan] 2025-09-09 15:54:35,599 - root - INFO - step: 25230 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7677 
global_avg_top_loss: 1.9513 +[titan] 2025-09-09 15:54:35,599 - root - INFO - lr: 7.5029e-06 gnorm: 0.34 [1 day, 22:19:08<1 day, 3:06:57] +[titan] 2025-09-09 15:55:07,717 - root - INFO - step: 25235 loss: 2.7784 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9836 +[titan] 2025-09-09 15:55:07,717 - root - INFO - lr: 7.4996e-06 gnorm: 0.36 [1 day, 22:19:40<1 day, 3:06:23] +[titan] 2025-09-09 15:55:39,841 - root - INFO - step: 25240 loss: 2.7457 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9665 +[titan] 2025-09-09 15:55:39,841 - root - INFO - lr: 7.4963e-06 gnorm: 0.37 [1 day, 22:20:12<1 day, 3:05:49] +[titan] 2025-09-09 15:56:11,940 - root - INFO - step: 25245 loss: 2.9717 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.9030 global_avg_top_loss: 2.0687 +[titan] 2025-09-09 15:56:11,940 - root - INFO - lr: 7.4930e-06 gnorm: 0.37 [1 day, 22:20:45<1 day, 3:05:16] +[titan] 2025-09-09 15:56:37,250 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:56:43,757 - root - INFO - step: 25250 loss: 2.7741 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9786 +[titan] 2025-09-09 15:56:43,757 - root - INFO - lr: 7.4897e-06 gnorm: 0.35 [1 day, 22:21:16<1 day, 3:04:42] +[titan] 2025-09-09 15:57:15,957 - root - INFO - step: 25255 loss: 2.6937 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 485.00 mfu: 49.04% global_avg_ntp_loss: 0.7581 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 15:57:15,958 - root - INFO - lr: 7.4865e-06 gnorm: 0.35 [1 day, 22:21:49<1 day, 3:04:08] +[titan] 2025-09-09 15:57:48,186 - root - INFO - step: 25260 loss: 2.7593 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9620 +[titan] 2025-09-09 15:57:48,186 - root - INFO - lr: 7.4832e-06 gnorm: 0.35 [1 day, 22:22:21<1 day, 3:03:35] +[titan] 2025-09-09 15:58:20,115 - root - INFO - step: 25265 loss: 2.7804 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9844 +[titan] 2025-09-09 15:58:20,115 - root - INFO - lr: 7.4799e-06 gnorm: 0.35 [1 day, 22:22:53<1 day, 3:03:01] +[titan] 2025-09-09 15:58:52,148 - root - INFO - step: 25270 loss: 2.6477 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.7338 global_avg_top_loss: 1.9139 +[titan] 2025-09-09 15:58:52,148 - root - INFO - lr: 7.4766e-06 gnorm: 0.37 [1 day, 22:23:25<1 day, 3:02:28] +[titan] 2025-09-09 15:59:24,265 - root - INFO - step: 25275 loss: 3.2774 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 1.0742 global_avg_top_loss: 2.2032 +[titan] 2025-09-09 15:59:24,266 - root - INFO - lr: 7.4733e-06 gnorm: 0.35 [1 day, 22:23:57<1 day, 3:01:54] +[titan] 2025-09-09 15:59:56,183 - root - INFO - step: 25280 loss: 2.6913 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.47% global_avg_ntp_loss: 0.7585 
global_avg_top_loss: 1.9327 +[titan] 2025-09-09 15:59:56,183 - root - INFO - lr: 7.4700e-06 gnorm: 0.34 [1 day, 22:24:29<1 day, 3:01:20] +[titan] 2025-09-09 16:00:28,156 - root - INFO - step: 25285 loss: 2.7369 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 16:00:28,156 - root - INFO - lr: 7.4668e-06 gnorm: 0.36 [1 day, 22:25:01<1 day, 3:00:47] +[titan] 2025-09-09 16:01:00,271 - root - INFO - step: 25290 loss: 3.2298 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.29 mfu: 49.17% global_avg_ntp_loss: 1.0548 global_avg_top_loss: 2.1750 +[titan] 2025-09-09 16:01:00,271 - root - INFO - lr: 7.4635e-06 gnorm: 0.38 [1 day, 22:25:33<1 day, 3:00:13] +[titan] 2025-09-09 16:01:32,358 - root - INFO - step: 25295 loss: 2.6449 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9141 +[titan] 2025-09-09 16:01:32,358 - root - INFO - lr: 7.4602e-06 gnorm: 0.36 [1 day, 22:26:05<1 day, 2:59:39] +[titan] 2025-09-09 16:01:58,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:02:04,545 - root - INFO - step: 25300 loss: 2.7873 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.19 mfu: 49.06% global_avg_ntp_loss: 0.7996 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 16:02:04,545 - root - INFO - lr: 7.4569e-06 gnorm: 0.37 [1 day, 22:26:37<1 day, 2:59:06] +[titan] 2025-09-09 16:02:36,423 - root - INFO - step: 25305 loss: 3.1935 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 1.0325 global_avg_top_loss: 2.1610 +[titan] 2025-09-09 16:02:36,423 - root - INFO - lr: 7.4536e-06 gnorm: 0.42 [1 day, 22:27:09<1 day, 2:58:32] +[titan] 2025-09-09 16:03:08,492 - root - INFO - step: 25310 loss: 2.7130 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.7612 global_avg_top_loss: 1.9518 +[titan] 2025-09-09 16:03:08,493 - root - INFO - lr: 7.4503e-06 gnorm: 0.39 [1 day, 22:27:41<1 day, 2:57:59] +[titan] 2025-09-09 16:03:40,383 - root - INFO - step: 25315 loss: 2.7946 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.52% global_avg_ntp_loss: 0.8052 global_avg_top_loss: 1.9894 +[titan] 2025-09-09 16:03:40,384 - root - INFO - lr: 7.4471e-06 gnorm: 0.37 [1 day, 22:28:13<1 day, 2:57:25] +[titan] 2025-09-09 16:04:12,552 - root - INFO - step: 25320 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.48 mfu: 49.09% global_avg_ntp_loss: 0.7686 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 16:04:12,553 - root - INFO - lr: 7.4438e-06 gnorm: 0.37 [1 day, 22:28:45<1 day, 2:56:51] +[titan] 2025-09-09 16:04:44,447 - root - INFO - step: 25325 loss: 2.7165 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7663 global_avg_top_loss: 1.9502 +[titan] 2025-09-09 16:04:44,447 - root - INFO - lr: 7.4405e-06 gnorm: 0.36 [1 day, 22:29:17<1 day, 2:56:18] +[titan] 2025-09-09 16:05:16,589 - root - INFO - step: 25330 loss: 2.7882 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.87 mfu: 49.13% global_avg_ntp_loss: 0.7978 
global_avg_top_loss: 1.9904 +[titan] 2025-09-09 16:05:16,590 - root - INFO - lr: 7.4372e-06 gnorm: 0.36 [1 day, 22:29:49<1 day, 2:55:44] +[titan] 2025-09-09 16:05:48,482 - root - INFO - step: 25335 loss: 2.6578 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7428 global_avg_top_loss: 1.9151 +[titan] 2025-09-09 16:05:48,482 - root - INFO - lr: 7.4339e-06 gnorm: 0.34 [1 day, 22:30:21<1 day, 2:55:10] +[titan] 2025-09-09 16:06:20,499 - root - INFO - step: 25340 loss: 2.7553 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.7829 global_avg_top_loss: 1.9724 +[titan] 2025-09-09 16:06:20,500 - root - INFO - lr: 7.4307e-06 gnorm: 0.50 [1 day, 22:30:53<1 day, 2:54:37] +[titan] 2025-09-09 16:06:52,373 - root - INFO - step: 25345 loss: 2.7815 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9856 +[titan] 2025-09-09 16:06:52,374 - root - INFO - lr: 7.4274e-06 gnorm: 0.35 [1 day, 22:31:25<1 day, 2:54:03] +[titan] 2025-09-09 16:07:17,986 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:07:24,276 - root - INFO - step: 25350 loss: 2.9346 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.8872 global_avg_top_loss: 2.0474 +[titan] 2025-09-09 16:07:24,276 - root - INFO - lr: 7.4241e-06 gnorm: 0.35 [1 day, 22:31:57<1 day, 2:53:29] +[titan] 2025-09-09 16:07:56,252 - root - INFO - step: 25355 loss: 3.1781 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 1.0354 global_avg_top_loss: 2.1427 +[titan] 2025-09-09 16:07:56,252 - root - INFO - lr: 7.4208e-06 gnorm: 0.35 [1 day, 22:32:29<1 day, 2:52:56] +[titan] 2025-09-09 16:08:28,188 - root - INFO - step: 25360 loss: 2.7347 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9646 +[titan] 2025-09-09 16:08:28,188 - root - INFO - lr: 7.4176e-06 gnorm: 0.37 [1 day, 22:33:01<1 day, 2:52:22] +[titan] 2025-09-09 16:09:00,345 - root - INFO - step: 25365 loss: 2.6873 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.65 mfu: 49.11% global_avg_ntp_loss: 0.7526 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 16:09:00,345 - root - INFO - lr: 7.4143e-06 gnorm: 0.36 [1 day, 22:33:33<1 day, 2:51:48] +[titan] 2025-09-09 16:09:32,366 - root - INFO - step: 25370 loss: 3.2046 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.0395 global_avg_top_loss: 2.1651 +[titan] 2025-09-09 16:09:32,366 - root - INFO - lr: 7.4110e-06 gnorm: 0.36 [1 day, 22:34:05<1 day, 2:51:15] +[titan] 2025-09-09 16:10:04,449 - root - INFO - step: 25375 loss: 2.6200 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7211 global_avg_top_loss: 1.8989 +[titan] 2025-09-09 16:10:04,450 - root - INFO - lr: 7.4077e-06 gnorm: 0.34 [1 day, 22:34:37<1 day, 2:50:41] +[titan] 2025-09-09 16:10:36,322 - root - INFO - step: 25380 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.7981 
global_avg_top_loss: 1.9943 +[titan] 2025-09-09 16:10:36,322 - root - INFO - lr: 7.4045e-06 gnorm: 0.37 [1 day, 22:35:09<1 day, 2:50:08] +[titan] 2025-09-09 16:11:08,389 - root - INFO - step: 25385 loss: 3.2676 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 1.0810 global_avg_top_loss: 2.1866 +[titan] 2025-09-09 16:11:08,390 - root - INFO - lr: 7.4012e-06 gnorm: 0.39 [1 day, 22:35:41<1 day, 2:49:34] +[titan] 2025-09-09 16:11:40,275 - root - INFO - step: 25390 loss: 2.7992 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.7962 global_avg_top_loss: 2.0030 +[titan] 2025-09-09 16:11:40,275 - root - INFO - lr: 7.3979e-06 gnorm: 1.10 [1 day, 22:36:13<1 day, 2:49:00] +[titan] 2025-09-09 16:12:12,338 - root - INFO - step: 25395 loss: 2.6125 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.7189 global_avg_top_loss: 1.8936 +[titan] 2025-09-09 16:12:12,338 - root - INFO - lr: 7.3947e-06 gnorm: 0.34 [1 day, 22:36:45<1 day, 2:48:27] +[titan] 2025-09-09 16:12:37,994 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:12:44,311 - root - INFO - step: 25400 loss: 2.6897 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7574 global_avg_top_loss: 1.9323 +[titan] 2025-09-09 16:12:44,311 - root - INFO - lr: 7.3914e-06 gnorm: 0.38 [1 day, 22:37:17<1 day, 2:47:53] +[titan] 2025-09-09 16:13:16,347 - root - INFO - step: 25405 loss: 2.6516 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7371 global_avg_top_loss: 1.9145 +[titan] 2025-09-09 16:13:16,348 - root - INFO - lr: 7.3881e-06 gnorm: 0.35 [1 day, 22:37:49<1 day, 2:47:19] +[titan] 2025-09-09 16:13:48,361 - root - INFO - step: 25410 loss: 2.8164 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.8323 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 16:13:48,361 - root - INFO - lr: 7.3848e-06 gnorm: 0.40 [1 day, 22:38:21<1 day, 2:46:46] +[titan] 2025-09-09 16:14:20,324 - root - INFO - step: 25415 loss: 2.6447 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9130 +[titan] 2025-09-09 16:14:20,324 - root - INFO - lr: 7.3816e-06 gnorm: 0.56 [1 day, 22:38:53<1 day, 2:46:12] +[titan] 2025-09-09 16:14:52,409 - root - INFO - step: 25420 loss: 2.6779 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.22% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9234 +[titan] 2025-09-09 16:14:52,409 - root - INFO - lr: 7.3783e-06 gnorm: 0.35 [1 day, 22:39:25<1 day, 2:45:38] +[titan] 2025-09-09 16:15:24,374 - root - INFO - step: 25425 loss: 2.6319 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7303 global_avg_top_loss: 1.9015 +[titan] 2025-09-09 16:15:24,374 - root - INFO - lr: 7.3750e-06 gnorm: 0.41 [1 day, 22:39:57<1 day, 2:45:05] +[titan] 2025-09-09 16:15:56,587 - root - INFO - step: 25430 loss: 2.7570 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.82 mfu: 49.02% global_avg_ntp_loss: 0.7801 
global_avg_top_loss: 1.9769 +[titan] 2025-09-09 16:15:56,587 - root - INFO - lr: 7.3718e-06 gnorm: 0.37 [1 day, 22:40:29<1 day, 2:44:31] +[titan] 2025-09-09 16:16:28,928 - root - INFO - step: 25435 loss: 3.1302 memory: 122.03GiB(87.57%) tps: 10,132 tflops: 482.88 mfu: 48.83% global_avg_ntp_loss: 1.0074 global_avg_top_loss: 2.1228 +[titan] 2025-09-09 16:16:28,929 - root - INFO - lr: 7.3685e-06 gnorm: 0.38 [1 day, 22:41:01<1 day, 2:43:58] +[titan] 2025-09-09 16:17:00,957 - root - INFO - step: 25440 loss: 2.5854 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7078 global_avg_top_loss: 1.8776 +[titan] 2025-09-09 16:17:00,957 - root - INFO - lr: 7.3652e-06 gnorm: 0.33 [1 day, 22:41:34<1 day, 2:43:24] +[titan] 2025-09-09 16:17:33,262 - root - INFO - step: 25445 loss: 2.7243 memory: 122.03GiB(87.57%) tps: 10,144 tflops: 483.43 mfu: 48.88% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9541 +[titan] 2025-09-09 16:17:33,262 - root - INFO - lr: 7.3620e-06 gnorm: 0.39 [1 day, 22:42:06<1 day, 2:42:51] +[titan] 2025-09-09 16:17:58,884 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:18:05,280 - root - INFO - step: 25450 loss: 3.2906 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 1.0822 global_avg_top_loss: 2.2084 +[titan] 2025-09-09 16:18:05,281 - root - INFO - lr: 7.3587e-06 gnorm: 0.37 [1 day, 22:42:38<1 day, 2:42:17] +[titan] 2025-09-09 16:18:37,228 - root - INFO - step: 25455 loss: 2.8179 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.8167 global_avg_top_loss: 2.0013 +[titan] 2025-09-09 16:18:37,229 - root - INFO - lr: 7.3554e-06 gnorm: 0.37 [1 day, 22:43:10<1 day, 2:41:44] +[titan] 2025-09-09 16:19:09,122 - root - INFO - step: 25460 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7717 global_avg_top_loss: 1.9540 +[titan] 2025-09-09 16:19:09,122 - root - INFO - lr: 7.3522e-06 gnorm: 0.43 [1 day, 22:43:42<1 day, 2:41:10] +[titan] 2025-09-09 16:19:41,167 - root - INFO - step: 25465 loss: 3.0916 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.9822 global_avg_top_loss: 2.1094 +[titan] 2025-09-09 16:19:41,167 - root - INFO - lr: 7.3489e-06 gnorm: 0.40 [1 day, 22:44:14<1 day, 2:40:36] +[titan] 2025-09-09 16:20:13,272 - root - INFO - step: 25470 loss: 2.7736 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.19% global_avg_ntp_loss: 0.7890 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 16:20:13,272 - root - INFO - lr: 7.3457e-06 gnorm: 0.46 [1 day, 22:44:46<1 day, 2:40:03] +[titan] 2025-09-09 16:20:45,235 - root - INFO - step: 25475 loss: 2.7916 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.8181 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 16:20:45,235 - root - INFO - lr: 7.3424e-06 gnorm: 0.59 [1 day, 22:45:18<1 day, 2:39:29] +[titan] 2025-09-09 16:21:17,230 - root - INFO - step: 25480 loss: 2.7603 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.7877 
global_avg_top_loss: 1.9726 +[titan] 2025-09-09 16:21:17,231 - root - INFO - lr: 7.3391e-06 gnorm: 0.37 [1 day, 22:45:50<1 day, 2:38:55] +[titan] 2025-09-09 16:21:49,241 - root - INFO - step: 25485 loss: 2.7340 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 16:21:49,241 - root - INFO - lr: 7.3359e-06 gnorm: 0.37 [1 day, 22:46:22<1 day, 2:38:22] +[titan] 2025-09-09 16:22:21,439 - root - INFO - step: 25490 loss: 2.6630 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.03 mfu: 49.04% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9215 +[titan] 2025-09-09 16:22:21,439 - root - INFO - lr: 7.3326e-06 gnorm: 0.36 [1 day, 22:46:54<1 day, 2:37:48] +[titan] 2025-09-09 16:22:53,656 - root - INFO - step: 25495 loss: 3.2009 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.74 mfu: 49.01% global_avg_ntp_loss: 1.0382 global_avg_top_loss: 2.1628 +[titan] 2025-09-09 16:22:53,657 - root - INFO - lr: 7.3293e-06 gnorm: 0.37 [1 day, 22:47:26<1 day, 2:37:15] +[titan] 2025-09-09 16:23:19,122 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:23:25,491 - root - INFO - step: 25500 loss: 2.7314 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.57 mfu: 49.60% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9567 +[titan] 2025-09-09 16:23:25,491 - root - INFO - lr: 7.3261e-06 gnorm: 0.36 [1 day, 22:47:58<1 day, 2:36:41] +[titan] 2025-09-09 16:23:57,488 - root - INFO - step: 25505 loss: 2.6509 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7394 global_avg_top_loss: 1.9115 +[titan] 2025-09-09 16:23:57,488 - root - INFO - lr: 7.3228e-06 gnorm: 0.35 [1 day, 22:48:30<1 day, 2:36:07] +[titan] 2025-09-09 16:24:29,549 - root - INFO - step: 25510 loss: 2.8351 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.8257 global_avg_top_loss: 2.0094 +[titan] 2025-09-09 16:24:29,549 - root - INFO - lr: 7.3196e-06 gnorm: 0.38 [1 day, 22:49:02<1 day, 2:35:34] +[titan] 2025-09-09 16:25:01,465 - root - INFO - step: 25515 loss: 3.2158 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 1.0482 global_avg_top_loss: 2.1676 +[titan] 2025-09-09 16:25:01,466 - root - INFO - lr: 7.3163e-06 gnorm: 0.36 [1 day, 22:49:34<1 day, 2:35:00] +[titan] 2025-09-09 16:25:33,569 - root - INFO - step: 25520 loss: 2.7626 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 16:25:33,569 - root - INFO - lr: 7.3131e-06 gnorm: 0.43 [1 day, 22:50:06<1 day, 2:34:27] +[titan] 2025-09-09 16:26:05,485 - root - INFO - step: 25525 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9477 +[titan] 2025-09-09 16:26:05,485 - root - INFO - lr: 7.3098e-06 gnorm: 0.39 [1 day, 22:50:38<1 day, 2:33:53] +[titan] 2025-09-09 16:26:37,376 - root - INFO - step: 25530 loss: 2.8116 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.8103 
global_avg_top_loss: 2.0013 +[titan] 2025-09-09 16:26:37,376 - root - INFO - lr: 7.3065e-06 gnorm: 0.37 [1 day, 22:51:10<1 day, 2:33:19] +[titan] 2025-09-09 16:27:09,381 - root - INFO - step: 25535 loss: 2.7881 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 16:27:09,382 - root - INFO - lr: 7.3033e-06 gnorm: 0.40 [1 day, 22:51:42<1 day, 2:32:46] +[titan] 2025-09-09 16:27:41,249 - root - INFO - step: 25540 loss: 2.7665 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.06 mfu: 49.55% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 16:27:41,249 - root - INFO - lr: 7.3000e-06 gnorm: 0.36 [1 day, 22:52:14<1 day, 2:32:12] +[titan] 2025-09-09 16:28:13,155 - root - INFO - step: 25545 loss: 2.6871 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 16:28:13,155 - root - INFO - lr: 7.2968e-06 gnorm: 0.35 [1 day, 22:52:46<1 day, 2:31:38] +[titan] 2025-09-09 16:28:38,631 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:28:44,998 - root - INFO - step: 25550 loss: 2.6822 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7496 global_avg_top_loss: 1.9326 +[titan] 2025-09-09 16:28:44,999 - root - INFO - lr: 7.2935e-06 gnorm: 0.37 [1 day, 22:53:18<1 day, 2:31:05] +[titan] 2025-09-09 16:29:16,905 - root - INFO - step: 25555 loss: 2.7471 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9580 +[titan] 2025-09-09 16:29:16,906 - root - INFO - lr: 7.2903e-06 gnorm: 0.38 [1 day, 22:53:49<1 day, 2:30:31] +[titan] 2025-09-09 16:29:49,017 - root - INFO - step: 25560 loss: 2.8651 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.34 mfu: 49.17% global_avg_ntp_loss: 0.8428 global_avg_top_loss: 2.0224 +[titan] 2025-09-09 16:29:49,017 - root - INFO - lr: 7.2870e-06 gnorm: 0.37 [1 day, 22:54:22<1 day, 2:29:57] +[titan] 2025-09-09 16:30:20,954 - root - INFO - step: 25565 loss: 2.7229 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 16:30:20,955 - root - INFO - lr: 7.2838e-06 gnorm: 0.35 [1 day, 22:54:53<1 day, 2:29:24] +[titan] 2025-09-09 16:30:52,822 - root - INFO - step: 25570 loss: 2.7319 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9592 +[titan] 2025-09-09 16:30:52,822 - root - INFO - lr: 7.2805e-06 gnorm: 0.36 [1 day, 22:55:25<1 day, 2:28:50] +[titan] 2025-09-09 16:31:24,711 - root - INFO - step: 25575 loss: 3.2013 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.74 mfu: 49.52% global_avg_ntp_loss: 1.0378 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 16:31:24,711 - root - INFO - lr: 7.2773e-06 gnorm: 0.37 [1 day, 22:55:57<1 day, 2:28:16] +[titan] 2025-09-09 16:31:56,872 - root - INFO - step: 25580 loss: 2.7531 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.58 mfu: 49.10% global_avg_ntp_loss: 0.7817 
global_avg_top_loss: 1.9714 +[titan] 2025-09-09 16:31:56,873 - root - INFO - lr: 7.2740e-06 gnorm: 0.37 [1 day, 22:56:29<1 day, 2:27:43] +[titan] 2025-09-09 16:32:28,798 - root - INFO - step: 25585 loss: 2.6784 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.17 mfu: 49.46% global_avg_ntp_loss: 0.7474 global_avg_top_loss: 1.9310 +[titan] 2025-09-09 16:32:28,798 - root - INFO - lr: 7.2708e-06 gnorm: 0.35 [1 day, 22:57:01<1 day, 2:27:09] +[titan] 2025-09-09 16:33:00,631 - root - INFO - step: 25590 loss: 2.6783 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.7502 global_avg_top_loss: 1.9281 +[titan] 2025-09-09 16:33:00,631 - root - INFO - lr: 7.2675e-06 gnorm: 0.35 [1 day, 22:57:33<1 day, 2:26:35] +[titan] 2025-09-09 16:33:32,642 - root - INFO - step: 25595 loss: 3.1604 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 1.0208 global_avg_top_loss: 2.1396 +[titan] 2025-09-09 16:33:32,642 - root - INFO - lr: 7.2643e-06 gnorm: 0.35 [1 day, 22:58:05<1 day, 2:26:02] +[titan] 2025-09-09 16:33:58,236 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:34:04,694 - root - INFO - step: 25600 loss: 2.6280 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.24 mfu: 49.27% global_avg_ntp_loss: 0.7240 global_avg_top_loss: 1.9040 +[titan] 2025-09-09 16:34:04,695 - root - INFO - lr: 7.2610e-06 gnorm: 0.34 [1 day, 22:58:37<1 day, 2:25:28] +[titan] 2025-09-09 16:34:04,980 - root - INFO - Dumping profiler traces at step 25600 +[titan] 2025-09-09 16:34:05,049 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 16:34:36,889 - root - INFO - step: 25605 loss: 2.7729 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7875 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 16:34:36,889 - root - INFO - lr: 7.2578e-06 gnorm: 0.36 [1 day, 22:59:09<1 day, 2:24:55] +[titan] 2025-09-09 16:35:08,770 - root - INFO - step: 25610 loss: 2.7704 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9831 +[titan] 2025-09-09 16:35:08,770 - root - INFO - lr: 7.2545e-06 gnorm: 0.37 [1 day, 22:59:41<1 day, 2:24:21] +[titan] 2025-09-09 16:35:40,587 - root - INFO - step: 25615 loss: 2.6083 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7160 global_avg_top_loss: 1.8923 +[titan] 2025-09-09 16:35:40,587 - root - INFO - lr: 7.2513e-06 gnorm: 0.38 [1 day, 23:00:13<1 day, 2:23:47] +[titan] 2025-09-09 16:36:12,515 - root - INFO - step: 25620 loss: 2.8002 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.8017 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 16:36:12,515 - root - INFO - lr: 7.2480e-06 gnorm: 0.36 [1 day, 23:00:45<1 day, 2:23:14] +[titan] 2025-09-09 16:36:44,513 - root - INFO - step: 25625 loss: 2.7163 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 16:36:44,513 - root - INFO - lr: 7.2448e-06 gnorm: 0.35 [1 day, 23:01:17<1 day, 
2:22:40] +[titan] 2025-09-09 16:37:16,447 - root - INFO - step: 25630 loss: 2.7114 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7650 global_avg_top_loss: 1.9465 +[titan] 2025-09-09 16:37:16,447 - root - INFO - lr: 7.2416e-06 gnorm: 0.37 [1 day, 23:01:49<1 day, 2:22:06] +[titan] 2025-09-09 16:37:48,513 - root - INFO - step: 25635 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7860 global_avg_top_loss: 1.9718 +[titan] 2025-09-09 16:37:48,514 - root - INFO - lr: 7.2383e-06 gnorm: 0.36 [1 day, 23:02:21<1 day, 2:21:33] +[titan] 2025-09-09 16:38:20,467 - root - INFO - step: 25640 loss: 2.6960 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9406 +[titan] 2025-09-09 16:38:20,467 - root - INFO - lr: 7.2351e-06 gnorm: 0.35 [1 day, 23:02:53<1 day, 2:20:59] +[titan] 2025-09-09 16:38:52,615 - root - INFO - step: 25645 loss: 2.7464 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 16:38:52,615 - root - INFO - lr: 7.2318e-06 gnorm: 0.36 [1 day, 23:03:25<1 day, 2:20:26] +[titan] 2025-09-09 16:39:18,124 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:39:24,526 - root - INFO - step: 25650 loss: 2.8335 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0080 +[titan] 2025-09-09 16:39:24,527 - root - INFO - lr: 7.2286e-06 gnorm: 0.40 [1 day, 23:03:57<1 day, 2:19:52] +[titan] 2025-09-09 16:39:56,644 - root - INFO - step: 25655 loss: 3.2557 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 1.0758 global_avg_top_loss: 2.1799 +[titan] 2025-09-09 16:39:56,644 - root - INFO - lr: 7.2253e-06 gnorm: 0.35 [1 day, 23:04:29<1 day, 2:19:19] +[titan] 2025-09-09 16:40:28,606 - root - INFO - step: 25660 loss: 2.7666 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7880 global_avg_top_loss: 1.9786 +[titan] 2025-09-09 16:40:28,606 - root - INFO - lr: 7.2221e-06 gnorm: 0.37 [1 day, 23:05:01<1 day, 2:18:45] +[titan] 2025-09-09 16:41:00,483 - root - INFO - step: 25665 loss: 2.8095 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.8120 global_avg_top_loss: 1.9974 +[titan] 2025-09-09 16:41:00,483 - root - INFO - lr: 7.2189e-06 gnorm: 0.38 [1 day, 23:05:33<1 day, 2:18:11] +[titan] 2025-09-09 16:41:32,556 - root - INFO - step: 25670 loss: 2.8191 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.8116 global_avg_top_loss: 2.0074 +[titan] 2025-09-09 16:41:32,557 - root - INFO - lr: 7.2156e-06 gnorm: 0.38 [1 day, 23:06:05<1 day, 2:17:38] +[titan] 2025-09-09 16:42:04,690 - root - INFO - step: 25675 loss: 2.8360 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 486.00 mfu: 49.14% global_avg_ntp_loss: 0.8426 global_avg_top_loss: 1.9935 +[titan] 2025-09-09 16:42:04,691 - root - INFO - lr: 7.2124e-06 gnorm: 0.36 [1 day, 23:06:37<1 day, 2:17:04] +[titan] 2025-09-09 16:42:36,428 - root - INFO - step: 25680 loss: 2.6169 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.76% global_avg_ntp_loss: 0.7155 
global_avg_top_loss: 1.9014 +[titan] 2025-09-09 16:42:36,428 - root - INFO - lr: 7.2091e-06 gnorm: 0.51 [1 day, 23:07:09<1 day, 2:16:30] +[titan] 2025-09-09 16:43:08,421 - root - INFO - step: 25685 loss: 2.7441 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 16:43:08,422 - root - INFO - lr: 7.2059e-06 gnorm: 0.36 [1 day, 23:07:41<1 day, 2:15:57] +[titan] 2025-09-09 16:43:40,545 - root - INFO - step: 25690 loss: 2.7775 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 16:43:40,546 - root - INFO - lr: 7.2027e-06 gnorm: 0.37 [1 day, 23:08:13<1 day, 2:15:23] +[titan] 2025-09-09 16:44:12,481 - root - INFO - step: 25695 loss: 2.6731 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7430 global_avg_top_loss: 1.9301 +[titan] 2025-09-09 16:44:12,481 - root - INFO - lr: 7.1994e-06 gnorm: 0.42 [1 day, 23:08:45<1 day, 2:14:50] +[titan] 2025-09-09 16:44:38,153 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:44:44,521 - root - INFO - step: 25700 loss: 2.7730 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.29% global_avg_ntp_loss: 0.7941 global_avg_top_loss: 1.9789 +[titan] 2025-09-09 16:44:44,521 - root - INFO - lr: 7.1962e-06 gnorm: 0.38 [1 day, 23:09:17<1 day, 2:14:16] +[titan] 2025-09-09 16:45:16,477 - root - INFO - step: 25705 loss: 2.7261 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9531 +[titan] 2025-09-09 16:45:16,478 - root - INFO - lr: 7.1930e-06 gnorm: 0.37 [1 day, 23:09:49<1 day, 2:13:42] +[titan] 2025-09-09 16:45:48,423 - root - INFO - step: 25710 loss: 2.7508 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9686 +[titan] 2025-09-09 16:45:48,423 - root - INFO - lr: 7.1897e-06 gnorm: 0.37 [1 day, 23:10:21<1 day, 2:13:09] +[titan] 2025-09-09 16:46:20,280 - root - INFO - step: 25715 loss: 2.6996 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.7619 global_avg_top_loss: 1.9377 +[titan] 2025-09-09 16:46:20,280 - root - INFO - lr: 7.1865e-06 gnorm: 0.36 [1 day, 23:10:53<1 day, 2:12:35] +[titan] 2025-09-09 16:46:52,243 - root - INFO - step: 25720 loss: 2.7088 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 16:46:52,243 - root - INFO - lr: 7.1833e-06 gnorm: 3.44 [1 day, 23:11:25<1 day, 2:12:01] +[titan] 2025-09-09 16:47:24,366 - root - INFO - step: 25725 loss: 2.6939 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9389 +[titan] 2025-09-09 16:47:24,367 - root - INFO - lr: 7.1800e-06 gnorm: 0.37 [1 day, 23:11:57<1 day, 2:11:28] +[titan] 2025-09-09 16:47:56,419 - root - INFO - step: 25730 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.27% global_avg_ntp_loss: 0.7672 
global_avg_top_loss: 1.9518 +[titan] 2025-09-09 16:47:56,419 - root - INFO - lr: 7.1768e-06 gnorm: 0.38 [1 day, 23:12:29<1 day, 2:10:54] +[titan] 2025-09-09 16:48:28,326 - root - INFO - step: 25735 loss: 3.1863 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 1.0377 global_avg_top_loss: 2.1486 +[titan] 2025-09-09 16:48:28,327 - root - INFO - lr: 7.1736e-06 gnorm: 0.37 [1 day, 23:13:01<1 day, 2:10:21] +[titan] 2025-09-09 16:49:00,377 - root - INFO - step: 25740 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7691 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 16:49:00,377 - root - INFO - lr: 7.1703e-06 gnorm: 0.36 [1 day, 23:13:33<1 day, 2:09:47] +[titan] 2025-09-09 16:49:32,664 - root - INFO - step: 25745 loss: 2.8013 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.69 mfu: 48.91% global_avg_ntp_loss: 0.8027 global_avg_top_loss: 1.9986 +[titan] 2025-09-09 16:49:32,664 - root - INFO - lr: 7.1671e-06 gnorm: 0.46 [1 day, 23:14:05<1 day, 2:09:14] +[titan] 2025-09-09 16:49:58,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:50:04,486 - root - INFO - step: 25750 loss: 2.9044 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.77 mfu: 49.62% global_avg_ntp_loss: 0.8640 global_avg_top_loss: 2.0404 +[titan] 2025-09-09 16:50:04,487 - root - INFO - lr: 7.1639e-06 gnorm: 0.36 [1 day, 23:14:37<1 day, 2:08:40] +[titan] 2025-09-09 16:50:36,447 - root - INFO - step: 25755 loss: 2.7662 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 16:50:36,447 - root - INFO - lr: 7.1606e-06 gnorm: 0.35 [1 day, 23:15:09<1 day, 2:08:06] +[titan] 2025-09-09 16:51:08,409 - root - INFO - step: 25760 loss: 2.6984 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7640 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 16:51:08,410 - root - INFO - lr: 7.1574e-06 gnorm: 0.38 [1 day, 23:15:41<1 day, 2:07:33] +[titan] 2025-09-09 16:51:40,427 - root - INFO - step: 25765 loss: 2.7768 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 16:51:40,427 - root - INFO - lr: 7.1542e-06 gnorm: 0.37 [1 day, 23:16:13<1 day, 2:06:59] +[titan] 2025-09-09 16:52:12,494 - root - INFO - step: 25770 loss: 2.6964 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9386 +[titan] 2025-09-09 16:52:12,494 - root - INFO - lr: 7.1510e-06 gnorm: 0.36 [1 day, 23:16:45<1 day, 2:06:26] +[titan] 2025-09-09 16:52:44,570 - root - INFO - step: 25775 loss: 2.8399 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.8329 global_avg_top_loss: 2.0070 +[titan] 2025-09-09 16:52:44,570 - root - INFO - lr: 7.1477e-06 gnorm: 0.42 [1 day, 23:17:17<1 day, 2:05:52] +[titan] 2025-09-09 16:53:16,626 - root - INFO - step: 25780 loss: 2.7932 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.8052 
global_avg_top_loss: 1.9880 +[titan] 2025-09-09 16:53:16,626 - root - INFO - lr: 7.1445e-06 gnorm: 0.42 [1 day, 23:17:49<1 day, 2:05:19] +[titan] 2025-09-09 16:53:48,538 - root - INFO - step: 25785 loss: 2.7354 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 16:53:48,538 - root - INFO - lr: 7.1413e-06 gnorm: 0.36 [1 day, 23:18:21<1 day, 2:04:45] +[titan] 2025-09-09 16:54:20,637 - root - INFO - step: 25790 loss: 2.7536 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 16:54:20,638 - root - INFO - lr: 7.1381e-06 gnorm: 0.35 [1 day, 23:18:53<1 day, 2:04:11] +[titan] 2025-09-09 16:54:52,828 - root - INFO - step: 25795 loss: 2.6883 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.15 mfu: 49.05% global_avg_ntp_loss: 0.7527 global_avg_top_loss: 1.9357 +[titan] 2025-09-09 16:54:52,828 - root - INFO - lr: 7.1348e-06 gnorm: 0.40 [1 day, 23:19:25<1 day, 2:03:38] +[titan] 2025-09-09 16:55:18,493 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:55:25,010 - root - INFO - step: 25800 loss: 2.7297 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9598 +[titan] 2025-09-09 16:55:25,010 - root - INFO - lr: 7.1316e-06 gnorm: 0.37 [1 day, 23:19:58<1 day, 2:03:04] +[titan] 2025-09-09 16:55:57,111 - root - INFO - step: 25805 loss: 2.6895 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.51 mfu: 49.19% global_avg_ntp_loss: 0.7543 global_avg_top_loss: 1.9352 +[titan] 2025-09-09 16:55:57,111 - root - INFO - lr: 7.1284e-06 gnorm: 0.39 [1 day, 23:20:30<1 day, 2:02:31] +[titan] 2025-09-09 16:56:29,125 - root - INFO - step: 25810 loss: 2.7379 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 16:56:29,125 - root - INFO - lr: 7.1252e-06 gnorm: 0.36 [1 day, 23:21:02<1 day, 2:01:57] +[titan] 2025-09-09 16:57:01,095 - root - INFO - step: 25815 loss: 2.7877 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7999 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 16:57:01,095 - root - INFO - lr: 7.1219e-06 gnorm: 0.39 [1 day, 23:21:34<1 day, 2:01:24] +[titan] 2025-09-09 16:57:33,367 - root - INFO - step: 25820 loss: 2.6703 memory: 122.03GiB(87.57%) tps: 10,154 tflops: 483.92 mfu: 48.93% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9253 +[titan] 2025-09-09 16:57:33,368 - root - INFO - lr: 7.1187e-06 gnorm: 0.38 [1 day, 23:22:06<1 day, 2:00:50] +[titan] 2025-09-09 16:58:05,416 - root - INFO - step: 25825 loss: 2.7347 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.30 mfu: 49.27% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 16:58:05,416 - root - INFO - lr: 7.1155e-06 gnorm: 0.36 [1 day, 23:22:38<1 day, 2:00:17] +[titan] 2025-09-09 16:58:37,439 - root - INFO - step: 25830 loss: 2.5870 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7084 
global_avg_top_loss: 1.8786 +[titan] 2025-09-09 16:58:37,439 - root - INFO - lr: 7.1123e-06 gnorm: 0.37 [1 day, 23:23:10<1 day, 1:59:43] +[titan] 2025-09-09 16:59:09,623 - root - INFO - step: 25835 loss: 2.7255 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.7672 global_avg_top_loss: 1.9583 +[titan] 2025-09-09 16:59:09,624 - root - INFO - lr: 7.1091e-06 gnorm: 0.39 [1 day, 23:23:42<1 day, 1:59:10] +[titan] 2025-09-09 16:59:41,635 - root - INFO - step: 25840 loss: 2.8682 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.8473 global_avg_top_loss: 2.0209 +[titan] 2025-09-09 16:59:41,635 - root - INFO - lr: 7.1058e-06 gnorm: 0.40 [1 day, 23:24:14<1 day, 1:58:36] +[titan] 2025-09-09 17:00:13,784 - root - INFO - step: 25845 loss: 2.7899 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.78 mfu: 49.12% global_avg_ntp_loss: 0.7967 global_avg_top_loss: 1.9932 +[titan] 2025-09-09 17:00:13,784 - root - INFO - lr: 7.1026e-06 gnorm: 0.38 [1 day, 23:24:46<1 day, 1:58:03] +[titan] 2025-09-09 17:00:39,706 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:00:46,109 - root - INFO - step: 25850 loss: 2.6656 memory: 122.03GiB(87.57%) tps: 10,137 tflops: 483.13 mfu: 48.85% global_avg_ntp_loss: 0.7446 global_avg_top_loss: 1.9209 +[titan] 2025-09-09 17:00:46,109 - root - INFO - lr: 7.0994e-06 gnorm: 0.36 [1 day, 23:25:19<1 day, 1:57:29] +[titan] 2025-09-09 17:01:17,992 - root - INFO - step: 25855 loss: 2.7729 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9819 +[titan] 2025-09-09 17:01:17,993 - root - INFO - lr: 7.0962e-06 gnorm: 0.38 [1 day, 23:25:51<1 day, 1:56:56] +[titan] 2025-09-09 17:01:49,973 - root - INFO - step: 25860 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7680 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 17:01:49,974 - root - INFO - lr: 7.0930e-06 gnorm: 0.37 [1 day, 23:26:22<1 day, 1:56:22] +[titan] 2025-09-09 17:02:21,804 - root - INFO - step: 25865 loss: 2.6994 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 17:02:21,804 - root - INFO - lr: 7.0898e-06 gnorm: 0.36 [1 day, 23:26:54<1 day, 1:55:48] +[titan] 2025-09-09 17:02:53,718 - root - INFO - step: 25870 loss: 2.7622 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 17:02:53,718 - root - INFO - lr: 7.0865e-06 gnorm: 0.39 [1 day, 23:27:26<1 day, 1:55:15] +[titan] 2025-09-09 17:03:25,967 - root - INFO - step: 25875 loss: 2.6897 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.26 mfu: 48.96% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9343 +[titan] 2025-09-09 17:03:25,968 - root - INFO - lr: 7.0833e-06 gnorm: 0.35 [1 day, 23:27:58<1 day, 1:54:41] +[titan] 2025-09-09 17:03:58,066 - root - INFO - step: 25880 loss: 2.7137 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.19% global_avg_ntp_loss: 0.7647 
global_avg_top_loss: 1.9489 +[titan] 2025-09-09 17:03:58,066 - root - INFO - lr: 7.0801e-06 gnorm: 0.36 [1 day, 23:28:31<1 day, 1:54:08] +[titan] 2025-09-09 17:04:30,040 - root - INFO - step: 25885 loss: 2.6279 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7299 global_avg_top_loss: 1.8980 +[titan] 2025-09-09 17:04:30,040 - root - INFO - lr: 7.0769e-06 gnorm: 0.37 [1 day, 23:29:03<1 day, 1:53:34] +[titan] 2025-09-09 17:05:02,081 - root - INFO - step: 25890 loss: 2.7596 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7935 global_avg_top_loss: 1.9662 +[titan] 2025-09-09 17:05:02,081 - root - INFO - lr: 7.0737e-06 gnorm: 0.37 [1 day, 23:29:35<1 day, 1:53:01] +[titan] 2025-09-09 17:05:33,879 - root - INFO - step: 25895 loss: 2.7432 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.13 mfu: 49.66% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9658 +[titan] 2025-09-09 17:05:33,879 - root - INFO - lr: 7.0705e-06 gnorm: 0.37 [1 day, 23:30:06<1 day, 1:52:27] +[titan] 2025-09-09 17:05:59,651 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:06:06,076 - root - INFO - step: 25900 loss: 2.7325 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.05 mfu: 49.04% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 17:06:06,077 - root - INFO - lr: 7.0673e-06 gnorm: 0.56 [1 day, 23:30:39<1 day, 1:51:53] +[titan] 2025-09-09 17:06:38,040 - root - INFO - step: 25905 loss: 2.8397 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.8264 global_avg_top_loss: 2.0133 +[titan] 2025-09-09 17:06:38,041 - root - INFO - lr: 7.0641e-06 gnorm: 0.37 [1 day, 23:31:11<1 day, 1:51:20] +[titan] 2025-09-09 17:07:10,116 - root - INFO - step: 25910 loss: 2.7564 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9749 +[titan] 2025-09-09 17:07:10,117 - root - INFO - lr: 7.0608e-06 gnorm: 0.36 [1 day, 23:31:43<1 day, 1:50:46] +[titan] 2025-09-09 17:07:42,188 - root - INFO - step: 25915 loss: 3.1628 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 1.0202 global_avg_top_loss: 2.1426 +[titan] 2025-09-09 17:07:42,188 - root - INFO - lr: 7.0576e-06 gnorm: 0.36 [1 day, 23:32:15<1 day, 1:50:13] +[titan] 2025-09-09 17:08:14,262 - root - INFO - step: 25920 loss: 2.8122 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.8137 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 17:08:14,263 - root - INFO - lr: 7.0544e-06 gnorm: 0.36 [1 day, 23:32:47<1 day, 1:49:39] +[titan] 2025-09-09 17:08:46,253 - root - INFO - step: 25925 loss: 2.8404 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8278 global_avg_top_loss: 2.0126 +[titan] 2025-09-09 17:08:46,254 - root - INFO - lr: 7.0512e-06 gnorm: 0.66 [1 day, 23:33:19<1 day, 1:49:06] +[titan] 2025-09-09 17:09:18,394 - root - INFO - step: 25930 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.7700 
global_avg_top_loss: 1.9569 +[titan] 2025-09-09 17:09:18,394 - root - INFO - lr: 7.0480e-06 gnorm: 0.37 [1 day, 23:33:51<1 day, 1:48:32] +[titan] 2025-09-09 17:09:50,352 - root - INFO - step: 25935 loss: 2.6415 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9098 +[titan] 2025-09-09 17:09:50,352 - root - INFO - lr: 7.0448e-06 gnorm: 0.36 [1 day, 23:34:23<1 day, 1:47:59] +[titan] 2025-09-09 17:10:22,460 - root - INFO - step: 25940 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9514 +[titan] 2025-09-09 17:10:22,460 - root - INFO - lr: 7.0416e-06 gnorm: 0.37 [1 day, 23:34:55<1 day, 1:47:25] +[titan] 2025-09-09 17:10:54,645 - root - INFO - step: 25945 loss: 2.7431 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 17:10:54,645 - root - INFO - lr: 7.0384e-06 gnorm: 0.37 [1 day, 23:35:27<1 day, 1:46:52] +[titan] 2025-09-09 17:11:20,181 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:11:26,613 - root - INFO - step: 25950 loss: 2.6404 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7345 global_avg_top_loss: 1.9060 +[titan] 2025-09-09 17:11:26,613 - root - INFO - lr: 7.0352e-06 gnorm: 0.36 [1 day, 23:35:59<1 day, 1:46:18] +[titan] 2025-09-09 17:11:58,760 - root - INFO - step: 25955 loss: 2.7990 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.8003 global_avg_top_loss: 1.9987 +[titan] 2025-09-09 17:11:58,761 - root - INFO - lr: 7.0320e-06 gnorm: 0.37 [1 day, 23:36:31<1 day, 1:45:45] +[titan] 2025-09-09 17:12:30,683 - root - INFO - step: 25960 loss: 2.7329 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9635 +[titan] 2025-09-09 17:12:30,683 - root - INFO - lr: 7.0288e-06 gnorm: 0.37 [1 day, 23:37:03<1 day, 1:45:11] +[titan] 2025-09-09 17:13:02,661 - root - INFO - step: 25965 loss: 2.6478 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 17:13:02,661 - root - INFO - lr: 7.0256e-06 gnorm: 0.38 [1 day, 23:37:35<1 day, 1:44:37] +[titan] 2025-09-09 17:13:34,658 - root - INFO - step: 25970 loss: 2.6278 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7235 global_avg_top_loss: 1.9042 +[titan] 2025-09-09 17:13:34,659 - root - INFO - lr: 7.0224e-06 gnorm: 0.36 [1 day, 23:38:07<1 day, 1:44:04] +[titan] 2025-09-09 17:14:06,778 - root - INFO - step: 25975 loss: 3.0415 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 0.9439 global_avg_top_loss: 2.0976 +[titan] 2025-09-09 17:14:06,778 - root - INFO - lr: 7.0192e-06 gnorm: 0.37 [1 day, 23:38:39<1 day, 1:43:30] +[titan] 2025-09-09 17:14:38,819 - root - INFO - step: 25980 loss: 2.6871 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7530 
global_avg_top_loss: 1.9341 +[titan] 2025-09-09 17:14:38,819 - root - INFO - lr: 7.0160e-06 gnorm: 0.36 [1 day, 23:39:11<1 day, 1:42:57] +[titan] 2025-09-09 17:15:10,777 - root - INFO - step: 25985 loss: 2.6717 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7479 global_avg_top_loss: 1.9238 +[titan] 2025-09-09 17:15:10,777 - root - INFO - lr: 7.0128e-06 gnorm: 0.36 [1 day, 23:39:43<1 day, 1:42:23] +[titan] 2025-09-09 17:15:42,889 - root - INFO - step: 25990 loss: 2.7102 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.34 mfu: 49.17% global_avg_ntp_loss: 0.7640 global_avg_top_loss: 1.9462 +[titan] 2025-09-09 17:15:42,889 - root - INFO - lr: 7.0096e-06 gnorm: 0.37 [1 day, 23:40:15<1 day, 1:41:50] +[titan] 2025-09-09 17:16:15,002 - root - INFO - step: 25995 loss: 3.1320 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 1.0082 global_avg_top_loss: 2.1237 +[titan] 2025-09-09 17:16:15,002 - root - INFO - lr: 7.0064e-06 gnorm: 0.37 [1 day, 23:40:47<1 day, 1:41:16] +[titan] 2025-09-09 17:16:40,779 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:16:47,195 - root - INFO - step: 26000 loss: 2.6892 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.11 mfu: 49.05% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 17:16:47,196 - root - INFO - lr: 7.0032e-06 gnorm: 0.35 [1 day, 23:41:20<1 day, 1:40:43] +[titan] 2025-09-09 17:17:19,261 - root - INFO - step: 26005 loss: 2.6976 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 17:17:19,262 - root - INFO - lr: 7.0000e-06 gnorm: 0.38 [1 day, 23:41:52<1 day, 1:40:09] +[titan] 2025-09-09 17:17:51,311 - root - INFO - step: 26010 loss: 2.7888 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 17:17:51,312 - root - INFO - lr: 6.9968e-06 gnorm: 0.37 [1 day, 23:42:24<1 day, 1:39:36] +[titan] 2025-09-09 17:18:23,359 - root - INFO - step: 26015 loss: 2.7930 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.8097 global_avg_top_loss: 1.9833 +[titan] 2025-09-09 17:18:23,360 - root - INFO - lr: 6.9936e-06 gnorm: 0.48 [1 day, 23:42:56<1 day, 1:39:02] +[titan] 2025-09-09 17:18:55,360 - root - INFO - step: 26020 loss: 2.7541 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.7820 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 17:18:55,360 - root - INFO - lr: 6.9904e-06 gnorm: 0.36 [1 day, 23:43:28<1 day, 1:38:29] +[titan] 2025-09-09 17:19:27,445 - root - INFO - step: 26025 loss: 2.7449 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.22% global_avg_ntp_loss: 0.7743 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 17:19:27,445 - root - INFO - lr: 6.9872e-06 gnorm: 0.45 [1 day, 23:44:00<1 day, 1:37:55] +[titan] 2025-09-09 17:19:59,479 - root - INFO - step: 26030 loss: 2.7256 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.53 mfu: 49.29% global_avg_ntp_loss: 0.7765 
global_avg_top_loss: 1.9492 +[titan] 2025-09-09 17:19:59,479 - root - INFO - lr: 6.9840e-06 gnorm: 0.38 [1 day, 23:44:32<1 day, 1:37:21] +[titan] 2025-09-09 17:20:31,585 - root - INFO - step: 26035 loss: 2.7881 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7991 global_avg_top_loss: 1.9889 +[titan] 2025-09-09 17:20:31,585 - root - INFO - lr: 6.9808e-06 gnorm: 0.41 [1 day, 23:45:04<1 day, 1:36:48] +[titan] 2025-09-09 17:21:03,561 - root - INFO - step: 26040 loss: 2.7484 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 17:21:03,561 - root - INFO - lr: 6.9776e-06 gnorm: 0.37 [1 day, 23:45:36<1 day, 1:36:14] +[titan] 2025-09-09 17:21:35,665 - root - INFO - step: 26045 loss: 3.0163 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.9318 global_avg_top_loss: 2.0845 +[titan] 2025-09-09 17:21:35,665 - root - INFO - lr: 6.9744e-06 gnorm: 0.36 [1 day, 23:46:08<1 day, 1:35:41] +[titan] 2025-09-09 17:22:01,267 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:22:07,693 - root - INFO - step: 26050 loss: 2.7443 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 17:22:07,693 - root - INFO - lr: 6.9712e-06 gnorm: 0.36 [1 day, 23:46:40<1 day, 1:35:07] +[titan] 2025-09-09 17:22:39,718 - root - INFO - step: 26055 loss: 2.7208 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7660 global_avg_top_loss: 1.9549 +[titan] 2025-09-09 17:22:39,719 - root - INFO - lr: 6.9680e-06 gnorm: 0.38 [1 day, 23:47:12<1 day, 1:34:34] +[titan] 2025-09-09 17:23:11,858 - root - INFO - step: 26060 loss: 2.6359 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.92 mfu: 49.13% global_avg_ntp_loss: 0.7269 global_avg_top_loss: 1.9090 +[titan] 2025-09-09 17:23:11,858 - root - INFO - lr: 6.9648e-06 gnorm: 0.50 [1 day, 23:47:44<1 day, 1:34:00] +[titan] 2025-09-09 17:23:43,901 - root - INFO - step: 26065 loss: 2.7613 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.7847 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 17:23:43,902 - root - INFO - lr: 6.9616e-06 gnorm: 0.40 [1 day, 23:48:16<1 day, 1:33:27] +[titan] 2025-09-09 17:24:15,915 - root - INFO - step: 26070 loss: 2.7550 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9740 +[titan] 2025-09-09 17:24:15,915 - root - INFO - lr: 6.9585e-06 gnorm: 0.38 [1 day, 23:48:48<1 day, 1:32:53] +[titan] 2025-09-09 17:24:47,926 - root - INFO - step: 26075 loss: 3.1116 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.9942 global_avg_top_loss: 2.1173 +[titan] 2025-09-09 17:24:47,927 - root - INFO - lr: 6.9553e-06 gnorm: 0.38 [1 day, 23:49:20<1 day, 1:32:20] +[titan] 2025-09-09 17:25:20,262 - root - INFO - step: 26080 loss: 2.7241 memory: 122.03GiB(87.57%) tps: 10,134 tflops: 482.98 mfu: 48.83% global_avg_ntp_loss: 0.7668 
global_avg_top_loss: 1.9573 +[titan] 2025-09-09 17:25:20,262 - root - INFO - lr: 6.9521e-06 gnorm: 0.38 [1 day, 23:49:53<1 day, 1:31:46] +[titan] 2025-09-09 17:25:52,315 - root - INFO - step: 26085 loss: 2.6729 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9225 +[titan] 2025-09-09 17:25:52,316 - root - INFO - lr: 6.9489e-06 gnorm: 0.37 [1 day, 23:50:25<1 day, 1:31:13] +[titan] 2025-09-09 17:26:24,324 - root - INFO - step: 26090 loss: 2.7296 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7711 global_avg_top_loss: 1.9585 +[titan] 2025-09-09 17:26:24,324 - root - INFO - lr: 6.9457e-06 gnorm: 0.37 [1 day, 23:50:57<1 day, 1:30:39] +[titan] 2025-09-09 17:26:56,361 - root - INFO - step: 26095 loss: 2.7303 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9551 +[titan] 2025-09-09 17:26:56,361 - root - INFO - lr: 6.9425e-06 gnorm: 0.37 [1 day, 23:51:29<1 day, 1:30:06] +[titan] 2025-09-09 17:27:22,100 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:27:28,467 - root - INFO - step: 26100 loss: 2.7222 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9546 +[titan] 2025-09-09 17:27:28,467 - root - INFO - lr: 6.9393e-06 gnorm: 0.37 [1 day, 23:52:01<1 day, 1:29:32] +[titan] 2025-09-09 17:28:00,387 - root - INFO - step: 26105 loss: 2.6318 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9036 +[titan] 2025-09-09 17:28:00,388 - root - INFO - lr: 6.9361e-06 gnorm: 0.35 [1 day, 23:52:33<1 day, 1:28:59] +[titan] 2025-09-09 17:28:32,440 - root - INFO - step: 26110 loss: 2.9489 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.27% global_avg_ntp_loss: 0.8720 global_avg_top_loss: 2.0769 +[titan] 2025-09-09 17:28:32,441 - root - INFO - lr: 6.9330e-06 gnorm: 0.38 [1 day, 23:53:05<1 day, 1:28:25] +[titan] 2025-09-09 17:28:45,502 - root - INFO - Dumping profiler traces at step 26112 +[titan] 2025-09-09 17:28:45,561 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 17:29:04,741 - root - INFO - step: 26115 loss: 2.6586 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.50 mfu: 48.89% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9181 +[titan] 2025-09-09 17:29:04,741 - root - INFO - lr: 6.9298e-06 gnorm: 0.41 [1 day, 23:53:37<1 day, 1:27:52] +[titan] 2025-09-09 17:29:36,776 - root - INFO - step: 26120 loss: 2.7465 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7785 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 17:29:36,777 - root - INFO - lr: 6.9266e-06 gnorm: 0.40 [1 day, 23:54:09<1 day, 1:27:18] +[titan] 2025-09-09 17:30:08,932 - root - INFO - step: 26125 loss: 3.2059 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.67 mfu: 49.11% global_avg_ntp_loss: 1.0409 global_avg_top_loss: 2.1650 +[titan] 2025-09-09 17:30:08,933 - root - INFO - lr: 6.9234e-06 gnorm: 0.39 [1 day, 23:54:41<1 day, 
1:26:45] +[titan] 2025-09-09 17:30:41,084 - root - INFO - step: 26130 loss: 2.7242 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7707 global_avg_top_loss: 1.9535 +[titan] 2025-09-09 17:30:41,084 - root - INFO - lr: 6.9202e-06 gnorm: 0.39 [1 day, 23:55:14<1 day, 1:26:11] +[titan] 2025-09-09 17:31:12,963 - root - INFO - step: 26135 loss: 2.6939 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.89 mfu: 49.53% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 17:31:12,963 - root - INFO - lr: 6.9170e-06 gnorm: 0.37 [1 day, 23:55:45<1 day, 1:25:38] +[titan] 2025-09-09 17:31:44,747 - root - INFO - step: 26140 loss: 2.7265 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7773 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 17:31:44,747 - root - INFO - lr: 6.9139e-06 gnorm: 0.38 [1 day, 23:56:17<1 day, 1:25:04] +[titan] 2025-09-09 17:32:16,788 - root - INFO - step: 26145 loss: 2.7751 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7924 global_avg_top_loss: 1.9827 +[titan] 2025-09-09 17:32:16,788 - root - INFO - lr: 6.9107e-06 gnorm: 0.36 [1 day, 23:56:49<1 day, 1:24:30] +[titan] 2025-09-09 17:32:42,414 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:32:48,788 - root - INFO - step: 26150 loss: 2.7013 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 17:32:48,788 - root - INFO - lr: 6.9075e-06 gnorm: 0.35 [1 day, 23:57:21<1 day, 1:23:57] +[titan] 2025-09-09 17:33:20,777 - root - INFO - step: 26155 loss: 3.1825 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 1.0251 global_avg_top_loss: 2.1575 +[titan] 2025-09-09 17:33:20,777 - root - INFO - lr: 6.9043e-06 gnorm: 0.51 [1 day, 23:57:53<1 day, 1:23:23] +[titan] 2025-09-09 17:33:52,528 - root - INFO - step: 26160 loss: 2.7225 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9513 +[titan] 2025-09-09 17:33:52,528 - root - INFO - lr: 6.9011e-06 gnorm: 0.37 [1 day, 23:58:25<1 day, 1:22:50] +[titan] 2025-09-09 17:34:24,606 - root - INFO - step: 26165 loss: 2.6600 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9150 +[titan] 2025-09-09 17:34:24,606 - root - INFO - lr: 6.8980e-06 gnorm: 0.37 [1 day, 23:58:57<1 day, 1:22:16] +[titan] 2025-09-09 17:34:56,580 - root - INFO - step: 26170 loss: 2.6813 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.39% global_avg_ntp_loss: 0.7488 global_avg_top_loss: 1.9324 +[titan] 2025-09-09 17:34:56,581 - root - INFO - lr: 6.8948e-06 gnorm: 0.38 [1 day, 23:59:29<1 day, 1:21:43] +[titan] 2025-09-09 17:35:28,770 - root - INFO - step: 26175 loss: 2.7274 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.16 mfu: 49.06% global_avg_ntp_loss: 0.7659 global_avg_top_loss: 1.9615 +[titan] 2025-09-09 17:35:28,770 - root - INFO - lr: 6.8916e-06 gnorm: 0.37 [2 days, 0:00:01<1 day, 1:21:09] +[titan] 2025-09-09 17:36:00,718 - root - INFO - step: 26180 loss: 2.6162 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7187 
global_avg_top_loss: 1.8974 +[titan] 2025-09-09 17:36:00,718 - root - INFO - lr: 6.8884e-06 gnorm: 0.37 [2 days, 0:00:33<1 day, 1:20:36] +[titan] 2025-09-09 17:36:32,747 - root - INFO - step: 26185 loss: 2.7548 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.7824 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 17:36:32,747 - root - INFO - lr: 6.8853e-06 gnorm: 0.42 [2 days, 0:01:05<1 day, 1:20:02] +[titan] 2025-09-09 17:37:04,587 - root - INFO - step: 26190 loss: 2.9840 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.8906 global_avg_top_loss: 2.0935 +[titan] 2025-09-09 17:37:04,588 - root - INFO - lr: 6.8821e-06 gnorm: 0.46 [2 days, 0:01:37<1 day, 1:19:28] +[titan] 2025-09-09 17:37:36,421 - root - INFO - step: 26195 loss: 2.7330 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9616 +[titan] 2025-09-09 17:37:36,421 - root - INFO - lr: 6.8789e-06 gnorm: 0.37 [2 days, 0:02:09<1 day, 1:18:55] +[titan] 2025-09-09 17:38:01,897 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:38:08,364 - root - INFO - step: 26200 loss: 2.8158 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.8180 global_avg_top_loss: 1.9978 +[titan] 2025-09-09 17:38:08,364 - root - INFO - lr: 6.8757e-06 gnorm: 0.46 [2 days, 0:02:41<1 day, 1:18:21] +[titan] 2025-09-09 17:38:40,477 - root - INFO - step: 26205 loss: 3.2506 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 1.0605 global_avg_top_loss: 2.1902 +[titan] 2025-09-09 17:38:40,478 - root - INFO - lr: 6.8726e-06 gnorm: 0.37 [2 days, 0:03:13<1 day, 1:17:48] +[titan] 2025-09-09 17:39:12,409 - root - INFO - step: 26210 loss: 2.7501 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7823 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 17:39:12,409 - root - INFO - lr: 6.8694e-06 gnorm: 0.40 [2 days, 0:03:45<1 day, 1:17:14] +[titan] 2025-09-09 17:39:44,557 - root - INFO - step: 26215 loss: 2.7479 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9665 +[titan] 2025-09-09 17:39:44,557 - root - INFO - lr: 6.8662e-06 gnorm: 0.36 [2 days, 0:04:17<1 day, 1:16:41] +[titan] 2025-09-09 17:40:16,476 - root - INFO - step: 26220 loss: 2.7353 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9600 +[titan] 2025-09-09 17:40:16,476 - root - INFO - lr: 6.8631e-06 gnorm: 0.41 [2 days, 0:04:49<1 day, 1:16:07] +[titan] 2025-09-09 17:40:48,348 - root - INFO - step: 26225 loss: 2.6104 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 0.7270 global_avg_top_loss: 1.8834 +[titan] 2025-09-09 17:40:48,348 - root - INFO - lr: 6.8599e-06 gnorm: 0.40 [2 days, 0:05:21<1 day, 1:15:34] +[titan] 2025-09-09 17:41:20,514 - root - INFO - step: 26230 loss: 2.7211 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.51 mfu: 49.09% global_avg_ntp_loss: 0.7662 
global_avg_top_loss: 1.9548 +[titan] 2025-09-09 17:41:20,514 - root - INFO - lr: 6.8567e-06 gnorm: 0.37 [2 days, 0:05:53<1 day, 1:15:00] +[titan] 2025-09-09 17:41:52,524 - root - INFO - step: 26235 loss: 3.1629 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 1.0223 global_avg_top_loss: 2.1405 +[titan] 2025-09-09 17:41:52,524 - root - INFO - lr: 6.8535e-06 gnorm: 0.39 [2 days, 0:06:25<1 day, 1:14:27] +[titan] 2025-09-09 17:42:24,723 - root - INFO - step: 26240 loss: 2.6700 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9224 +[titan] 2025-09-09 17:42:24,723 - root - INFO - lr: 6.8504e-06 gnorm: 0.36 [2 days, 0:06:57<1 day, 1:13:53] +[titan] 2025-09-09 17:42:56,548 - root - INFO - step: 26245 loss: 2.8124 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 2.0074 +[titan] 2025-09-09 17:42:56,549 - root - INFO - lr: 6.8472e-06 gnorm: 0.45 [2 days, 0:07:29<1 day, 1:13:20] +[titan] 2025-09-09 17:43:22,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:43:28,509 - root - INFO - step: 26250 loss: 2.7200 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9529 +[titan] 2025-09-09 17:43:28,509 - root - INFO - lr: 6.8440e-06 gnorm: 0.39 [2 days, 0:08:01<1 day, 1:12:46] +[titan] 2025-09-09 17:44:00,493 - root - INFO - step: 26255 loss: 2.7502 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9704 +[titan] 2025-09-09 17:44:00,493 - root - INFO - lr: 6.8409e-06 gnorm: 0.40 [2 days, 0:08:33<1 day, 1:12:12] +[titan] 2025-09-09 17:44:32,454 - root - INFO - step: 26260 loss: 2.7866 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 17:44:32,454 - root - INFO - lr: 6.8377e-06 gnorm: 0.38 [2 days, 0:09:05<1 day, 1:11:39] +[titan] 2025-09-09 17:45:04,432 - root - INFO - step: 26265 loss: 2.7524 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9748 +[titan] 2025-09-09 17:45:04,432 - root - INFO - lr: 6.8345e-06 gnorm: 0.44 [2 days, 0:09:37<1 day, 1:11:05] +[titan] 2025-09-09 17:45:36,549 - root - INFO - step: 26270 loss: 3.0611 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.9269 global_avg_top_loss: 2.1343 +[titan] 2025-09-09 17:45:36,550 - root - INFO - lr: 6.8314e-06 gnorm: 0.47 [2 days, 0:10:09<1 day, 1:10:32] +[titan] 2025-09-09 17:46:08,719 - root - INFO - step: 26275 loss: 2.7708 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9801 +[titan] 2025-09-09 17:46:08,719 - root - INFO - lr: 6.8282e-06 gnorm: 0.44 [2 days, 0:10:41<1 day, 1:09:58] +[titan] 2025-09-09 17:46:40,774 - root - INFO - step: 26280 loss: 2.7453 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.7787 
global_avg_top_loss: 1.9666 +[titan] 2025-09-09 17:46:40,775 - root - INFO - lr: 6.8251e-06 gnorm: 0.38 [2 days, 0:11:13<1 day, 1:09:25] +[titan] 2025-09-09 17:47:12,851 - root - INFO - step: 26285 loss: 3.2434 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 1.0574 global_avg_top_loss: 2.1860 +[titan] 2025-09-09 17:47:12,851 - root - INFO - lr: 6.8219e-06 gnorm: 0.36 [2 days, 0:11:45<1 day, 1:08:51] +[titan] 2025-09-09 17:47:44,915 - root - INFO - step: 26290 loss: 2.6759 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7502 global_avg_top_loss: 1.9257 +[titan] 2025-09-09 17:47:44,915 - root - INFO - lr: 6.8187e-06 gnorm: 0.37 [2 days, 0:12:17<1 day, 1:08:18] +[titan] 2025-09-09 17:48:16,759 - root - INFO - step: 26295 loss: 2.7558 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.7835 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 17:48:16,760 - root - INFO - lr: 6.8156e-06 gnorm: 0.37 [2 days, 0:12:49<1 day, 1:07:44] +[titan] 2025-09-09 17:48:42,599 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:48:48,993 - root - INFO - step: 26300 loss: 2.6701 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9216 +[titan] 2025-09-09 17:48:48,994 - root - INFO - lr: 6.8124e-06 gnorm: 0.39 [2 days, 0:13:21<1 day, 1:07:11] +[titan] 2025-09-09 17:49:21,002 - root - INFO - step: 26305 loss: 2.6700 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7552 global_avg_top_loss: 1.9149 +[titan] 2025-09-09 17:49:21,003 - root - INFO - lr: 6.8093e-06 gnorm: 0.36 [2 days, 0:13:53<1 day, 1:06:37] +[titan] 2025-09-09 17:49:53,125 - root - INFO - step: 26310 loss: 2.7345 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9608 +[titan] 2025-09-09 17:49:53,125 - root - INFO - lr: 6.8061e-06 gnorm: 0.38 [2 days, 0:14:26<1 day, 1:06:04] +[titan] 2025-09-09 17:50:25,098 - root - INFO - step: 26315 loss: 2.5116 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.6761 global_avg_top_loss: 1.8355 +[titan] 2025-09-09 17:50:25,099 - root - INFO - lr: 6.8029e-06 gnorm: 0.36 [2 days, 0:14:58<1 day, 1:05:30] +[titan] 2025-09-09 17:50:57,141 - root - INFO - step: 26320 loss: 2.6805 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7471 global_avg_top_loss: 1.9334 +[titan] 2025-09-09 17:50:57,141 - root - INFO - lr: 6.7998e-06 gnorm: 0.36 [2 days, 0:15:30<1 day, 1:04:57] +[titan] 2025-09-09 17:51:28,971 - root - INFO - step: 26325 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7710 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 17:51:28,971 - root - INFO - lr: 6.7966e-06 gnorm: 0.37 [2 days, 0:16:01<1 day, 1:04:23] +[titan] 2025-09-09 17:52:01,087 - root - INFO - step: 26330 loss: 2.8199 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.27 mfu: 49.17% global_avg_ntp_loss: 0.8124 
global_avg_top_loss: 2.0075 +[titan] 2025-09-09 17:52:01,087 - root - INFO - lr: 6.7935e-06 gnorm: 0.38 [2 days, 0:16:34<1 day, 1:03:50] +[titan] 2025-09-09 17:52:33,235 - root - INFO - step: 26335 loss: 2.7084 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.7606 global_avg_top_loss: 1.9478 +[titan] 2025-09-09 17:52:33,236 - root - INFO - lr: 6.7903e-06 gnorm: 0.36 [2 days, 0:17:06<1 day, 1:03:16] +[titan] 2025-09-09 17:53:05,067 - root - INFO - step: 26340 loss: 2.7015 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.7610 global_avg_top_loss: 1.9405 +[titan] 2025-09-09 17:53:05,068 - root - INFO - lr: 6.7872e-06 gnorm: 0.36 [2 days, 0:17:38<1 day, 1:02:43] +[titan] 2025-09-09 17:53:37,161 - root - INFO - step: 26345 loss: 2.6775 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.62 mfu: 49.20% global_avg_ntp_loss: 0.7480 global_avg_top_loss: 1.9296 +[titan] 2025-09-09 17:53:37,161 - root - INFO - lr: 6.7840e-06 gnorm: 0.37 [2 days, 0:18:10<1 day, 1:02:09] +[titan] 2025-09-09 17:54:02,817 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:54:09,188 - root - INFO - step: 26350 loss: 2.7072 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.7623 global_avg_top_loss: 1.9449 +[titan] 2025-09-09 17:54:09,188 - root - INFO - lr: 6.7808e-06 gnorm: 0.37 [2 days, 0:18:42<1 day, 1:01:36] +[titan] 2025-09-09 17:54:41,133 - root - INFO - step: 26355 loss: 2.7382 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.7775 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 17:54:41,133 - root - INFO - lr: 6.7777e-06 gnorm: 0.37 [2 days, 0:19:14<1 day, 1:01:02] +[titan] 2025-09-09 17:55:13,223 - root - INFO - step: 26360 loss: 2.7167 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7659 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 17:55:13,224 - root - INFO - lr: 6.7745e-06 gnorm: 0.39 [2 days, 0:19:46<1 day, 1:00:29] +[titan] 2025-09-09 17:55:45,333 - root - INFO - step: 26365 loss: 3.2179 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 1.0477 global_avg_top_loss: 2.1702 +[titan] 2025-09-09 17:55:45,334 - root - INFO - lr: 6.7714e-06 gnorm: 0.36 [2 days, 0:20:18<1 day, 0:59:55] +[titan] 2025-09-09 17:56:17,292 - root - INFO - step: 26370 loss: 2.7177 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7681 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 17:56:17,292 - root - INFO - lr: 6.7682e-06 gnorm: 0.37 [2 days, 0:20:50<1 day, 0:59:22] +[titan] 2025-09-09 17:56:49,398 - root - INFO - step: 26375 loss: 2.6157 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 0.7332 global_avg_top_loss: 1.8825 +[titan] 2025-09-09 17:56:49,398 - root - INFO - lr: 6.7651e-06 gnorm: 0.38 [2 days, 0:21:22<1 day, 0:58:48] +[titan] 2025-09-09 17:57:21,625 - root - INFO - step: 26380 loss: 2.6597 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.59 mfu: 49.00% global_avg_ntp_loss: 0.7400 
global_avg_top_loss: 1.9197 +[titan] 2025-09-09 17:57:21,626 - root - INFO - lr: 6.7619e-06 gnorm: 0.36 [2 days, 0:21:54<1 day, 0:58:15] +[titan] 2025-09-09 17:57:53,624 - root - INFO - step: 26385 loss: 2.6869 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7589 global_avg_top_loss: 1.9280 +[titan] 2025-09-09 17:57:53,624 - root - INFO - lr: 6.7588e-06 gnorm: 0.37 [2 days, 0:22:26<1 day, 0:57:41] +[titan] 2025-09-09 17:58:25,650 - root - INFO - step: 26390 loss: 2.8947 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.8609 global_avg_top_loss: 2.0338 +[titan] 2025-09-09 17:58:25,650 - root - INFO - lr: 6.7556e-06 gnorm: 0.36 [2 days, 0:22:58<1 day, 0:57:08] +[titan] 2025-09-09 17:58:57,689 - root - INFO - step: 26395 loss: 2.5015 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.6745 global_avg_top_loss: 1.8270 +[titan] 2025-09-09 17:58:57,689 - root - INFO - lr: 6.7525e-06 gnorm: 0.36 [2 days, 0:23:30<1 day, 0:56:34] +[titan] 2025-09-09 17:59:23,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:59:29,796 - root - INFO - step: 26400 loss: 2.7635 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 17:59:29,796 - root - INFO - lr: 6.7493e-06 gnorm: 0.38 [2 days, 0:24:02<1 day, 0:56:01] +[titan] 2025-09-09 18:00:01,816 - root - INFO - step: 26405 loss: 2.6588 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7360 global_avg_top_loss: 1.9228 +[titan] 2025-09-09 18:00:01,816 - root - INFO - lr: 6.7462e-06 gnorm: 0.37 [2 days, 0:24:34<1 day, 0:55:27] +[titan] 2025-09-09 18:00:34,082 - root - INFO - step: 26410 loss: 2.7292 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.01 mfu: 48.94% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9563 +[titan] 2025-09-09 18:00:34,082 - root - INFO - lr: 6.7431e-06 gnorm: 0.43 [2 days, 0:25:07<1 day, 0:54:54] +[titan] 2025-09-09 18:01:06,220 - root - INFO - step: 26415 loss: 2.7316 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.95 mfu: 49.14% global_avg_ntp_loss: 0.7731 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 18:01:06,220 - root - INFO - lr: 6.7399e-06 gnorm: 0.36 [2 days, 0:25:39<1 day, 0:54:21] +[titan] 2025-09-09 18:01:38,095 - root - INFO - step: 26420 loss: 2.7657 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9851 +[titan] 2025-09-09 18:01:38,095 - root - INFO - lr: 6.7368e-06 gnorm: 0.39 [2 days, 0:26:11<1 day, 0:53:47] +[titan] 2025-09-09 18:02:10,201 - root - INFO - step: 26425 loss: 2.7568 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.7904 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 18:02:10,202 - root - INFO - lr: 6.7336e-06 gnorm: 0.37 [2 days, 0:26:43<1 day, 0:53:14] +[titan] 2025-09-09 18:02:42,082 - root - INFO - step: 26430 loss: 2.8021 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7979 
global_avg_top_loss: 2.0042 +[titan] 2025-09-09 18:02:42,082 - root - INFO - lr: 6.7305e-06 gnorm: 1.15 [2 days, 0:27:15<1 day, 0:52:40] +[titan] 2025-09-09 18:03:13,873 - root - INFO - step: 26435 loss: 2.8537 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.8392 global_avg_top_loss: 2.0145 +[titan] 2025-09-09 18:03:13,873 - root - INFO - lr: 6.7273e-06 gnorm: 0.38 [2 days, 0:27:46<1 day, 0:52:06] +[titan] 2025-09-09 18:03:45,895 - root - INFO - step: 26440 loss: 2.6853 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 18:03:45,896 - root - INFO - lr: 6.7242e-06 gnorm: 0.38 [2 days, 0:28:18<1 day, 0:51:33] +[titan] 2025-09-09 18:04:17,896 - root - INFO - step: 26445 loss: 2.7965 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9930 +[titan] 2025-09-09 18:04:17,896 - root - INFO - lr: 6.7211e-06 gnorm: 0.37 [2 days, 0:28:50<1 day, 0:50:59] +[titan] 2025-09-09 18:04:43,413 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:04:49,848 - root - INFO - step: 26450 loss: 2.7367 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.7727 global_avg_top_loss: 1.9640 +[titan] 2025-09-09 18:04:49,848 - root - INFO - lr: 6.7179e-06 gnorm: 0.36 [2 days, 0:29:22<1 day, 0:50:26] +[titan] 2025-09-09 18:05:21,849 - root - INFO - step: 26455 loss: 2.6576 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7378 global_avg_top_loss: 1.9199 +[titan] 2025-09-09 18:05:21,849 - root - INFO - lr: 6.7148e-06 gnorm: 0.37 [2 days, 0:29:54<1 day, 0:49:52] +[titan] 2025-09-09 18:05:53,703 - root - INFO - step: 26460 loss: 2.7638 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.28 mfu: 49.57% global_avg_ntp_loss: 0.7850 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 18:05:53,703 - root - INFO - lr: 6.7116e-06 gnorm: 0.79 [2 days, 0:30:26<1 day, 0:49:19] +[titan] 2025-09-09 18:06:25,661 - root - INFO - step: 26465 loss: 2.7646 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 18:06:25,662 - root - INFO - lr: 6.7085e-06 gnorm: 0.37 [2 days, 0:30:58<1 day, 0:48:45] +[titan] 2025-09-09 18:06:57,520 - root - INFO - step: 26470 loss: 2.6511 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.7346 global_avg_top_loss: 1.9166 +[titan] 2025-09-09 18:06:57,520 - root - INFO - lr: 6.7054e-06 gnorm: 0.36 [2 days, 0:31:30<1 day, 0:48:12] +[titan] 2025-09-09 18:07:29,449 - root - INFO - step: 26475 loss: 2.7796 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9842 +[titan] 2025-09-09 18:07:29,449 - root - INFO - lr: 6.7022e-06 gnorm: 0.39 [2 days, 0:32:02<1 day, 0:47:38] +[titan] 2025-09-09 18:08:01,415 - root - INFO - step: 26480 loss: 2.6602 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.7415 
global_avg_top_loss: 1.9188 +[titan] 2025-09-09 18:08:01,415 - root - INFO - lr: 6.6991e-06 gnorm: 0.40 [2 days, 0:32:34<1 day, 0:47:05] +[titan] 2025-09-09 18:08:33,352 - root - INFO - step: 26485 loss: 2.7112 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.7618 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 18:08:33,352 - root - INFO - lr: 6.6959e-06 gnorm: 0.39 [2 days, 0:33:06<1 day, 0:46:31] +[titan] 2025-09-09 18:09:05,385 - root - INFO - step: 26490 loss: 2.7449 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.7773 global_avg_top_loss: 1.9675 +[titan] 2025-09-09 18:09:05,385 - root - INFO - lr: 6.6928e-06 gnorm: 0.36 [2 days, 0:33:38<1 day, 0:45:58] +[titan] 2025-09-09 18:09:37,198 - root - INFO - step: 26495 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9619 +[titan] 2025-09-09 18:09:37,198 - root - INFO - lr: 6.6897e-06 gnorm: 0.36 [2 days, 0:34:10<1 day, 0:45:24] +[titan] 2025-09-09 18:10:02,774 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:10:09,220 - root - INFO - step: 26500 loss: 2.7651 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9770 +[titan] 2025-09-09 18:10:09,221 - root - INFO - lr: 6.6865e-06 gnorm: 0.37 [2 days, 0:34:42<1 day, 0:44:50] +[titan] 2025-09-09 18:10:41,146 - root - INFO - step: 26505 loss: 2.7141 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 18:10:41,146 - root - INFO - lr: 6.6834e-06 gnorm: 0.39 [2 days, 0:35:14<1 day, 0:44:17] +[titan] 2025-09-09 18:11:13,150 - root - INFO - step: 26510 loss: 2.7420 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9641 +[titan] 2025-09-09 18:11:13,150 - root - INFO - lr: 6.6803e-06 gnorm: 0.37 [2 days, 0:35:46<1 day, 0:43:43] +[titan] 2025-09-09 18:11:45,191 - root - INFO - step: 26515 loss: 3.1672 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 1.0187 global_avg_top_loss: 2.1484 +[titan] 2025-09-09 18:11:45,191 - root - INFO - lr: 6.6771e-06 gnorm: 0.41 [2 days, 0:36:18<1 day, 0:43:10] +[titan] 2025-09-09 18:12:17,092 - root - INFO - step: 26520 loss: 2.6628 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7401 global_avg_top_loss: 1.9227 +[titan] 2025-09-09 18:12:17,092 - root - INFO - lr: 6.6740e-06 gnorm: 0.37 [2 days, 0:36:50<1 day, 0:42:36] +[titan] 2025-09-09 18:12:49,049 - root - INFO - step: 26525 loss: 2.6877 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 18:12:49,049 - root - INFO - lr: 6.6709e-06 gnorm: 0.39 [2 days, 0:37:21<1 day, 0:42:03] +[titan] 2025-09-09 18:13:20,965 - root - INFO - step: 26530 loss: 2.7931 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.8094 
global_avg_top_loss: 1.9837 +[titan] 2025-09-09 18:13:20,965 - root - INFO - lr: 6.6678e-06 gnorm: 0.38 [2 days, 0:37:53<1 day, 0:41:29] +[titan] 2025-09-09 18:13:52,774 - root - INFO - step: 26535 loss: 2.5594 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.6996 global_avg_top_loss: 1.8598 +[titan] 2025-09-09 18:13:52,774 - root - INFO - lr: 6.6646e-06 gnorm: 0.39 [2 days, 0:38:25<1 day, 0:40:56] +[titan] 2025-09-09 18:14:24,706 - root - INFO - step: 26540 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9688 +[titan] 2025-09-09 18:14:24,706 - root - INFO - lr: 6.6615e-06 gnorm: 0.36 [2 days, 0:38:57<1 day, 0:40:22] +[titan] 2025-09-09 18:14:56,756 - root - INFO - step: 26545 loss: 2.7816 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 18:14:56,756 - root - INFO - lr: 6.6584e-06 gnorm: 0.36 [2 days, 0:39:29<1 day, 0:39:49] +[titan] 2025-09-09 18:15:22,337 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:15:28,768 - root - INFO - step: 26550 loss: 2.7103 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9497 +[titan] 2025-09-09 18:15:28,768 - root - INFO - lr: 6.6552e-06 gnorm: 0.39 [2 days, 0:40:01<1 day, 0:39:15] +[titan] 2025-09-09 18:16:00,542 - root - INFO - step: 26555 loss: 2.7149 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.50 mfu: 49.70% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9536 +[titan] 2025-09-09 18:16:00,542 - root - INFO - lr: 6.6521e-06 gnorm: 0.37 [2 days, 0:40:33<1 day, 0:38:42] +[titan] 2025-09-09 18:16:32,527 - root - INFO - step: 26560 loss: 2.6688 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.7442 global_avg_top_loss: 1.9246 +[titan] 2025-09-09 18:16:32,527 - root - INFO - lr: 6.6490e-06 gnorm: 0.35 [2 days, 0:41:05<1 day, 0:38:08] +[titan] 2025-09-09 18:17:04,428 - root - INFO - step: 26565 loss: 3.0258 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.9468 global_avg_top_loss: 2.0790 +[titan] 2025-09-09 18:17:04,428 - root - INFO - lr: 6.6459e-06 gnorm: 0.36 [2 days, 0:41:37<1 day, 0:37:34] +[titan] 2025-09-09 18:17:36,435 - root - INFO - step: 26570 loss: 2.7843 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.8151 global_avg_top_loss: 1.9691 +[titan] 2025-09-09 18:17:36,435 - root - INFO - lr: 6.6427e-06 gnorm: 0.52 [2 days, 0:42:09<1 day, 0:37:01] +[titan] 2025-09-09 18:18:08,371 - root - INFO - step: 26575 loss: 2.7543 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 18:18:08,371 - root - INFO - lr: 6.6396e-06 gnorm: 0.36 [2 days, 0:42:41<1 day, 0:36:27] +[titan] 2025-09-09 18:18:40,351 - root - INFO - step: 26580 loss: 2.5754 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7003 
global_avg_top_loss: 1.8751 +[titan] 2025-09-09 18:18:40,351 - root - INFO - lr: 6.6365e-06 gnorm: 0.38 [2 days, 0:43:13<1 day, 0:35:54] +[titan] 2025-09-09 18:19:12,026 - root - INFO - step: 26585 loss: 2.7384 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.04 mfu: 49.85% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 18:19:12,027 - root - INFO - lr: 6.6334e-06 gnorm: 0.37 [2 days, 0:43:44<1 day, 0:35:20] +[titan] 2025-09-09 18:19:43,955 - root - INFO - step: 26590 loss: 2.6997 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7598 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 18:19:43,955 - root - INFO - lr: 6.6303e-06 gnorm: 0.37 [2 days, 0:44:16<1 day, 0:34:47] +[titan] 2025-09-09 18:20:15,941 - root - INFO - step: 26595 loss: 2.7426 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7755 global_avg_top_loss: 1.9671 +[titan] 2025-09-09 18:20:15,941 - root - INFO - lr: 6.6271e-06 gnorm: 0.38 [2 days, 0:44:48<1 day, 0:34:13] +[titan] 2025-09-09 18:20:41,522 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:20:47,896 - root - INFO - step: 26600 loss: 2.7555 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 18:20:47,896 - root - INFO - lr: 6.6240e-06 gnorm: 0.37 [2 days, 0:45:20<1 day, 0:33:40] +[titan] 2025-09-09 18:21:19,953 - root - INFO - step: 26605 loss: 2.7358 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.7759 global_avg_top_loss: 1.9599 +[titan] 2025-09-09 18:21:19,953 - root - INFO - lr: 6.6209e-06 gnorm: 0.36 [2 days, 0:45:52<1 day, 0:33:06] +[titan] 2025-09-09 18:21:51,968 - root - INFO - step: 26610 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9432 +[titan] 2025-09-09 18:21:51,968 - root - INFO - lr: 6.6178e-06 gnorm: 0.36 [2 days, 0:46:24<1 day, 0:32:33] +[titan] 2025-09-09 18:22:23,977 - root - INFO - step: 26615 loss: 2.6111 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.8934 +[titan] 2025-09-09 18:22:23,977 - root - INFO - lr: 6.6147e-06 gnorm: 0.36 [2 days, 0:46:56<1 day, 0:31:59] +[titan] 2025-09-09 18:22:55,882 - root - INFO - step: 26620 loss: 2.7573 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7826 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 18:22:55,882 - root - INFO - lr: 6.6115e-06 gnorm: 0.37 [2 days, 0:47:28<1 day, 0:31:26] +[titan] 2025-09-09 18:23:21,665 - root - INFO - Dumping profiler traces at step 26624 +[titan] 2025-09-09 18:23:21,725 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 18:23:28,076 - root - INFO - step: 26625 loss: 2.6706 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.7474 global_avg_top_loss: 1.9232 +[titan] 2025-09-09 18:23:28,076 - root - INFO - lr: 6.6084e-06 gnorm: 0.36 [2 days, 0:48:01<1 day, 
0:30:52] +[titan] 2025-09-09 18:23:59,880 - root - INFO - step: 26630 loss: 2.6770 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.04 mfu: 49.65% global_avg_ntp_loss: 0.7498 global_avg_top_loss: 1.9273 +[titan] 2025-09-09 18:23:59,880 - root - INFO - lr: 6.6053e-06 gnorm: 0.36 [2 days, 0:48:32<1 day, 0:30:19] +[titan] 2025-09-09 18:24:31,904 - root - INFO - step: 26635 loss: 2.7035 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 18:24:31,904 - root - INFO - lr: 6.6022e-06 gnorm: 0.35 [2 days, 0:49:04<1 day, 0:29:45] +[titan] 2025-09-09 18:25:03,946 - root - INFO - step: 26640 loss: 2.7697 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9778 +[titan] 2025-09-09 18:25:03,947 - root - INFO - lr: 6.5991e-06 gnorm: 0.36 [2 days, 0:49:36<1 day, 0:29:12] +[titan] 2025-09-09 18:25:35,713 - root - INFO - step: 26645 loss: 2.6643 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.62 mfu: 49.71% global_avg_ntp_loss: 0.7427 global_avg_top_loss: 1.9217 +[titan] 2025-09-09 18:25:35,713 - root - INFO - lr: 6.5960e-06 gnorm: 0.36 [2 days, 0:50:08<1 day, 0:28:38] +[titan] 2025-09-09 18:26:01,148 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:26:07,551 - root - INFO - step: 26650 loss: 2.7395 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.52 mfu: 49.60% global_avg_ntp_loss: 0.7801 global_avg_top_loss: 1.9594 +[titan] 2025-09-09 18:26:07,551 - root - INFO - lr: 6.5929e-06 gnorm: 0.37 [2 days, 0:50:40<1 day, 0:28:05] +[titan] 2025-09-09 18:26:39,576 - root - INFO - step: 26655 loss: 2.7005 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9450 +[titan] 2025-09-09 18:26:39,577 - root - INFO - lr: 6.5897e-06 gnorm: 0.36 [2 days, 0:51:12<1 day, 0:27:31] +[titan] 2025-09-09 18:27:11,661 - root - INFO - step: 26660 loss: 2.5784 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8744 +[titan] 2025-09-09 18:27:11,661 - root - INFO - lr: 6.5866e-06 gnorm: 0.57 [2 days, 0:51:44<1 day, 0:26:58] +[titan] 2025-09-09 18:27:43,856 - root - INFO - step: 26665 loss: 2.7274 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.07 mfu: 49.05% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 18:27:43,857 - root - INFO - lr: 6.5835e-06 gnorm: 0.50 [2 days, 0:52:16<1 day, 0:26:24] +[titan] 2025-09-09 18:28:15,625 - root - INFO - step: 26670 loss: 2.7489 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.59 mfu: 49.71% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 18:28:15,625 - root - INFO - lr: 6.5804e-06 gnorm: 0.37 [2 days, 0:52:48<1 day, 0:25:51] +[titan] 2025-09-09 18:28:47,736 - root - INFO - step: 26675 loss: 2.6154 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.8938 +[titan] 2025-09-09 18:28:47,736 - root - INFO - lr: 6.5773e-06 gnorm: 0.37 [2 days, 0:53:20<1 day, 0:25:17] +[titan] 2025-09-09 18:29:19,704 - root - INFO - step: 26680 loss: 2.6659 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.7406 
global_avg_top_loss: 1.9253 +[titan] 2025-09-09 18:29:19,704 - root - INFO - lr: 6.5742e-06 gnorm: 0.37 [2 days, 0:53:52<1 day, 0:24:44] +[titan] 2025-09-09 18:29:51,640 - root - INFO - step: 26685 loss: 2.7422 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 18:29:51,641 - root - INFO - lr: 6.5711e-06 gnorm: 0.37 [2 days, 0:54:24<1 day, 0:24:10] +[titan] 2025-09-09 18:30:23,887 - root - INFO - step: 26690 loss: 2.8764 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.8292 global_avg_top_loss: 2.0471 +[titan] 2025-09-09 18:30:23,887 - root - INFO - lr: 6.5680e-06 gnorm: 0.42 [2 days, 0:54:56<1 day, 0:23:37] +[titan] 2025-09-09 18:30:55,929 - root - INFO - step: 26695 loss: 2.6844 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7501 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 18:30:55,929 - root - INFO - lr: 6.5649e-06 gnorm: 0.40 [2 days, 0:55:28<1 day, 0:23:04] +[titan] 2025-09-09 18:31:21,542 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:31:27,901 - root - INFO - step: 26700 loss: 2.7301 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 18:31:27,901 - root - INFO - lr: 6.5618e-06 gnorm: 0.39 [2 days, 0:56:00<1 day, 0:22:30] +[titan] 2025-09-09 18:31:59,878 - root - INFO - step: 26705 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 18:31:59,878 - root - INFO - lr: 6.5587e-06 gnorm: 0.39 [2 days, 0:56:32<1 day, 0:21:57] +[titan] 2025-09-09 18:32:31,845 - root - INFO - step: 26710 loss: 2.7364 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7807 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 18:32:31,845 - root - INFO - lr: 6.5556e-06 gnorm: 0.38 [2 days, 0:57:04<1 day, 0:21:23] +[titan] 2025-09-09 18:33:03,926 - root - INFO - step: 26715 loss: 2.6721 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7460 global_avg_top_loss: 1.9261 +[titan] 2025-09-09 18:33:03,927 - root - INFO - lr: 6.5525e-06 gnorm: 0.43 [2 days, 0:57:36<1 day, 0:20:50] +[titan] 2025-09-09 18:33:35,894 - root - INFO - step: 26720 loss: 2.6865 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7515 global_avg_top_loss: 1.9351 +[titan] 2025-09-09 18:33:35,894 - root - INFO - lr: 6.5493e-06 gnorm: 0.39 [2 days, 0:58:08<1 day, 0:20:16] +[titan] 2025-09-09 18:34:07,755 - root - INFO - step: 26725 loss: 2.6890 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9345 +[titan] 2025-09-09 18:34:07,755 - root - INFO - lr: 6.5462e-06 gnorm: 0.39 [2 days, 0:58:40<1 day, 0:19:43] +[titan] 2025-09-09 18:34:39,712 - root - INFO - step: 26730 loss: 2.7492 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7823 
global_avg_top_loss: 1.9668 +[titan] 2025-09-09 18:34:39,712 - root - INFO - lr: 6.5431e-06 gnorm: 0.42 [2 days, 0:59:12<1 day, 0:19:09] +[titan] 2025-09-09 18:35:11,503 - root - INFO - step: 26735 loss: 2.7350 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 18:35:11,503 - root - INFO - lr: 6.5400e-06 gnorm: 0.35 [2 days, 0:59:44<1 day, 0:18:35] +[titan] 2025-09-09 18:35:43,529 - root - INFO - step: 26740 loss: 2.6251 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.8956 +[titan] 2025-09-09 18:35:43,529 - root - INFO - lr: 6.5369e-06 gnorm: 0.51 [2 days, 1:00:16<1 day, 0:18:02] +[titan] 2025-09-09 18:36:15,427 - root - INFO - step: 26745 loss: 2.7441 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9669 +[titan] 2025-09-09 18:36:15,427 - root - INFO - lr: 6.5338e-06 gnorm: 0.44 [2 days, 1:00:48<1 day, 0:17:28] +[titan] 2025-09-09 18:36:40,904 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:36:47,248 - root - INFO - step: 26750 loss: 2.7808 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9824 +[titan] 2025-09-09 18:36:47,249 - root - INFO - lr: 6.5307e-06 gnorm: 0.39 [2 days, 1:01:20<1 day, 0:16:55] +[titan] 2025-09-09 18:37:19,394 - root - INFO - step: 26755 loss: 2.7120 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7616 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 18:37:19,394 - root - INFO - lr: 6.5276e-06 gnorm: 0.37 [2 days, 1:01:52<1 day, 0:16:22] +[titan] 2025-09-09 18:37:51,277 - root - INFO - step: 26760 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 18:37:51,277 - root - INFO - lr: 6.5245e-06 gnorm: 0.36 [2 days, 1:02:24<1 day, 0:15:48] +[titan] 2025-09-09 18:38:23,336 - root - INFO - step: 26765 loss: 2.9793 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.9055 global_avg_top_loss: 2.0738 +[titan] 2025-09-09 18:38:23,337 - root - INFO - lr: 6.5215e-06 gnorm: 0.38 [2 days, 1:02:56<1 day, 0:15:15] +[titan] 2025-09-09 18:38:55,033 - root - INFO - step: 26770 loss: 2.6750 memory: 122.03GiB(87.57%) tps: 10,338 tflops: 492.70 mfu: 49.82% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9288 +[titan] 2025-09-09 18:38:55,034 - root - INFO - lr: 6.5184e-06 gnorm: 0.48 [2 days, 1:03:27<1 day, 0:14:41] +[titan] 2025-09-09 18:39:27,104 - root - INFO - step: 26775 loss: 2.8126 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.8093 global_avg_top_loss: 2.0033 +[titan] 2025-09-09 18:39:27,104 - root - INFO - lr: 6.5153e-06 gnorm: 0.38 [2 days, 1:04:00<1 day, 0:14:07] +[titan] 2025-09-09 18:39:59,117 - root - INFO - step: 26780 loss: 2.7232 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7691 
global_avg_top_loss: 1.9541 +[titan] 2025-09-09 18:39:59,118 - root - INFO - lr: 6.5122e-06 gnorm: 0.38 [2 days, 1:04:32<1 day, 0:13:34] +[titan] 2025-09-09 18:40:30,998 - root - INFO - step: 26785 loss: 2.6975 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.86 mfu: 49.53% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 18:40:30,998 - root - INFO - lr: 6.5091e-06 gnorm: 0.37 [2 days, 1:05:03<1 day, 0:13:00] +[titan] 2025-09-09 18:41:02,989 - root - INFO - step: 26790 loss: 2.8991 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8509 global_avg_top_loss: 2.0482 +[titan] 2025-09-09 18:41:02,989 - root - INFO - lr: 6.5060e-06 gnorm: 0.41 [2 days, 1:05:35<1 day, 0:12:27] +[titan] 2025-09-09 18:41:34,968 - root - INFO - step: 26795 loss: 2.7226 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7696 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 18:41:34,969 - root - INFO - lr: 6.5029e-06 gnorm: 0.37 [2 days, 1:06:07<1 day, 0:11:54] +[titan] 2025-09-09 18:42:00,508 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:42:06,982 - root - INFO - step: 26800 loss: 2.7583 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 18:42:06,982 - root - INFO - lr: 6.4998e-06 gnorm: 0.45 [2 days, 1:06:39<1 day, 0:11:20] +[titan] 2025-09-09 18:42:38,835 - root - INFO - step: 26805 loss: 2.6549 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.7381 global_avg_top_loss: 1.9168 +[titan] 2025-09-09 18:42:38,835 - root - INFO - lr: 6.4967e-06 gnorm: 0.35 [2 days, 1:07:11<1 day, 0:10:46] +[titan] 2025-09-09 18:43:10,861 - root - INFO - step: 26810 loss: 2.7149 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.31% global_avg_ntp_loss: 0.7648 global_avg_top_loss: 1.9501 +[titan] 2025-09-09 18:43:10,862 - root - INFO - lr: 6.4936e-06 gnorm: 0.35 [2 days, 1:07:43<1 day, 0:10:13] +[titan] 2025-09-09 18:43:43,061 - root - INFO - step: 26815 loss: 2.7017 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.01 mfu: 49.04% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9426 +[titan] 2025-09-09 18:43:43,061 - root - INFO - lr: 6.4905e-06 gnorm: 0.36 [2 days, 1:08:15<1 day, 0:09:40] +[titan] 2025-09-09 18:44:14,871 - root - INFO - step: 26820 loss: 2.6268 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.9046 +[titan] 2025-09-09 18:44:14,872 - root - INFO - lr: 6.4874e-06 gnorm: 0.55 [2 days, 1:08:47<1 day, 0:09:06] +[titan] 2025-09-09 18:44:46,895 - root - INFO - step: 26825 loss: 2.6928 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.7572 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 18:44:46,895 - root - INFO - lr: 6.4843e-06 gnorm: 0.38 [2 days, 1:09:19<1 day, 0:08:33] +[titan] 2025-09-09 18:45:18,819 - root - INFO - step: 26830 loss: 2.6896 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.7526 
global_avg_top_loss: 1.9371 +[titan] 2025-09-09 18:45:18,819 - root - INFO - lr: 6.4813e-06 gnorm: 0.37 [2 days, 1:09:51<1 day, 0:07:59] +[titan] 2025-09-09 18:45:50,797 - root - INFO - step: 26835 loss: 2.6914 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9359 +[titan] 2025-09-09 18:45:50,797 - root - INFO - lr: 6.4782e-06 gnorm: 0.36 [2 days, 1:10:23<1 day, 0:07:26] +[titan] 2025-09-09 18:46:22,809 - root - INFO - step: 26840 loss: 2.7312 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7715 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 18:46:22,809 - root - INFO - lr: 6.4751e-06 gnorm: 0.37 [2 days, 1:10:55<1 day, 0:06:52] +[titan] 2025-09-09 18:46:54,844 - root - INFO - step: 26845 loss: 2.7830 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7965 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 18:46:54,845 - root - INFO - lr: 6.4720e-06 gnorm: 0.37 [2 days, 1:11:27<1 day, 0:06:19] +[titan] 2025-09-09 18:47:20,437 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:47:26,952 - root - INFO - step: 26850 loss: 2.8378 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.8144 global_avg_top_loss: 2.0234 +[titan] 2025-09-09 18:47:26,952 - root - INFO - lr: 6.4689e-06 gnorm: 1.00 [2 days, 1:11:59<1 day, 0:05:45] +[titan] 2025-09-09 18:47:58,977 - root - INFO - step: 26855 loss: 3.2085 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 1.0421 global_avg_top_loss: 2.1664 +[titan] 2025-09-09 18:47:58,978 - root - INFO - lr: 6.4658e-06 gnorm: 0.42 [2 days, 1:12:31<1 day, 0:05:12] +[titan] 2025-09-09 18:48:30,675 - root - INFO - step: 26860 loss: 2.6858 memory: 122.03GiB(87.57%) tps: 10,338 tflops: 492.70 mfu: 49.82% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 18:48:30,675 - root - INFO - lr: 6.4627e-06 gnorm: 0.36 [2 days, 1:13:03<1 day, 0:04:38] +[titan] 2025-09-09 18:49:02,689 - root - INFO - step: 26865 loss: 2.5476 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.6916 global_avg_top_loss: 1.8560 +[titan] 2025-09-09 18:49:02,689 - root - INFO - lr: 6.4597e-06 gnorm: 0.35 [2 days, 1:13:35<1 day, 0:04:05] +[titan] 2025-09-09 18:49:34,519 - root - INFO - step: 26870 loss: 2.6289 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.64 mfu: 49.61% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.8976 +[titan] 2025-09-09 18:49:34,520 - root - INFO - lr: 6.4566e-06 gnorm: 0.36 [2 days, 1:14:07<1 day, 0:03:31] +[titan] 2025-09-09 18:50:06,555 - root - INFO - step: 26875 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 18:50:06,556 - root - INFO - lr: 6.4535e-06 gnorm: 0.38 [2 days, 1:14:39<1 day, 0:02:58] +[titan] 2025-09-09 18:50:38,478 - root - INFO - step: 26880 loss: 2.8484 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.8308 
global_avg_top_loss: 2.0176 +[titan] 2025-09-09 18:50:38,478 - root - INFO - lr: 6.4504e-06 gnorm: 0.37 [2 days, 1:15:11<1 day, 0:02:24] +[titan] 2025-09-09 18:51:10,479 - root - INFO - step: 26885 loss: 2.6820 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9331 +[titan] 2025-09-09 18:51:10,479 - root - INFO - lr: 6.4473e-06 gnorm: 0.38 [2 days, 1:15:43<1 day, 0:01:51] +[titan] 2025-09-09 18:51:42,514 - root - INFO - step: 26890 loss: 2.7534 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9691 +[titan] 2025-09-09 18:51:42,514 - root - INFO - lr: 6.4443e-06 gnorm: 0.37 [2 days, 1:16:15<1 day, 0:01:17] +[titan] 2025-09-09 18:52:14,305 - root - INFO - step: 26895 loss: 2.6009 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.7154 global_avg_top_loss: 1.8856 +[titan] 2025-09-09 18:52:14,305 - root - INFO - lr: 6.4412e-06 gnorm: 0.35 [2 days, 1:16:47<1 day, 0:00:44] +[titan] 2025-09-09 18:52:40,050 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:52:46,410 - root - INFO - step: 26900 loss: 2.8136 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.19% global_avg_ntp_loss: 0.8116 global_avg_top_loss: 2.0020 +[titan] 2025-09-09 18:52:46,410 - root - INFO - lr: 6.4381e-06 gnorm: 0.38 [2 days, 1:17:19<1 day, 0:00:10] +[titan] 2025-09-09 18:53:18,399 - root - INFO - step: 26905 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7787 global_avg_top_loss: 1.9605 +[titan] 2025-09-09 18:53:18,400 - root - INFO - lr: 6.4350e-06 gnorm: 0.38 [2 days, 1:17:51<23:59:37] +[titan] 2025-09-09 18:53:50,344 - root - INFO - step: 26910 loss: 2.7703 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7903 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 18:53:50,344 - root - INFO - lr: 6.4320e-06 gnorm: 0.37 [2 days, 1:18:23<23:59:04] +[titan] 2025-09-09 18:54:22,055 - root - INFO - step: 26915 loss: 2.6668 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.49 mfu: 49.80% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9232 +[titan] 2025-09-09 18:54:22,055 - root - INFO - lr: 6.4289e-06 gnorm: 0.39 [2 days, 1:18:54<23:58:30] +[titan] 2025-09-09 18:54:53,974 - root - INFO - step: 26920 loss: 2.7720 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7904 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 18:54:53,974 - root - INFO - lr: 6.4258e-06 gnorm: 0.37 [2 days, 1:19:26<23:57:56] +[titan] 2025-09-09 18:55:26,065 - root - INFO - step: 26925 loss: 2.7029 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9431 +[titan] 2025-09-09 18:55:26,065 - root - INFO - lr: 6.4227e-06 gnorm: 0.40 [2 days, 1:19:58<23:57:23] +[titan] 2025-09-09 18:55:58,131 - root - INFO - step: 26930 loss: 2.7287 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9585 +[titan] 
2025-09-09 18:55:58,131 - root - INFO - lr: 6.4197e-06 gnorm: 0.39 [2 days, 1:20:31<23:56:50] +[titan] 2025-09-09 18:56:30,182 - root - INFO - step: 26935 loss: 2.8694 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.8479 global_avg_top_loss: 2.0215 +[titan] 2025-09-09 18:56:30,182 - root - INFO - lr: 6.4166e-06 gnorm: 0.38 [2 days, 1:21:03<23:56:16] +[titan] 2025-09-09 18:57:02,151 - root - INFO - step: 26940 loss: 2.7370 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 18:57:02,151 - root - INFO - lr: 6.4135e-06 gnorm: 0.37 [2 days, 1:21:35<23:55:43] +[titan] 2025-09-09 18:57:34,348 - root - INFO - step: 26945 loss: 2.7522 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.05 mfu: 49.04% global_avg_ntp_loss: 0.7980 global_avg_top_loss: 1.9542 +[titan] 2025-09-09 18:57:34,348 - root - INFO - lr: 6.4104e-06 gnorm: 0.38 [2 days, 1:22:07<23:55:09] +[titan] 2025-09-09 18:57:59,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:58:06,251 - root - INFO - step: 26950 loss: 3.0763 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.9866 global_avg_top_loss: 2.0897 +[titan] 2025-09-09 18:58:06,251 - root - INFO - lr: 6.4074e-06 gnorm: 0.43 [2 days, 1:22:39<23:54:36] +[titan] 2025-09-09 18:58:38,209 - root - INFO - step: 26955 loss: 2.6407 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 18:58:38,209 - root - INFO - lr: 6.4043e-06 gnorm: 0.36 [2 days, 1:23:11<23:54:02] +[titan] 2025-09-09 18:59:10,199 - root - INFO - step: 26960 loss: 2.6080 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.8912 +[titan] 2025-09-09 18:59:10,199 - root - INFO - lr: 6.4012e-06 gnorm: 0.37 [2 days, 1:23:43<23:53:29] +[titan] 2025-09-09 18:59:42,158 - root - INFO - step: 26965 loss: 2.6799 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7459 global_avg_top_loss: 1.9340 +[titan] 2025-09-09 18:59:42,158 - root - INFO - lr: 6.3982e-06 gnorm: 0.36 [2 days, 1:24:15<23:52:55] +[titan] 2025-09-09 19:00:14,176 - root - INFO - step: 26970 loss: 2.6982 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 19:00:14,177 - root - INFO - lr: 6.3951e-06 gnorm: 0.36 [2 days, 1:24:47<23:52:22] +[titan] 2025-09-09 19:00:46,237 - root - INFO - step: 26975 loss: 2.6967 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7569 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 19:00:46,238 - root - INFO - lr: 6.3920e-06 gnorm: 0.37 [2 days, 1:25:19<23:51:49] +[titan] 2025-09-09 19:01:18,132 - root - INFO - step: 26980 loss: 2.8244 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.8155 global_avg_top_loss: 2.0090 +[titan] 
2025-09-09 19:01:18,133 - root - INFO - lr: 6.3890e-06 gnorm: 0.37 [2 days, 1:25:51<23:51:15] +[titan] 2025-09-09 19:01:49,980 - root - INFO - step: 26985 loss: 2.7032 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.7608 global_avg_top_loss: 1.9424 +[titan] 2025-09-09 19:01:49,980 - root - INFO - lr: 6.3859e-06 gnorm: 0.37 [2 days, 1:26:22<23:50:42] +[titan] 2025-09-09 19:02:22,023 - root - INFO - step: 26990 loss: 2.7053 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7472 global_avg_top_loss: 1.9582 +[titan] 2025-09-09 19:02:22,023 - root - INFO - lr: 6.3828e-06 gnorm: 1.18 [2 days, 1:26:54<23:50:08] +[titan] 2025-09-09 19:02:54,069 - root - INFO - step: 26995 loss: 2.6801 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7467 global_avg_top_loss: 1.9334 +[titan] 2025-09-09 19:02:54,069 - root - INFO - lr: 6.3798e-06 gnorm: 0.35 [2 days, 1:27:26<23:49:35] +[titan] 2025-09-09 19:03:19,664 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:03:26,079 - root - INFO - step: 27000 loss: 2.6747 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7454 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 19:03:26,079 - root - INFO - lr: 6.3767e-06 gnorm: 0.36 [2 days, 1:27:58<23:49:01] +[titan] 2025-09-09 19:03:57,995 - root - INFO - step: 27005 loss: 3.0726 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.9287 global_avg_top_loss: 2.1439 +[titan] 2025-09-09 19:03:57,995 - root - INFO - lr: 6.3736e-06 gnorm: 0.39 [2 days, 1:28:30<23:48:28] +[titan] 2025-09-09 19:04:30,067 - root - INFO - step: 27010 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 19:04:30,067 - root - INFO - lr: 6.3706e-06 gnorm: 0.37 [2 days, 1:29:02<23:47:54] +[titan] 2025-09-09 19:05:02,102 - root - INFO - step: 27015 loss: 2.8582 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.8349 global_avg_top_loss: 2.0232 +[titan] 2025-09-09 19:05:02,102 - root - INFO - lr: 6.3675e-06 gnorm: 0.38 [2 days, 1:29:34<23:47:21] +[titan] 2025-09-09 19:05:34,130 - root - INFO - step: 27020 loss: 2.6838 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 19:05:34,130 - root - INFO - lr: 6.3645e-06 gnorm: 0.36 [2 days, 1:30:07<23:46:47] +[titan] 2025-09-09 19:06:06,082 - root - INFO - step: 27025 loss: 2.6730 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.7428 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 19:06:06,082 - root - INFO - lr: 6.3614e-06 gnorm: 0.38 [2 days, 1:30:38<23:46:14] +[titan] 2025-09-09 19:06:37,883 - root - INFO - step: 27030 loss: 3.1885 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.09 mfu: 49.66% global_avg_ntp_loss: 1.0375 global_avg_top_loss: 2.1509 +[titan] 
2025-09-09 19:06:37,883 - root - INFO - lr: 6.3583e-06 gnorm: 0.36 [2 days, 1:31:10<23:45:40] +[titan] 2025-09-09 19:07:09,881 - root - INFO - step: 27035 loss: 2.7117 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7632 global_avg_top_loss: 1.9485 +[titan] 2025-09-09 19:07:09,881 - root - INFO - lr: 6.3553e-06 gnorm: 0.38 [2 days, 1:31:42<23:45:07] +[titan] 2025-09-09 19:07:41,905 - root - INFO - step: 27040 loss: 2.6774 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 19:07:41,905 - root - INFO - lr: 6.3522e-06 gnorm: 0.37 [2 days, 1:32:14<23:44:34] +[titan] 2025-09-09 19:08:13,945 - root - INFO - step: 27045 loss: 2.7450 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9694 +[titan] 2025-09-09 19:08:13,945 - root - INFO - lr: 6.3492e-06 gnorm: 0.40 [2 days, 1:32:46<23:44:00] +[titan] 2025-09-09 19:08:39,583 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:08:46,030 - root - INFO - step: 27050 loss: 2.6478 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7343 global_avg_top_loss: 1.9134 +[titan] 2025-09-09 19:08:46,030 - root - INFO - lr: 6.3461e-06 gnorm: 0.42 [2 days, 1:33:18<23:43:27] +[titan] 2025-09-09 19:09:17,964 - root - INFO - step: 27055 loss: 2.7008 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.7598 global_avg_top_loss: 1.9409 +[titan] 2025-09-09 19:09:17,964 - root - INFO - lr: 6.3431e-06 gnorm: 0.39 [2 days, 1:33:50<23:42:53] +[titan] 2025-09-09 19:09:50,017 - root - INFO - step: 27060 loss: 2.7380 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.26% global_avg_ntp_loss: 0.7725 global_avg_top_loss: 1.9655 +[titan] 2025-09-09 19:09:50,017 - root - INFO - lr: 6.3400e-06 gnorm: 0.38 [2 days, 1:34:22<23:42:20] +[titan] 2025-09-09 19:10:22,223 - root - INFO - step: 27065 loss: 2.6734 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.92 mfu: 49.03% global_avg_ntp_loss: 0.7467 global_avg_top_loss: 1.9267 +[titan] 2025-09-09 19:10:22,223 - root - INFO - lr: 6.3369e-06 gnorm: 0.39 [2 days, 1:34:55<23:41:47] +[titan] 2025-09-09 19:10:54,267 - root - INFO - step: 27070 loss: 2.6952 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7551 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 19:10:54,268 - root - INFO - lr: 6.3339e-06 gnorm: 0.36 [2 days, 1:35:27<23:41:13] +[titan] 2025-09-09 19:11:26,435 - root - INFO - step: 27075 loss: 2.7463 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.7781 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 19:11:26,435 - root - INFO - lr: 6.3308e-06 gnorm: 0.42 [2 days, 1:35:59<23:40:40] +[titan] 2025-09-09 19:11:58,375 - root - INFO - step: 27080 loss: 2.7435 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7763 global_avg_top_loss: 1.9671 +[titan] 
2025-09-09 19:11:58,375 - root - INFO - lr: 6.3278e-06 gnorm: 0.39 [2 days, 1:36:31<23:40:06] +[titan] 2025-09-09 19:12:30,533 - root - INFO - step: 27085 loss: 2.6053 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.7191 global_avg_top_loss: 1.8862 +[titan] 2025-09-09 19:12:30,534 - root - INFO - lr: 6.3247e-06 gnorm: 0.37 [2 days, 1:37:03<23:39:33] +[titan] 2025-09-09 19:13:02,558 - root - INFO - step: 27090 loss: 2.7293 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 19:13:02,558 - root - INFO - lr: 6.3217e-06 gnorm: 0.37 [2 days, 1:37:35<23:38:59] +[titan] 2025-09-09 19:13:34,564 - root - INFO - step: 27095 loss: 3.6952 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.94 mfu: 49.34% global_avg_ntp_loss: 1.3170 global_avg_top_loss: 2.3782 +[titan] 2025-09-09 19:13:34,565 - root - INFO - lr: 6.3186e-06 gnorm: 0.40 [2 days, 1:38:07<23:38:26] +[titan] 2025-09-09 19:14:00,107 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:14:06,437 - root - INFO - step: 27100 loss: 2.7371 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.7732 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 19:14:06,437 - root - INFO - lr: 6.3156e-06 gnorm: 0.39 [2 days, 1:38:39<23:37:53] +[titan] 2025-09-09 19:14:38,534 - root - INFO - step: 27105 loss: 2.7704 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9795 +[titan] 2025-09-09 19:14:38,534 - root - INFO - lr: 6.3125e-06 gnorm: 0.40 [2 days, 1:39:11<23:37:19] +[titan] 2025-09-09 19:15:10,418 - root - INFO - step: 27110 loss: 3.1899 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 1.0342 global_avg_top_loss: 2.1557 +[titan] 2025-09-09 19:15:10,418 - root - INFO - lr: 6.3095e-06 gnorm: 0.37 [2 days, 1:39:43<23:36:46] +[titan] 2025-09-09 19:15:42,483 - root - INFO - step: 27115 loss: 2.9980 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.9298 global_avg_top_loss: 2.0682 +[titan] 2025-09-09 19:15:42,483 - root - INFO - lr: 6.3064e-06 gnorm: 0.36 [2 days, 1:40:15<23:36:12] +[titan] 2025-09-09 19:16:14,528 - root - INFO - step: 27120 loss: 2.7013 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 19:16:14,529 - root - INFO - lr: 6.3034e-06 gnorm: 0.36 [2 days, 1:40:47<23:35:39] +[titan] 2025-09-09 19:16:46,604 - root - INFO - step: 27125 loss: 2.7012 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7581 global_avg_top_loss: 1.9430 +[titan] 2025-09-09 19:16:46,604 - root - INFO - lr: 6.3004e-06 gnorm: 0.36 [2 days, 1:41:19<23:35:05] +[titan] 2025-09-09 19:17:18,648 - root - INFO - step: 27130 loss: 2.7045 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9391 +[titan] 
2025-09-09 19:17:18,649 - root - INFO - lr: 6.2973e-06 gnorm: 0.38 [2 days, 1:41:51<23:34:32] +[titan] 2025-09-09 19:17:50,654 - root - INFO - step: 27135 loss: 2.9253 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.8866 global_avg_top_loss: 2.0387 +[titan] 2025-09-09 19:17:50,655 - root - INFO - lr: 6.2943e-06 gnorm: 0.37 [2 days, 1:42:23<23:33:59] +[titan] 2025-09-09 19:17:57,323 - root - INFO - Dumping profiler traces at step 27136 +[titan] 2025-09-09 19:17:57,395 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 19:18:22,919 - root - INFO - step: 27140 loss: 2.6909 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.03 mfu: 48.94% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9369 +[titan] 2025-09-09 19:18:22,920 - root - INFO - lr: 6.2912e-06 gnorm: 0.43 [2 days, 1:42:55<23:33:25] +[titan] 2025-09-09 19:18:55,015 - root - INFO - step: 27145 loss: 2.6698 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7440 global_avg_top_loss: 1.9258 +[titan] 2025-09-09 19:18:55,015 - root - INFO - lr: 6.2882e-06 gnorm: 0.40 [2 days, 1:43:27<23:32:52] +[titan] 2025-09-09 19:19:20,803 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:19:27,191 - root - INFO - step: 27150 loss: 2.7734 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.36 mfu: 49.08% global_avg_ntp_loss: 0.7875 global_avg_top_loss: 1.9859 +[titan] 2025-09-09 19:19:27,192 - root - INFO - lr: 6.2851e-06 gnorm: 0.38 [2 days, 1:44:00<23:32:19] +[titan] 2025-09-09 19:19:59,033 - root - INFO - step: 27155 loss: 2.6800 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7478 global_avg_top_loss: 1.9323 +[titan] 2025-09-09 19:19:59,033 - root - INFO - lr: 6.2821e-06 gnorm: 0.40 [2 days, 1:44:31<23:31:45] +[titan] 2025-09-09 19:20:31,023 - root - INFO - step: 27160 loss: 2.7391 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 19:20:31,023 - root - INFO - lr: 6.2791e-06 gnorm: 0.36 [2 days, 1:45:03<23:31:12] +[titan] 2025-09-09 19:21:02,981 - root - INFO - step: 27165 loss: 3.0957 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.9706 global_avg_top_loss: 2.1251 +[titan] 2025-09-09 19:21:02,982 - root - INFO - lr: 6.2760e-06 gnorm: 0.37 [2 days, 1:45:35<23:30:38] +[titan] 2025-09-09 19:21:34,912 - root - INFO - step: 27170 loss: 2.6701 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7452 global_avg_top_loss: 1.9249 +[titan] 2025-09-09 19:21:34,913 - root - INFO - lr: 6.2730e-06 gnorm: 0.37 [2 days, 1:46:07<23:30:05] +[titan] 2025-09-09 19:22:06,813 - root - INFO - step: 27175 loss: 3.6108 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 1.2809 global_avg_top_loss: 2.3299 +[titan] 2025-09-09 19:22:06,814 - root - INFO - lr: 6.2700e-06 gnorm: 0.44 [2 days, 1:46:39<23:29:31] +[titan] 2025-09-09 19:22:38,918 - root - INFO - step: 27180 loss: 2.7138 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 0.7626 global_avg_top_loss: 1.9512 +[titan] 
2025-09-09 19:22:38,918 - root - INFO - lr: 6.2669e-06 gnorm: 0.40 [2 days, 1:47:11<23:28:58] +[titan] 2025-09-09 19:23:11,081 - root - INFO - step: 27185 loss: 2.7029 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9430 +[titan] 2025-09-09 19:23:11,081 - root - INFO - lr: 6.2639e-06 gnorm: 0.39 [2 days, 1:47:43<23:28:24] +[titan] 2025-09-09 19:23:43,028 - root - INFO - step: 27190 loss: 2.7070 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7610 global_avg_top_loss: 1.9460 +[titan] 2025-09-09 19:23:43,028 - root - INFO - lr: 6.2608e-06 gnorm: 0.37 [2 days, 1:48:15<23:27:51] +[titan] 2025-09-09 19:24:15,006 - root - INFO - step: 27195 loss: 2.6947 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 19:24:15,006 - root - INFO - lr: 6.2578e-06 gnorm: 0.37 [2 days, 1:48:47<23:27:18] +[titan] 2025-09-09 19:24:40,597 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:24:47,045 - root - INFO - step: 27200 loss: 2.7372 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 19:24:47,045 - root - INFO - lr: 6.2548e-06 gnorm: 0.41 [2 days, 1:49:19<23:26:44] +[titan] 2025-09-09 19:25:19,280 - root - INFO - step: 27205 loss: 2.6635 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.48 mfu: 48.99% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9211 +[titan] 2025-09-09 19:25:19,280 - root - INFO - lr: 6.2517e-06 gnorm: 0.37 [2 days, 1:49:52<23:26:11] +[titan] 2025-09-09 19:25:51,302 - root - INFO - step: 27210 loss: 2.7562 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7837 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 19:25:51,302 - root - INFO - lr: 6.2487e-06 gnorm: 0.45 [2 days, 1:50:24<23:25:37] +[titan] 2025-09-09 19:26:23,451 - root - INFO - step: 27215 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.77 mfu: 49.12% global_avg_ntp_loss: 0.7339 global_avg_top_loss: 1.9173 +[titan] 2025-09-09 19:26:23,452 - root - INFO - lr: 6.2457e-06 gnorm: 0.36 [2 days, 1:50:56<23:25:04] +[titan] 2025-09-09 19:26:55,408 - root - INFO - step: 27220 loss: 2.6973 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9415 +[titan] 2025-09-09 19:26:55,409 - root - INFO - lr: 6.2426e-06 gnorm: 0.36 [2 days, 1:51:28<23:24:31] +[titan] 2025-09-09 19:27:27,275 - root - INFO - step: 27225 loss: 2.6883 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7520 global_avg_top_loss: 1.9364 +[titan] 2025-09-09 19:27:27,275 - root - INFO - lr: 6.2396e-06 gnorm: 0.41 [2 days, 1:52:00<23:23:57] +[titan] 2025-09-09 19:27:59,386 - root - INFO - step: 27230 loss: 2.7478 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.34 mfu: 49.17% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9704 +[titan] 
2025-09-09 19:27:59,387 - root - INFO - lr: 6.2366e-06 gnorm: 0.38 [2 days, 1:52:32<23:23:24] +[titan] 2025-09-09 19:28:31,434 - root - INFO - step: 27235 loss: 2.6302 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7288 global_avg_top_loss: 1.9014 +[titan] 2025-09-09 19:28:31,434 - root - INFO - lr: 6.2336e-06 gnorm: 0.40 [2 days, 1:53:04<23:22:50] +[titan] 2025-09-09 19:29:03,279 - root - INFO - step: 27240 loss: 2.6055 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.8919 +[titan] 2025-09-09 19:29:03,280 - root - INFO - lr: 6.2305e-06 gnorm: 0.38 [2 days, 1:53:36<23:22:17] +[titan] 2025-09-09 19:29:35,314 - root - INFO - step: 27245 loss: 2.7772 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7943 global_avg_top_loss: 1.9829 +[titan] 2025-09-09 19:29:35,314 - root - INFO - lr: 6.2275e-06 gnorm: 0.38 [2 days, 1:54:08<23:21:43] +[titan] 2025-09-09 19:30:00,910 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:30:07,285 - root - INFO - step: 27250 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7853 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 19:30:07,285 - root - INFO - lr: 6.2245e-06 gnorm: 0.40 [2 days, 1:54:40<23:21:10] +[titan] 2025-09-09 19:30:39,306 - root - INFO - step: 27255 loss: 3.7046 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 1.3229 global_avg_top_loss: 2.3816 +[titan] 2025-09-09 19:30:39,306 - root - INFO - lr: 6.2215e-06 gnorm: 0.37 [2 days, 1:55:12<23:20:37] +[titan] 2025-09-09 19:31:11,556 - root - INFO - step: 27260 loss: 2.6905 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.25 mfu: 48.96% global_avg_ntp_loss: 0.7532 global_avg_top_loss: 1.9373 +[titan] 2025-09-09 19:31:11,556 - root - INFO - lr: 6.2184e-06 gnorm: 0.40 [2 days, 1:55:44<23:20:03] +[titan] 2025-09-09 19:31:43,625 - root - INFO - step: 27265 loss: 2.7020 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9457 +[titan] 2025-09-09 19:31:43,625 - root - INFO - lr: 6.2154e-06 gnorm: 0.38 [2 days, 1:56:16<23:19:30] +[titan] 2025-09-09 19:32:15,716 - root - INFO - step: 27270 loss: 3.1301 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 1.0078 global_avg_top_loss: 2.1223 +[titan] 2025-09-09 19:32:15,717 - root - INFO - lr: 6.2124e-06 gnorm: 0.39 [2 days, 1:56:48<23:18:57] +[titan] 2025-09-09 19:32:47,847 - root - INFO - step: 27275 loss: 2.7305 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 19:32:47,847 - root - INFO - lr: 6.2094e-06 gnorm: 0.37 [2 days, 1:57:20<23:18:23] +[titan] 2025-09-09 19:33:19,745 - root - INFO - step: 27280 loss: 2.7766 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7970 global_avg_top_loss: 1.9796 +[titan] 
2025-09-09 19:33:19,745 - root - INFO - lr: 6.2063e-06 gnorm: 0.43 [2 days, 1:57:52<23:17:50] +[titan] 2025-09-09 19:33:51,887 - root - INFO - step: 27285 loss: 2.7533 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.88 mfu: 49.13% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 19:33:51,888 - root - INFO - lr: 6.2033e-06 gnorm: 0.37 [2 days, 1:58:24<23:17:16] +[titan] 2025-09-09 19:34:23,928 - root - INFO - step: 27290 loss: 2.9562 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.9042 global_avg_top_loss: 2.0520 +[titan] 2025-09-09 19:34:23,928 - root - INFO - lr: 6.2003e-06 gnorm: 0.37 [2 days, 1:58:56<23:16:43] +[titan] 2025-09-09 19:34:55,791 - root - INFO - step: 27295 loss: 2.6356 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.12 mfu: 49.56% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9048 +[titan] 2025-09-09 19:34:55,792 - root - INFO - lr: 6.1973e-06 gnorm: 0.41 [2 days, 1:59:28<23:16:09] +[titan] 2025-09-09 19:35:21,453 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:35:27,806 - root - INFO - step: 27300 loss: 2.6964 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9370 +[titan] 2025-09-09 19:35:27,806 - root - INFO - lr: 6.1943e-06 gnorm: 0.37 [2 days, 2:00:00<23:15:36] +[titan] 2025-09-09 19:35:59,660 - root - INFO - step: 27305 loss: 2.7542 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.27 mfu: 49.57% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9683 +[titan] 2025-09-09 19:35:59,660 - root - INFO - lr: 6.1912e-06 gnorm: 0.37 [2 days, 2:00:32<23:15:03] +[titan] 2025-09-09 19:36:31,538 - root - INFO - step: 27310 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7791 global_avg_top_loss: 1.9729 +[titan] 2025-09-09 19:36:31,538 - root - INFO - lr: 6.1882e-06 gnorm: 0.38 [2 days, 2:01:04<23:14:29] +[titan] 2025-09-09 19:37:03,781 - root - INFO - step: 27315 loss: 2.6413 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.36 mfu: 48.97% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9059 +[titan] 2025-09-09 19:37:03,781 - root - INFO - lr: 6.1852e-06 gnorm: 0.38 [2 days, 2:01:36<23:13:56] +[titan] 2025-09-09 19:37:35,889 - root - INFO - step: 27320 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 19:37:35,889 - root - INFO - lr: 6.1822e-06 gnorm: 0.38 [2 days, 2:02:08<23:13:22] +[titan] 2025-09-09 19:38:08,291 - root - INFO - step: 27325 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,113 tflops: 481.98 mfu: 48.73% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9632 +[titan] 2025-09-09 19:38:08,291 - root - INFO - lr: 6.1792e-06 gnorm: 0.37 [2 days, 2:02:41<23:12:49] +[titan] 2025-09-09 19:38:40,424 - root - INFO - step: 27330 loss: 2.7705 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.7971 global_avg_top_loss: 1.9734 +[titan] 
2025-09-09 19:38:40,424 - root - INFO - lr: 6.1762e-06 gnorm: 0.37 [2 days, 2:03:13<23:12:16] +[titan] 2025-09-09 19:39:12,592 - root - INFO - step: 27335 loss: 3.6722 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.49 mfu: 49.09% global_avg_ntp_loss: 1.3130 global_avg_top_loss: 2.3592 +[titan] 2025-09-09 19:39:12,592 - root - INFO - lr: 6.1731e-06 gnorm: 0.42 [2 days, 2:03:45<23:11:43] +[titan] 2025-09-09 19:39:44,432 - root - INFO - step: 27340 loss: 2.6653 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.7394 global_avg_top_loss: 1.9259 +[titan] 2025-09-09 19:39:44,432 - root - INFO - lr: 6.1701e-06 gnorm: 0.38 [2 days, 2:04:17<23:11:09] +[titan] 2025-09-09 19:40:16,442 - root - INFO - step: 27345 loss: 2.7383 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 19:40:16,442 - root - INFO - lr: 6.1671e-06 gnorm: 0.37 [2 days, 2:04:49<23:10:36] +[titan] 2025-09-09 19:40:42,011 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:40:48,530 - root - INFO - step: 27350 loss: 3.2440 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 1.0583 global_avg_top_loss: 2.1858 +[titan] 2025-09-09 19:40:48,530 - root - INFO - lr: 6.1641e-06 gnorm: 0.38 [2 days, 2:05:21<23:10:02] +[titan] 2025-09-09 19:41:20,522 - root - INFO - step: 27355 loss: 3.2478 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 1.0625 global_avg_top_loss: 2.1853 +[titan] 2025-09-09 19:41:20,522 - root - INFO - lr: 6.1611e-06 gnorm: 0.39 [2 days, 2:05:53<23:09:29] +[titan] 2025-09-09 19:41:52,521 - root - INFO - step: 27360 loss: 2.7529 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.7827 global_avg_top_loss: 1.9702 +[titan] 2025-09-09 19:41:52,521 - root - INFO - lr: 6.1581e-06 gnorm: 0.39 [2 days, 2:06:25<23:08:55] +[titan] 2025-09-09 19:42:24,369 - root - INFO - step: 27365 loss: 2.6660 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.7412 global_avg_top_loss: 1.9247 +[titan] 2025-09-09 19:42:24,369 - root - INFO - lr: 6.1551e-06 gnorm: 0.37 [2 days, 2:06:57<23:08:22] +[titan] 2025-09-09 19:42:56,637 - root - INFO - step: 27370 loss: 2.6101 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.98 mfu: 48.94% global_avg_ntp_loss: 0.7181 global_avg_top_loss: 1.8921 +[titan] 2025-09-09 19:42:56,638 - root - INFO - lr: 6.1521e-06 gnorm: 0.36 [2 days, 2:07:29<23:07:49] +[titan] 2025-09-09 19:43:28,559 - root - INFO - step: 27375 loss: 2.7820 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9888 +[titan] 2025-09-09 19:43:28,559 - root - INFO - lr: 6.1491e-06 gnorm: 0.55 [2 days, 2:08:01<23:07:15] +[titan] 2025-09-09 19:44:00,626 - root - INFO - step: 27380 loss: 2.7627 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7883 global_avg_top_loss: 1.9744 +[titan] 
2025-09-09 19:44:00,626 - root - INFO - lr: 6.1461e-06 gnorm: 0.39 [2 days, 2:08:33<23:06:42] +[titan] 2025-09-09 19:44:32,575 - root - INFO - step: 27385 loss: 2.6749 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 19:44:32,575 - root - INFO - lr: 6.1431e-06 gnorm: 0.38 [2 days, 2:09:05<23:06:08] +[titan] 2025-09-09 19:45:04,822 - root - INFO - step: 27390 loss: 2.7189 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.30 mfu: 48.97% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9556 +[titan] 2025-09-09 19:45:04,822 - root - INFO - lr: 6.1400e-06 gnorm: 0.40 [2 days, 2:09:37<23:05:35] +[titan] 2025-09-09 19:45:36,726 - root - INFO - step: 27395 loss: 2.6272 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7262 global_avg_top_loss: 1.9010 +[titan] 2025-09-09 19:45:36,726 - root - INFO - lr: 6.1370e-06 gnorm: 0.38 [2 days, 2:10:09<23:05:02] +[titan] 2025-09-09 19:46:02,252 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:46:08,670 - root - INFO - step: 27400 loss: 3.2022 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 19:46:08,670 - root - INFO - lr: 6.1340e-06 gnorm: 0.39 [2 days, 2:10:41<23:04:28] +[titan] 2025-09-09 19:46:40,733 - root - INFO - step: 27405 loss: 2.7130 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.07 mfu: 49.25% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9487 +[titan] 2025-09-09 19:46:40,733 - root - INFO - lr: 6.1310e-06 gnorm: 0.37 [2 days, 2:11:13<23:03:55] +[titan] 2025-09-09 19:47:12,686 - root - INFO - step: 27410 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9552 +[titan] 2025-09-09 19:47:12,686 - root - INFO - lr: 6.1280e-06 gnorm: 0.37 [2 days, 2:11:45<23:03:21] +[titan] 2025-09-09 19:47:44,811 - root - INFO - step: 27415 loss: 3.0966 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.13 mfu: 49.15% global_avg_ntp_loss: 0.9709 global_avg_top_loss: 2.1257 +[titan] 2025-09-09 19:47:44,812 - root - INFO - lr: 6.1250e-06 gnorm: 0.38 [2 days, 2:12:17<23:02:48] +[titan] 2025-09-09 19:48:16,814 - root - INFO - step: 27420 loss: 2.7303 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9573 +[titan] 2025-09-09 19:48:16,814 - root - INFO - lr: 6.1220e-06 gnorm: 0.37 [2 days, 2:12:49<23:02:15] +[titan] 2025-09-09 19:48:48,824 - root - INFO - step: 27425 loss: 2.6604 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7384 global_avg_top_loss: 1.9220 +[titan] 2025-09-09 19:48:48,824 - root - INFO - lr: 6.1190e-06 gnorm: 0.36 [2 days, 2:13:21<23:01:41] +[titan] 2025-09-09 19:49:20,979 - root - INFO - step: 27430 loss: 3.2045 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.68 mfu: 49.11% global_avg_ntp_loss: 1.0378 global_avg_top_loss: 2.1667 +[titan] 
2025-09-09 19:49:20,979 - root - INFO - lr: 6.1160e-06 gnorm: 0.39 [2 days, 2:13:53<23:01:08] +[titan] 2025-09-09 19:49:53,147 - root - INFO - step: 27435 loss: 2.6938 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.7552 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 19:49:53,147 - root - INFO - lr: 6.1130e-06 gnorm: 0.37 [2 days, 2:14:25<23:00:35] +[titan] 2025-09-09 19:50:25,231 - root - INFO - step: 27440 loss: 2.7039 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 19:50:25,231 - root - INFO - lr: 6.1100e-06 gnorm: 0.38 [2 days, 2:14:58<23:00:01] +[titan] 2025-09-09 19:50:57,335 - root - INFO - step: 27445 loss: 2.6224 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7239 global_avg_top_loss: 1.8986 +[titan] 2025-09-09 19:50:57,335 - root - INFO - lr: 6.1070e-06 gnorm: 0.36 [2 days, 2:15:30<22:59:28] +[titan] 2025-09-09 19:51:23,014 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:51:29,567 - root - INFO - step: 27450 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.52 mfu: 48.99% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9615 +[titan] 2025-09-09 19:51:29,568 - root - INFO - lr: 6.1040e-06 gnorm: 0.38 [2 days, 2:16:02<22:58:55] +[titan] 2025-09-09 19:52:01,613 - root - INFO - step: 27455 loss: 2.7540 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 19:52:01,613 - root - INFO - lr: 6.1011e-06 gnorm: 0.50 [2 days, 2:16:34<22:58:21] +[titan] 2025-09-09 19:52:33,610 - root - INFO - step: 27460 loss: 2.7341 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 19:52:33,610 - root - INFO - lr: 6.0981e-06 gnorm: 0.39 [2 days, 2:17:06<22:57:48] +[titan] 2025-09-09 19:53:05,587 - root - INFO - step: 27465 loss: 2.6968 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 19:53:05,588 - root - INFO - lr: 6.0951e-06 gnorm: 0.39 [2 days, 2:17:38<22:57:14] +[titan] 2025-09-09 19:53:37,733 - root - INFO - step: 27470 loss: 2.7709 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 19:53:37,733 - root - INFO - lr: 6.0921e-06 gnorm: 0.40 [2 days, 2:18:10<22:56:41] +[titan] 2025-09-09 19:54:09,955 - root - INFO - step: 27475 loss: 2.6824 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.67 mfu: 49.01% global_avg_ntp_loss: 0.7459 global_avg_top_loss: 1.9365 +[titan] 2025-09-09 19:54:09,955 - root - INFO - lr: 6.0891e-06 gnorm: 0.37 [2 days, 2:18:42<22:56:08] +[titan] 2025-09-09 19:54:42,002 - root - INFO - step: 27480 loss: 2.7159 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7651 global_avg_top_loss: 1.9508 +[titan] 
2025-09-09 19:54:42,002 - root - INFO - lr: 6.0861e-06 gnorm: 0.37 [2 days, 2:19:14<22:55:34] +[titan] 2025-09-09 19:55:14,136 - root - INFO - step: 27485 loss: 2.6776 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.99 mfu: 49.14% global_avg_ntp_loss: 0.7357 global_avg_top_loss: 1.9419 +[titan] 2025-09-09 19:55:14,137 - root - INFO - lr: 6.0831e-06 gnorm: 0.66 [2 days, 2:19:46<22:55:01] +[titan] 2025-09-09 19:55:46,147 - root - INFO - step: 27490 loss: 2.7540 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 19:55:46,147 - root - INFO - lr: 6.0801e-06 gnorm: 0.37 [2 days, 2:20:18<22:54:28] +[titan] 2025-09-09 19:56:18,092 - root - INFO - step: 27495 loss: 2.6953 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 19:56:18,093 - root - INFO - lr: 6.0771e-06 gnorm: 0.38 [2 days, 2:20:50<22:53:54] +[titan] 2025-09-09 19:56:43,684 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:56:50,109 - root - INFO - step: 27500 loss: 2.6832 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.79 mfu: 49.32% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 19:56:50,109 - root - INFO - lr: 6.0741e-06 gnorm: 0.37 [2 days, 2:21:22<22:53:21] +[titan] 2025-09-09 19:57:22,269 - root - INFO - step: 27505 loss: 2.6244 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.61 mfu: 49.10% global_avg_ntp_loss: 0.7235 global_avg_top_loss: 1.9009 +[titan] 2025-09-09 19:57:22,269 - root - INFO - lr: 6.0711e-06 gnorm: 0.36 [2 days, 2:21:55<22:52:48] +[titan] 2025-09-09 19:57:54,296 - root - INFO - step: 27510 loss: 3.1628 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.31% global_avg_ntp_loss: 1.0215 global_avg_top_loss: 2.1413 +[titan] 2025-09-09 19:57:54,296 - root - INFO - lr: 6.0682e-06 gnorm: 0.37 [2 days, 2:22:27<22:52:14] +[titan] 2025-09-09 19:58:26,526 - root - INFO - step: 27515 loss: 2.7030 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.55 mfu: 48.99% global_avg_ntp_loss: 0.7594 global_avg_top_loss: 1.9436 +[titan] 2025-09-09 19:58:26,526 - root - INFO - lr: 6.0652e-06 gnorm: 0.44 [2 days, 2:22:59<22:51:41] +[titan] 2025-09-09 19:58:58,587 - root - INFO - step: 27520 loss: 2.6953 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9416 +[titan] 2025-09-09 19:58:58,587 - root - INFO - lr: 6.0622e-06 gnorm: 0.41 [2 days, 2:23:31<22:51:07] +[titan] 2025-09-09 19:59:30,616 - root - INFO - step: 27525 loss: 2.6840 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.7492 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 19:59:30,616 - root - INFO - lr: 6.0592e-06 gnorm: 0.38 [2 days, 2:24:03<22:50:34] +[titan] 2025-09-09 20:00:02,762 - root - INFO - step: 27530 loss: 2.7763 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9844 +[titan] 
2025-09-09 20:00:02,763 - root - INFO - lr: 6.0562e-06 gnorm: 0.38 [2 days, 2:24:35<22:50:01] +[titan] 2025-09-09 20:00:34,732 - root - INFO - step: 27535 loss: 2.7301 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 0.7703 global_avg_top_loss: 1.9598 +[titan] 2025-09-09 20:00:34,732 - root - INFO - lr: 6.0532e-06 gnorm: 0.42 [2 days, 2:25:07<22:49:27] +[titan] 2025-09-09 20:01:06,791 - root - INFO - step: 27540 loss: 2.7005 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.26% global_avg_ntp_loss: 0.7577 global_avg_top_loss: 1.9428 +[titan] 2025-09-09 20:01:06,792 - root - INFO - lr: 6.0503e-06 gnorm: 0.38 [2 days, 2:25:39<22:48:54] +[titan] 2025-09-09 20:01:38,813 - root - INFO - step: 27545 loss: 3.2007 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1619 +[titan] 2025-09-09 20:01:38,813 - root - INFO - lr: 6.0473e-06 gnorm: 0.37 [2 days, 2:26:11<22:48:21] +[titan] 2025-09-09 20:02:04,418 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:02:10,844 - root - INFO - step: 27550 loss: 2.7095 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.56 mfu: 49.30% global_avg_ntp_loss: 0.7634 global_avg_top_loss: 1.9461 +[titan] 2025-09-09 20:02:10,844 - root - INFO - lr: 6.0443e-06 gnorm: 0.36 [2 days, 2:26:43<22:47:47] +[titan] 2025-09-09 20:02:43,093 - root - INFO - step: 27555 loss: 2.6413 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.27 mfu: 48.97% global_avg_ntp_loss: 0.7298 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 20:02:43,093 - root - INFO - lr: 6.0413e-06 gnorm: 0.38 [2 days, 2:27:15<22:47:14] +[titan] 2025-09-09 20:03:15,158 - root - INFO - step: 27560 loss: 2.7785 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7945 global_avg_top_loss: 1.9840 +[titan] 2025-09-09 20:03:15,158 - root - INFO - lr: 6.0383e-06 gnorm: 0.38 [2 days, 2:27:47<22:46:41] +[titan] 2025-09-09 20:03:47,318 - root - INFO - step: 27565 loss: 2.6802 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.7481 global_avg_top_loss: 1.9320 +[titan] 2025-09-09 20:03:47,318 - root - INFO - lr: 6.0354e-06 gnorm: 0.36 [2 days, 2:28:20<22:46:07] +[titan] 2025-09-09 20:04:19,472 - root - INFO - step: 27570 loss: 2.7005 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.70 mfu: 49.11% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9425 +[titan] 2025-09-09 20:04:19,472 - root - INFO - lr: 6.0324e-06 gnorm: 0.62 [2 days, 2:28:52<22:45:34] +[titan] 2025-09-09 20:04:51,440 - root - INFO - step: 27575 loss: 2.7155 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.39% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9486 +[titan] 2025-09-09 20:04:51,441 - root - INFO - lr: 6.0294e-06 gnorm: 0.37 [2 days, 2:29:24<22:45:01] +[titan] 2025-09-09 20:05:23,276 - root - INFO - step: 27580 loss: 2.7277 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.56 mfu: 49.60% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9577 +[titan] 
2025-09-09 20:05:23,276 - root - INFO - lr: 6.0264e-06 gnorm: 0.36 [2 days, 2:29:56<22:44:27] +[titan] 2025-09-09 20:05:55,367 - root - INFO - step: 27585 loss: 2.7860 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.8004 global_avg_top_loss: 1.9856 +[titan] 2025-09-09 20:05:55,367 - root - INFO - lr: 6.0235e-06 gnorm: 0.42 [2 days, 2:30:28<22:43:54] +[titan] 2025-09-09 20:06:27,596 - root - INFO - step: 27590 loss: 2.6917 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.57 mfu: 49.00% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9379 +[titan] 2025-09-09 20:06:27,596 - root - INFO - lr: 6.0205e-06 gnorm: 0.38 [2 days, 2:31:00<22:43:20] +[titan] 2025-09-09 20:06:59,531 - root - INFO - step: 27595 loss: 2.7512 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 20:06:59,531 - root - INFO - lr: 6.0175e-06 gnorm: 0.38 [2 days, 2:31:32<22:42:47] +[titan] 2025-09-09 20:07:25,377 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:07:31,768 - root - INFO - step: 27600 loss: 2.6837 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.44 mfu: 48.98% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 20:07:31,769 - root - INFO - lr: 6.0145e-06 gnorm: 0.38 [2 days, 2:32:04<22:42:14] +[titan] 2025-09-09 20:08:03,866 - root - INFO - step: 27605 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9536 +[titan] 2025-09-09 20:08:03,866 - root - INFO - lr: 6.0116e-06 gnorm: 0.37 [2 days, 2:32:36<22:41:40] +[titan] 2025-09-09 20:08:35,732 - root - INFO - step: 27610 loss: 2.7324 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9570 +[titan] 2025-09-09 20:08:35,732 - root - INFO - lr: 6.0086e-06 gnorm: 0.39 [2 days, 2:33:08<22:41:07] +[titan] 2025-09-09 20:09:07,693 - root - INFO - step: 27615 loss: 2.7002 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9454 +[titan] 2025-09-09 20:09:07,693 - root - INFO - lr: 6.0056e-06 gnorm: 0.45 [2 days, 2:33:40<22:40:34] +[titan] 2025-09-09 20:09:39,612 - root - INFO - step: 27620 loss: 2.7725 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7925 global_avg_top_loss: 1.9799 +[titan] 2025-09-09 20:09:39,612 - root - INFO - lr: 6.0027e-06 gnorm: 0.38 [2 days, 2:34:12<22:40:00] +[titan] 2025-09-09 20:10:11,763 - root - INFO - step: 27625 loss: 3.2329 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 1.0521 global_avg_top_loss: 2.1808 +[titan] 2025-09-09 20:10:11,764 - root - INFO - lr: 5.9997e-06 gnorm: 0.37 [2 days, 2:34:44<22:39:27] +[titan] 2025-09-09 20:10:43,627 - root - INFO - step: 27630 loss: 2.7781 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.13 mfu: 49.56% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9783 +[titan] 
2025-09-09 20:10:43,627 - root - INFO - lr: 5.9967e-06 gnorm: 0.38 [2 days, 2:35:16<22:38:53] +[titan] 2025-09-09 20:11:15,602 - root - INFO - step: 27635 loss: 2.6257 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7219 global_avg_top_loss: 1.9037 +[titan] 2025-09-09 20:11:15,602 - root - INFO - lr: 5.9937e-06 gnorm: 0.36 [2 days, 2:35:48<22:38:20] +[titan] 2025-09-09 20:11:47,609 - root - INFO - step: 27640 loss: 2.6846 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 20:11:47,610 - root - INFO - lr: 5.9908e-06 gnorm: 0.37 [2 days, 2:36:20<22:37:47] +[titan] 2025-09-09 20:12:19,838 - root - INFO - step: 27645 loss: 2.6790 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.57 mfu: 49.00% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9322 +[titan] 2025-09-09 20:12:19,839 - root - INFO - lr: 5.9878e-06 gnorm: 0.37 [2 days, 2:36:52<22:37:13] +[titan] 2025-09-09 20:12:39,324 - root - INFO - Dumping profiler traces at step 27648 +[titan] 2025-09-09 20:12:39,395 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 20:12:45,703 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:12:52,061 - root - INFO - step: 27650 loss: 2.6148 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.67 mfu: 49.01% global_avg_ntp_loss: 0.7215 global_avg_top_loss: 1.8933 +[titan] 2025-09-09 20:12:52,061 - root - INFO - lr: 5.9849e-06 gnorm: 0.36 [2 days, 2:37:24<22:36:40] +[titan] 2025-09-09 20:13:24,071 - root - INFO - step: 27655 loss: 2.6972 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7571 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 20:13:24,071 - root - INFO - lr: 5.9819e-06 gnorm: 0.37 [2 days, 2:37:56<22:36:07] +[titan] 2025-09-09 20:13:56,081 - root - INFO - step: 27660 loss: 2.6261 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7168 global_avg_top_loss: 1.9093 +[titan] 2025-09-09 20:13:56,082 - root - INFO - lr: 5.9789e-06 gnorm: 0.45 [2 days, 2:38:28<22:35:33] +[titan] 2025-09-09 20:14:28,102 - root - INFO - step: 27665 loss: 2.7333 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7723 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 20:14:28,102 - root - INFO - lr: 5.9760e-06 gnorm: 0.38 [2 days, 2:39:00<22:35:00] +[titan] 2025-09-09 20:15:00,192 - root - INFO - step: 27670 loss: 2.6630 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.7456 global_avg_top_loss: 1.9174 +[titan] 2025-09-09 20:15:00,192 - root - INFO - lr: 5.9730e-06 gnorm: 0.37 [2 days, 2:39:33<22:34:27] +[titan] 2025-09-09 20:15:31,995 - root - INFO - step: 27675 loss: 2.6408 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 20:15:31,995 - root - INFO - lr: 5.9700e-06 gnorm: 0.37 [2 days, 2:40:04<22:33:53] +[titan] 2025-09-09 20:16:04,045 - root - INFO - step: 27680 loss: 2.6419 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7316 global_avg_top_loss: 1.9104 +[titan] 
2025-09-09 20:16:04,045 - root - INFO - lr: 5.9671e-06 gnorm: 0.36 [2 days, 2:40:36<22:33:20] +[titan] 2025-09-09 20:16:36,103 - root - INFO - step: 27685 loss: 2.7632 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 20:16:36,103 - root - INFO - lr: 5.9641e-06 gnorm: 0.38 [2 days, 2:41:08<22:32:46] +[titan] 2025-09-09 20:17:08,384 - root - INFO - step: 27690 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.79 mfu: 48.92% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9657 +[titan] 2025-09-09 20:17:08,384 - root - INFO - lr: 5.9612e-06 gnorm: 0.38 [2 days, 2:41:41<22:32:13] +[titan] 2025-09-09 20:17:40,217 - root - INFO - step: 27695 loss: 2.7230 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.7709 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 20:17:40,217 - root - INFO - lr: 5.9582e-06 gnorm: 0.37 [2 days, 2:42:13<22:31:40] +[titan] 2025-09-09 20:18:05,935 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:18:12,388 - root - INFO - step: 27700 loss: 2.6947 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.44 mfu: 49.08% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 20:18:12,389 - root - INFO - lr: 5.9552e-06 gnorm: 0.38 [2 days, 2:42:45<22:31:06] +[titan] 2025-09-09 20:18:44,121 - root - INFO - step: 27705 loss: 3.2882 memory: 122.03GiB(87.57%) tps: 10,326 tflops: 492.15 mfu: 49.76% global_avg_ntp_loss: 1.0857 global_avg_top_loss: 2.2025 +[titan] 2025-09-09 20:18:44,121 - root - INFO - lr: 5.9523e-06 gnorm: 0.36 [2 days, 2:43:16<22:30:33] +[titan] 2025-09-09 20:19:16,182 - root - INFO - step: 27710 loss: 2.6179 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7210 global_avg_top_loss: 1.8969 +[titan] 2025-09-09 20:19:16,182 - root - INFO - lr: 5.9493e-06 gnorm: 0.36 [2 days, 2:43:48<22:30:00] +[titan] 2025-09-09 20:19:48,197 - root - INFO - step: 27715 loss: 2.7285 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9546 +[titan] 2025-09-09 20:19:48,197 - root - INFO - lr: 5.9464e-06 gnorm: 0.38 [2 days, 2:44:21<22:29:26] +[titan] 2025-09-09 20:20:20,146 - root - INFO - step: 27720 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.43% global_avg_ntp_loss: 0.7383 global_avg_top_loss: 1.9129 +[titan] 2025-09-09 20:20:20,146 - root - INFO - lr: 5.9434e-06 gnorm: 0.38 [2 days, 2:44:52<22:28:53] +[titan] 2025-09-09 20:20:52,030 - root - INFO - step: 27725 loss: 2.7467 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 20:20:52,030 - root - INFO - lr: 5.9405e-06 gnorm: 0.38 [2 days, 2:45:24<22:28:19] +[titan] 2025-09-09 20:21:23,984 - root - INFO - step: 27730 loss: 2.6245 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7239 global_avg_top_loss: 1.9006 +[titan] 
2025-09-09 20:21:23,985 - root - INFO - lr: 5.9375e-06 gnorm: 0.40 [2 days, 2:45:56<22:27:46] +[titan] 2025-09-09 20:21:56,411 - root - INFO - step: 27735 loss: 2.7080 memory: 122.03GiB(87.57%) tps: 10,105 tflops: 481.61 mfu: 48.70% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9414 +[titan] 2025-09-09 20:21:56,412 - root - INFO - lr: 5.9346e-06 gnorm: 0.38 [2 days, 2:46:29<22:27:13] +[titan] 2025-09-09 20:22:28,310 - root - INFO - step: 27740 loss: 2.6339 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9045 +[titan] 2025-09-09 20:22:28,310 - root - INFO - lr: 5.9316e-06 gnorm: 0.39 [2 days, 2:47:01<22:26:39] +[titan] 2025-09-09 20:23:00,335 - root - INFO - step: 27745 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7675 global_avg_top_loss: 1.9563 +[titan] 2025-09-09 20:23:00,336 - root - INFO - lr: 5.9287e-06 gnorm: 0.39 [2 days, 2:47:33<22:26:06] +[titan] 2025-09-09 20:23:25,738 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:23:32,139 - root - INFO - step: 27750 loss: 3.1013 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.9949 global_avg_top_loss: 2.1064 +[titan] 2025-09-09 20:23:32,139 - root - INFO - lr: 5.9257e-06 gnorm: 0.37 [2 days, 2:48:04<22:25:32] +[titan] 2025-09-09 20:24:04,277 - root - INFO - step: 27755 loss: 2.7528 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.94 mfu: 49.13% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 20:24:04,277 - root - INFO - lr: 5.9228e-06 gnorm: 0.38 [2 days, 2:48:37<22:24:59] +[titan] 2025-09-09 20:24:36,238 - root - INFO - step: 27760 loss: 2.6562 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9213 +[titan] 2025-09-09 20:24:36,239 - root - INFO - lr: 5.9198e-06 gnorm: 0.42 [2 days, 2:49:09<22:24:26] +[titan] 2025-09-09 20:25:08,110 - root - INFO - step: 27765 loss: 2.6901 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9353 +[titan] 2025-09-09 20:25:08,110 - root - INFO - lr: 5.9169e-06 gnorm: 0.38 [2 days, 2:49:40<22:23:52] +[titan] 2025-09-09 20:25:39,934 - root - INFO - step: 27770 loss: 2.7141 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 20:25:39,934 - root - INFO - lr: 5.9139e-06 gnorm: 0.39 [2 days, 2:50:12<22:23:19] +[titan] 2025-09-09 20:26:11,989 - root - INFO - step: 27775 loss: 2.5712 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.7013 global_avg_top_loss: 1.8698 +[titan] 2025-09-09 20:26:11,990 - root - INFO - lr: 5.9110e-06 gnorm: 0.37 [2 days, 2:50:44<22:22:46] +[titan] 2025-09-09 20:26:43,981 - root - INFO - step: 27780 loss: 2.6533 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.7368 global_avg_top_loss: 1.9165 +[titan] 
2025-09-09 20:26:43,981 - root - INFO - lr: 5.9080e-06 gnorm: 0.37 [2 days, 2:51:16<22:22:12] +[titan] 2025-09-09 20:27:15,991 - root - INFO - step: 27785 loss: 3.1062 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.9976 global_avg_top_loss: 2.1086 +[titan] 2025-09-09 20:27:15,992 - root - INFO - lr: 5.9051e-06 gnorm: 0.35 [2 days, 2:51:48<22:21:39] +[titan] 2025-09-09 20:27:48,058 - root - INFO - step: 27790 loss: 2.6567 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7385 global_avg_top_loss: 1.9182 +[titan] 2025-09-09 20:27:48,058 - root - INFO - lr: 5.9022e-06 gnorm: 0.37 [2 days, 2:52:20<22:21:05] +[titan] 2025-09-09 20:28:20,124 - root - INFO - step: 27795 loss: 2.7361 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9624 +[titan] 2025-09-09 20:28:20,124 - root - INFO - lr: 5.8992e-06 gnorm: 0.40 [2 days, 2:52:52<22:20:32] +[titan] 2025-09-09 20:28:45,737 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:28:52,137 - root - INFO - step: 27800 loss: 2.7212 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9533 +[titan] 2025-09-09 20:28:52,137 - root - INFO - lr: 5.8963e-06 gnorm: 0.38 [2 days, 2:53:24<22:19:59] +[titan] 2025-09-09 20:29:24,069 - root - INFO - step: 27805 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9700 +[titan] 2025-09-09 20:29:24,069 - root - INFO - lr: 5.8933e-06 gnorm: 0.41 [2 days, 2:53:56<22:19:25] +[titan] 2025-09-09 20:29:56,044 - root - INFO - step: 27810 loss: 2.6492 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7346 global_avg_top_loss: 1.9146 +[titan] 2025-09-09 20:29:56,044 - root - INFO - lr: 5.8904e-06 gnorm: 0.39 [2 days, 2:54:28<22:18:52] +[titan] 2025-09-09 20:30:27,920 - root - INFO - step: 27815 loss: 2.7739 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9853 +[titan] 2025-09-09 20:30:27,920 - root - INFO - lr: 5.8875e-06 gnorm: 0.43 [2 days, 2:55:00<22:18:19] +[titan] 2025-09-09 20:30:59,941 - root - INFO - step: 27820 loss: 2.7760 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9852 +[titan] 2025-09-09 20:30:59,941 - root - INFO - lr: 5.8845e-06 gnorm: 0.47 [2 days, 2:55:32<22:17:45] +[titan] 2025-09-09 20:31:31,956 - root - INFO - step: 27825 loss: 2.6948 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7546 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 20:31:31,956 - root - INFO - lr: 5.8816e-06 gnorm: 0.40 [2 days, 2:56:04<22:17:12] +[titan] 2025-09-09 20:32:04,191 - root - INFO - step: 27830 loss: 2.7136 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.47 mfu: 48.99% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9489 +[titan] 
2025-09-09 20:32:04,192 - root - INFO - lr: 5.8787e-06 gnorm: 0.38 [2 days, 2:56:36<22:16:39] +[titan] 2025-09-09 20:32:36,176 - root - INFO - step: 27835 loss: 2.6755 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.7463 global_avg_top_loss: 1.9292 +[titan] 2025-09-09 20:32:36,176 - root - INFO - lr: 5.8757e-06 gnorm: 0.37 [2 days, 2:57:08<22:16:05] +[titan] 2025-09-09 20:33:08,157 - root - INFO - step: 27840 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9638 +[titan] 2025-09-09 20:33:08,157 - root - INFO - lr: 5.8728e-06 gnorm: 0.41 [2 days, 2:57:40<22:15:32] +[titan] 2025-09-09 20:33:39,972 - root - INFO - step: 27845 loss: 2.7575 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 0.7808 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 20:33:39,972 - root - INFO - lr: 5.8698e-06 gnorm: 0.41 [2 days, 2:58:12<22:14:58] +[titan] 2025-09-09 20:34:05,443 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:34:11,851 - root - INFO - step: 27850 loss: 2.6612 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.89 mfu: 49.53% global_avg_ntp_loss: 0.7407 global_avg_top_loss: 1.9204 +[titan] 2025-09-09 20:34:11,851 - root - INFO - lr: 5.8669e-06 gnorm: 0.38 [2 days, 2:58:44<22:14:25] +[titan] 2025-09-09 20:34:43,832 - root - INFO - step: 27855 loss: 2.7084 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.38% global_avg_ntp_loss: 0.7628 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 20:34:43,833 - root - INFO - lr: 5.8640e-06 gnorm: 0.37 [2 days, 2:59:16<22:13:52] +[titan] 2025-09-09 20:35:15,729 - root - INFO - step: 27860 loss: 2.6756 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9307 +[titan] 2025-09-09 20:35:15,730 - root - INFO - lr: 5.8611e-06 gnorm: 0.38 [2 days, 2:59:48<22:13:18] +[titan] 2025-09-09 20:35:47,711 - root - INFO - step: 27865 loss: 2.6445 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.38% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 20:35:47,711 - root - INFO - lr: 5.8581e-06 gnorm: 0.36 [2 days, 3:00:20<22:12:45] +[titan] 2025-09-09 20:36:19,553 - root - INFO - step: 27870 loss: 2.6463 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.7369 global_avg_top_loss: 1.9094 +[titan] 2025-09-09 20:36:19,553 - root - INFO - lr: 5.8552e-06 gnorm: 0.37 [2 days, 3:00:52<22:12:11] +[titan] 2025-09-09 20:36:51,375 - root - INFO - step: 27875 loss: 2.5530 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.77 mfu: 49.62% global_avg_ntp_loss: 0.6897 global_avg_top_loss: 1.8633 +[titan] 2025-09-09 20:36:51,375 - root - INFO - lr: 5.8523e-06 gnorm: 0.38 [2 days, 3:01:24<22:11:38] +[titan] 2025-09-09 20:37:23,101 - root - INFO - step: 27880 loss: 2.7208 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.26 mfu: 49.77% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9535 +[titan] 
2025-09-09 20:37:23,101 - root - INFO - lr: 5.8493e-06 gnorm: 0.38 [2 days, 3:01:55<22:11:05] +[titan] 2025-09-09 20:37:55,021 - root - INFO - step: 27885 loss: 2.7644 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.7860 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 20:37:55,021 - root - INFO - lr: 5.8464e-06 gnorm: 0.37 [2 days, 3:02:27<22:10:31] +[titan] 2025-09-09 20:38:27,081 - root - INFO - step: 27890 loss: 2.7070 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7628 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 20:38:27,082 - root - INFO - lr: 5.8435e-06 gnorm: 0.39 [2 days, 3:02:59<22:09:58] +[titan] 2025-09-09 20:38:58,877 - root - INFO - step: 27895 loss: 2.6664 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9248 +[titan] 2025-09-09 20:38:58,877 - root - INFO - lr: 5.8406e-06 gnorm: 0.40 [2 days, 3:03:31<22:09:24] +[titan] 2025-09-09 20:39:24,353 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:39:30,830 - root - INFO - step: 27900 loss: 2.7641 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7852 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 20:39:30,830 - root - INFO - lr: 5.8376e-06 gnorm: 0.40 [2 days, 3:04:03<22:08:51] +[titan] 2025-09-09 20:40:02,573 - root - INFO - step: 27905 loss: 2.6812 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.99 mfu: 49.75% global_avg_ntp_loss: 0.7510 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 20:40:02,573 - root - INFO - lr: 5.8347e-06 gnorm: 0.39 [2 days, 3:04:35<22:08:17] +[titan] 2025-09-09 20:40:34,425 - root - INFO - step: 27910 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 20:40:34,425 - root - INFO - lr: 5.8318e-06 gnorm: 0.38 [2 days, 3:05:07<22:07:44] +[titan] 2025-09-09 20:41:06,220 - root - INFO - step: 27915 loss: 2.6838 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9292 +[titan] 2025-09-09 20:41:06,220 - root - INFO - lr: 5.8289e-06 gnorm: 0.37 [2 days, 3:05:39<22:07:11] +[titan] 2025-09-09 20:41:38,369 - root - INFO - step: 27920 loss: 2.8344 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.78 mfu: 49.12% global_avg_ntp_loss: 0.8100 global_avg_top_loss: 2.0244 +[titan] 2025-09-09 20:41:38,369 - root - INFO - lr: 5.8259e-06 gnorm: 0.46 [2 days, 3:06:11<22:06:37] +[titan] 2025-09-09 20:42:10,335 - root - INFO - step: 27925 loss: 2.6726 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7456 global_avg_top_loss: 1.9271 +[titan] 2025-09-09 20:42:10,335 - root - INFO - lr: 5.8230e-06 gnorm: 0.38 [2 days, 3:06:43<22:06:04] +[titan] 2025-09-09 20:42:42,202 - root - INFO - step: 27930 loss: 2.7168 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.7630 global_avg_top_loss: 1.9538 +[titan] 
2025-09-09 20:42:42,202 - root - INFO - lr: 5.8201e-06 gnorm: 0.37 [2 days, 3:07:14<22:05:31] +[titan] 2025-09-09 20:43:14,355 - root - INFO - step: 27935 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.72 mfu: 49.11% global_avg_ntp_loss: 0.7617 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 20:43:14,355 - root - INFO - lr: 5.8172e-06 gnorm: 0.44 [2 days, 3:07:47<22:04:57] +[titan] 2025-09-09 20:43:46,304 - root - INFO - step: 27940 loss: 2.7714 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 20:43:46,305 - root - INFO - lr: 5.8143e-06 gnorm: 0.40 [2 days, 3:08:19<22:04:24] +[titan] 2025-09-09 20:44:18,548 - root - INFO - step: 27945 loss: 2.6450 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.34 mfu: 48.97% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9133 +[titan] 2025-09-09 20:44:18,549 - root - INFO - lr: 5.8113e-06 gnorm: 0.40 [2 days, 3:08:51<22:03:51] +[titan] 2025-09-09 20:44:44,211 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:44:50,577 - root - INFO - step: 27950 loss: 2.5807 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7058 global_avg_top_loss: 1.8749 +[titan] 2025-09-09 20:44:50,578 - root - INFO - lr: 5.8084e-06 gnorm: 0.36 [2 days, 3:09:23<22:03:17] +[titan] 2025-09-09 20:45:22,546 - root - INFO - step: 27955 loss: 2.5778 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8769 +[titan] 2025-09-09 20:45:22,546 - root - INFO - lr: 5.8055e-06 gnorm: 0.38 [2 days, 3:09:55<22:02:44] +[titan] 2025-09-09 20:45:54,501 - root - INFO - step: 27960 loss: 2.7177 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 20:45:54,501 - root - INFO - lr: 5.8026e-06 gnorm: 0.39 [2 days, 3:10:27<22:02:11] +[titan] 2025-09-09 20:46:26,433 - root - INFO - step: 27965 loss: 2.7119 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9475 +[titan] 2025-09-09 20:46:26,433 - root - INFO - lr: 5.7997e-06 gnorm: 0.38 [2 days, 3:10:59<22:01:37] +[titan] 2025-09-09 20:46:58,520 - root - INFO - step: 27970 loss: 2.6835 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.7488 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 20:46:58,521 - root - INFO - lr: 5.7968e-06 gnorm: 0.37 [2 days, 3:11:31<22:01:04] +[titan] 2025-09-09 20:47:30,572 - root - INFO - step: 27975 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9619 +[titan] 2025-09-09 20:47:30,572 - root - INFO - lr: 5.7939e-06 gnorm: 0.37 [2 days, 3:12:03<22:00:31] +[titan] 2025-09-09 20:48:02,631 - root - INFO - step: 27980 loss: 2.6951 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9389 +[titan] 
2025-09-09 20:48:02,632 - root - INFO - lr: 5.7909e-06 gnorm: 0.36 [2 days, 3:12:35<21:59:57] +[titan] 2025-09-09 20:48:34,549 - root - INFO - step: 27985 loss: 2.7530 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7831 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 20:48:34,549 - root - INFO - lr: 5.7880e-06 gnorm: 0.39 [2 days, 3:13:07<21:59:24] +[titan] 2025-09-09 20:49:06,345 - root - INFO - step: 27990 loss: 2.7218 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9576 +[titan] 2025-09-09 20:49:06,345 - root - INFO - lr: 5.7851e-06 gnorm: 0.39 [2 days, 3:13:39<21:58:50] +[titan] 2025-09-09 20:49:38,332 - root - INFO - step: 27995 loss: 2.6885 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 20:49:38,332 - root - INFO - lr: 5.7822e-06 gnorm: 0.39 [2 days, 3:14:11<21:58:17] +[titan] 2025-09-09 20:50:03,919 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:50:10,288 - root - INFO - step: 28000 loss: 3.0002 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.9230 global_avg_top_loss: 2.0772 +[titan] 2025-09-09 20:50:10,289 - root - INFO - lr: 5.7793e-06 gnorm: 0.37 [2 days, 3:14:43<21:57:44] +[titan] 2025-09-09 20:50:42,194 - root - INFO - step: 28005 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7789 global_avg_top_loss: 1.9736 +[titan] 2025-09-09 20:50:42,194 - root - INFO - lr: 5.7764e-06 gnorm: 0.38 [2 days, 3:15:14<21:57:10] +[titan] 2025-09-09 20:51:14,075 - root - INFO - step: 28010 loss: 2.6766 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.86 mfu: 49.53% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9304 +[titan] 2025-09-09 20:51:14,075 - root - INFO - lr: 5.7735e-06 gnorm: 0.36 [2 days, 3:15:46<21:56:37] +[titan] 2025-09-09 20:51:46,090 - root - INFO - step: 28015 loss: 2.7187 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7629 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 20:51:46,090 - root - INFO - lr: 5.7706e-06 gnorm: 0.37 [2 days, 3:16:18<21:56:04] +[titan] 2025-09-09 20:52:18,012 - root - INFO - step: 28020 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.8058 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 20:52:18,012 - root - INFO - lr: 5.7677e-06 gnorm: 0.41 [2 days, 3:16:50<21:55:30] +[titan] 2025-09-09 20:52:49,996 - root - INFO - step: 28025 loss: 2.6849 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.7529 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 20:52:49,996 - root - INFO - lr: 5.7648e-06 gnorm: 0.44 [2 days, 3:17:22<21:54:57] +[titan] 2025-09-09 20:53:21,970 - root - INFO - step: 28030 loss: 2.6014 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7126 global_avg_top_loss: 1.8888 +[titan] 
2025-09-09 20:53:21,970 - root - INFO - lr: 5.7619e-06 gnorm: 0.36 [2 days, 3:17:54<21:54:23] +[titan] 2025-09-09 20:53:54,070 - root - INFO - step: 28035 loss: 2.7625 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.52 mfu: 49.19% global_avg_ntp_loss: 0.8031 global_avg_top_loss: 1.9594 +[titan] 2025-09-09 20:53:54,070 - root - INFO - lr: 5.7590e-06 gnorm: 0.38 [2 days, 3:18:26<21:53:50] +[titan] 2025-09-09 20:54:26,025 - root - INFO - step: 28040 loss: 2.6993 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7587 global_avg_top_loss: 1.9407 +[titan] 2025-09-09 20:54:26,025 - root - INFO - lr: 5.7561e-06 gnorm: 0.40 [2 days, 3:18:58<21:53:17] +[titan] 2025-09-09 20:54:58,025 - root - INFO - step: 28045 loss: 2.6724 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.7497 global_avg_top_loss: 1.9227 +[titan] 2025-09-09 20:54:58,026 - root - INFO - lr: 5.7532e-06 gnorm: 0.38 [2 days, 3:19:30<21:52:43] +[titan] 2025-09-09 20:55:23,611 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:55:30,087 - root - INFO - step: 28050 loss: 2.7747 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7913 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 20:55:30,087 - root - INFO - lr: 5.7503e-06 gnorm: 0.37 [2 days, 3:20:02<21:52:10] +[titan] 2025-09-09 20:56:01,944 - root - INFO - step: 28055 loss: 3.0967 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.9778 global_avg_top_loss: 2.1190 +[titan] 2025-09-09 20:56:01,944 - root - INFO - lr: 5.7474e-06 gnorm: 0.44 [2 days, 3:20:34<21:51:37] +[titan] 2025-09-09 20:56:33,947 - root - INFO - step: 28060 loss: 2.6766 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 20:56:33,947 - root - INFO - lr: 5.7445e-06 gnorm: 0.38 [2 days, 3:21:06<21:51:03] +[titan] 2025-09-09 20:57:06,122 - root - INFO - step: 28065 loss: 2.6681 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.38 mfu: 49.08% global_avg_ntp_loss: 0.7432 global_avg_top_loss: 1.9250 +[titan] 2025-09-09 20:57:06,122 - root - INFO - lr: 5.7416e-06 gnorm: 0.37 [2 days, 3:21:38<21:50:30] +[titan] 2025-09-09 20:57:38,033 - root - INFO - step: 28070 loss: 2.6305 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.7281 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 20:57:38,034 - root - INFO - lr: 5.7387e-06 gnorm: 0.36 [2 days, 3:22:10<21:49:57] +[titan] 2025-09-09 20:58:10,127 - root - INFO - step: 28075 loss: 2.7710 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.62 mfu: 49.20% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 20:58:10,127 - root - INFO - lr: 5.7358e-06 gnorm: 0.38 [2 days, 3:22:42<21:49:23] +[titan] 2025-09-09 20:58:42,068 - root - INFO - step: 28080 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9699 +[titan] 
2025-09-09 20:58:42,068 - root - INFO - lr: 5.7329e-06 gnorm: 0.39 [2 days, 3:23:14<21:48:50] +[titan] 2025-09-09 20:59:13,973 - root - INFO - step: 28085 loss: 2.6655 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9250 +[titan] 2025-09-09 20:59:13,973 - root - INFO - lr: 5.7300e-06 gnorm: 0.38 [2 days, 3:23:46<21:48:17] +[titan] 2025-09-09 20:59:45,792 - root - INFO - step: 28090 loss: 2.8909 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.8568 global_avg_top_loss: 2.0341 +[titan] 2025-09-09 20:59:45,792 - root - INFO - lr: 5.7271e-06 gnorm: 0.37 [2 days, 3:24:18<21:47:43] +[titan] 2025-09-09 21:00:17,734 - root - INFO - step: 28095 loss: 2.7080 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9481 +[titan] 2025-09-09 21:00:17,735 - root - INFO - lr: 5.7242e-06 gnorm: 0.36 [2 days, 3:24:50<21:47:10] +[titan] 2025-09-09 21:00:43,421 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:00:50,019 - root - INFO - step: 28100 loss: 3.1012 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.74 mfu: 48.91% global_avg_ntp_loss: 0.9895 global_avg_top_loss: 2.1117 +[titan] 2025-09-09 21:00:50,019 - root - INFO - lr: 5.7213e-06 gnorm: 0.46 [2 days, 3:25:22<21:46:37] +[titan] 2025-09-09 21:01:21,923 - root - INFO - step: 28105 loss: 2.6933 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7546 global_avg_top_loss: 1.9386 +[titan] 2025-09-09 21:01:21,923 - root - INFO - lr: 5.7184e-06 gnorm: 0.38 [2 days, 3:25:54<21:46:03] +[titan] 2025-09-09 21:01:53,924 - root - INFO - step: 28110 loss: 2.6600 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.34% global_avg_ntp_loss: 0.7388 global_avg_top_loss: 1.9212 +[titan] 2025-09-09 21:01:53,925 - root - INFO - lr: 5.7155e-06 gnorm: 0.39 [2 days, 3:26:26<21:45:30] +[titan] 2025-09-09 21:02:25,993 - root - INFO - step: 28115 loss: 2.7463 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7790 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 21:02:25,994 - root - INFO - lr: 5.7126e-06 gnorm: 0.38 [2 days, 3:26:58<21:44:57] +[titan] 2025-09-09 21:02:57,892 - root - INFO - step: 28120 loss: 2.7644 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.7905 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 21:02:57,893 - root - INFO - lr: 5.7098e-06 gnorm: 0.38 [2 days, 3:27:30<21:44:23] +[titan] 2025-09-09 21:03:29,957 - root - INFO - step: 28125 loss: 2.7378 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 21:03:29,957 - root - INFO - lr: 5.7069e-06 gnorm: 0.38 [2 days, 3:28:02<21:43:50] +[titan] 2025-09-09 21:04:02,044 - root - INFO - step: 28130 loss: 2.7900 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7982 global_avg_top_loss: 1.9918 +[titan] 
2025-09-09 21:04:02,044 - root - INFO - lr: 5.7040e-06 gnorm: 0.38 [2 days, 3:28:34<21:43:17] +[titan] 2025-09-09 21:04:33,945 - root - INFO - step: 28135 loss: 2.6533 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.7391 global_avg_top_loss: 1.9142 +[titan] 2025-09-09 21:04:33,945 - root - INFO - lr: 5.7011e-06 gnorm: 0.38 [2 days, 3:29:06<21:42:43] +[titan] 2025-09-09 21:05:05,868 - root - INFO - step: 28140 loss: 2.7795 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 0.7961 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 21:05:05,868 - root - INFO - lr: 5.6982e-06 gnorm: 0.39 [2 days, 3:29:38<21:42:10] +[titan] 2025-09-09 21:05:37,770 - root - INFO - step: 28145 loss: 2.7587 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 21:05:37,771 - root - INFO - lr: 5.6953e-06 gnorm: 0.40 [2 days, 3:30:10<21:41:37] +[titan] 2025-09-09 21:06:03,272 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:06:09,730 - root - INFO - step: 28150 loss: 2.7301 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 21:06:09,730 - root - INFO - lr: 5.6924e-06 gnorm: 0.42 [2 days, 3:30:42<21:41:03] +[titan] 2025-09-09 21:06:41,503 - root - INFO - step: 28155 loss: 2.6656 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.52 mfu: 49.70% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9220 +[titan] 2025-09-09 21:06:41,503 - root - INFO - lr: 5.6896e-06 gnorm: 0.38 [2 days, 3:31:14<21:40:30] +[titan] 2025-09-09 21:07:13,680 - root - INFO - step: 28160 loss: 2.5493 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.34 mfu: 49.07% global_avg_ntp_loss: 0.6903 global_avg_top_loss: 1.8590 +[titan] 2025-09-09 21:07:13,681 - root - INFO - lr: 5.6867e-06 gnorm: 0.36 [2 days, 3:31:46<21:39:57] +[titan] 2025-09-09 21:07:13,936 - root - INFO - Dumping profiler traces at step 28160 +[titan] 2025-09-09 21:07:13,995 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 21:07:45,964 - root - INFO - step: 28165 loss: 2.7377 memory: 122.03GiB(87.57%) tps: 10,150 tflops: 483.76 mfu: 48.91% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9627 +[titan] 2025-09-09 21:07:45,964 - root - INFO - lr: 5.6838e-06 gnorm: 0.39 [2 days, 3:32:18<21:39:23] +[titan] 2025-09-09 21:08:17,852 - root - INFO - step: 28170 loss: 2.7077 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9472 +[titan] 2025-09-09 21:08:17,852 - root - INFO - lr: 5.6809e-06 gnorm: 0.37 [2 days, 3:32:50<21:38:50] +[titan] 2025-09-09 21:08:49,947 - root - INFO - step: 28175 loss: 2.6875 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7510 global_avg_top_loss: 1.9366 +[titan] 2025-09-09 21:08:49,947 - root - INFO - lr: 5.6780e-06 gnorm: 0.37 [2 days, 3:33:22<21:38:17] +[titan] 2025-09-09 
21:09:21,907 - root - INFO - step: 28180 loss: 2.7553 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 21:09:21,907 - root - INFO - lr: 5.6752e-06 gnorm: 0.37 [2 days, 3:33:54<21:37:43] +[titan] 2025-09-09 21:09:53,779 - root - INFO - step: 28185 loss: 2.7911 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 0.8002 global_avg_top_loss: 1.9909 +[titan] 2025-09-09 21:09:53,779 - root - INFO - lr: 5.6723e-06 gnorm: 0.41 [2 days, 3:34:26<21:37:10] +[titan] 2025-09-09 21:10:25,646 - root - INFO - step: 28190 loss: 2.6504 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.7356 global_avg_top_loss: 1.9148 +[titan] 2025-09-09 21:10:25,646 - root - INFO - lr: 5.6694e-06 gnorm: 0.37 [2 days, 3:34:58<21:36:37] +[titan] 2025-09-09 21:10:57,552 - root - INFO - step: 28195 loss: 2.6239 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.9017 +[titan] 2025-09-09 21:10:57,553 - root - INFO - lr: 5.6665e-06 gnorm: 0.40 [2 days, 3:35:30<21:36:03] +[titan] 2025-09-09 21:11:23,102 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:11:29,443 - root - INFO - step: 28200 loss: 2.8591 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.8402 global_avg_top_loss: 2.0190 +[titan] 2025-09-09 21:11:29,443 - root - INFO - lr: 5.6637e-06 gnorm: 0.38 [2 days, 3:36:02<21:35:30] +[titan] 2025-09-09 21:12:01,378 - root - INFO - step: 28205 loss: 2.6971 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9371 +[titan] 2025-09-09 21:12:01,378 - root - INFO - lr: 5.6608e-06 gnorm: 0.39 [2 days, 3:36:34<21:34:56] +[titan] 2025-09-09 21:12:33,382 - root - INFO - step: 28210 loss: 2.6892 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 21:12:33,383 - root - INFO - lr: 5.6579e-06 gnorm: 0.38 [2 days, 3:37:06<21:34:23] +[titan] 2025-09-09 21:13:05,226 - root - INFO - step: 28215 loss: 2.6069 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.7227 global_avg_top_loss: 1.8842 +[titan] 2025-09-09 21:13:05,227 - root - INFO - lr: 5.6550e-06 gnorm: 0.39 [2 days, 3:37:37<21:33:50] +[titan] 2025-09-09 21:13:37,315 - root - INFO - step: 28220 loss: 2.7630 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.7858 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 21:13:37,316 - root - INFO - lr: 5.6522e-06 gnorm: 0.38 [2 days, 3:38:10<21:33:16] +[titan] 2025-09-09 21:14:09,293 - root - INFO - step: 28225 loss: 2.7865 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9858 +[titan] 2025-09-09 21:14:09,293 - root - INFO - lr: 5.6493e-06 gnorm: 0.38 [2 days, 3:38:42<21:32:43] +[titan] 2025-09-09 21:14:41,201 - root - INFO - step: 28230 loss: 2.6685 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9270 +[titan] 
2025-09-09 21:14:41,201 - root - INFO - lr: 5.6464e-06 gnorm: 0.37 [2 days, 3:39:13<21:32:10] +[titan] 2025-09-09 21:15:13,119 - root - INFO - step: 28235 loss: 2.7414 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9676 +[titan] 2025-09-09 21:15:13,120 - root - INFO - lr: 5.6436e-06 gnorm: 0.37 [2 days, 3:39:45<21:31:36] +[titan] 2025-09-09 21:15:45,139 - root - INFO - step: 28240 loss: 2.7126 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7607 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 21:15:45,139 - root - INFO - lr: 5.6407e-06 gnorm: 0.37 [2 days, 3:40:17<21:31:03] +[titan] 2025-09-09 21:16:17,076 - root - INFO - step: 28245 loss: 2.7356 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7783 global_avg_top_loss: 1.9573 +[titan] 2025-09-09 21:16:17,076 - root - INFO - lr: 5.6378e-06 gnorm: 0.37 [2 days, 3:40:49<21:30:30] +[titan] 2025-09-09 21:16:42,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:16:49,094 - root - INFO - step: 28250 loss: 2.6597 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.7406 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 21:16:49,094 - root - INFO - lr: 5.6350e-06 gnorm: 0.38 [2 days, 3:41:21<21:29:56] +[titan] 2025-09-09 21:17:21,153 - root - INFO - step: 28255 loss: 2.8276 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.8282 global_avg_top_loss: 1.9994 +[titan] 2025-09-09 21:17:21,154 - root - INFO - lr: 5.6321e-06 gnorm: 0.39 [2 days, 3:41:53<21:29:23] +[titan] 2025-09-09 21:17:52,901 - root - INFO - step: 28260 loss: 2.6914 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.91 mfu: 49.74% global_avg_ntp_loss: 0.7553 global_avg_top_loss: 1.9361 +[titan] 2025-09-09 21:17:52,902 - root - INFO - lr: 5.6292e-06 gnorm: 0.38 [2 days, 3:42:25<21:28:50] +[titan] 2025-09-09 21:18:24,838 - root - INFO - step: 28265 loss: 2.8613 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.8460 global_avg_top_loss: 2.0153 +[titan] 2025-09-09 21:18:24,838 - root - INFO - lr: 5.6264e-06 gnorm: 0.38 [2 days, 3:42:57<21:28:16] +[titan] 2025-09-09 21:18:56,741 - root - INFO - step: 28270 loss: 2.5910 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.8774 +[titan] 2025-09-09 21:18:56,742 - root - INFO - lr: 5.6235e-06 gnorm: 0.37 [2 days, 3:43:29<21:27:43] +[titan] 2025-09-09 21:19:28,821 - root - INFO - step: 28275 loss: 2.6543 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7392 global_avg_top_loss: 1.9151 +[titan] 2025-09-09 21:19:28,821 - root - INFO - lr: 5.6206e-06 gnorm: 0.38 [2 days, 3:44:01<21:27:10] +[titan] 2025-09-09 21:20:00,561 - root - INFO - step: 28280 loss: 3.3227 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.04 mfu: 49.75% global_avg_ntp_loss: 1.1079 global_avg_top_loss: 2.2148 +[titan] 
2025-09-09 21:20:00,561 - root - INFO - lr: 5.6178e-06 gnorm: 0.40 [2 days, 3:44:33<21:26:36] +[titan] 2025-09-09 21:20:32,501 - root - INFO - step: 28285 loss: 2.7512 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 21:20:32,501 - root - INFO - lr: 5.6149e-06 gnorm: 0.37 [2 days, 3:45:05<21:26:03] +[titan] 2025-09-09 21:21:04,572 - root - INFO - step: 28290 loss: 2.5216 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.6776 global_avg_top_loss: 1.8440 +[titan] 2025-09-09 21:21:04,572 - root - INFO - lr: 5.6120e-06 gnorm: 0.36 [2 days, 3:45:37<21:25:30] +[titan] 2025-09-09 21:21:36,332 - root - INFO - step: 28295 loss: 2.6873 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.73 mfu: 49.72% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 21:21:36,332 - root - INFO - lr: 5.6092e-06 gnorm: 0.40 [2 days, 3:46:09<21:24:56] +[titan] 2025-09-09 21:22:02,062 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:22:08,495 - root - INFO - step: 28300 loss: 2.6765 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.7471 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 21:22:08,495 - root - INFO - lr: 5.6063e-06 gnorm: 0.37 [2 days, 3:46:41<21:24:23] +[titan] 2025-09-09 21:22:40,484 - root - INFO - step: 28305 loss: 2.7023 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9433 +[titan] 2025-09-09 21:22:40,484 - root - INFO - lr: 5.6035e-06 gnorm: 0.38 [2 days, 3:47:13<21:23:50] +[titan] 2025-09-09 21:23:12,475 - root - INFO - step: 28310 loss: 2.8013 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.8024 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 21:23:12,475 - root - INFO - lr: 5.6006e-06 gnorm: 0.39 [2 days, 3:47:45<21:23:16] +[titan] 2025-09-09 21:23:44,550 - root - INFO - step: 28315 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9630 +[titan] 2025-09-09 21:23:44,550 - root - INFO - lr: 5.5978e-06 gnorm: 0.38 [2 days, 3:48:17<21:22:43] +[titan] 2025-09-09 21:24:16,614 - root - INFO - step: 28320 loss: 2.6325 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7306 global_avg_top_loss: 1.9019 +[titan] 2025-09-09 21:24:16,615 - root - INFO - lr: 5.5949e-06 gnorm: 0.36 [2 days, 3:48:49<21:22:10] +[titan] 2025-09-09 21:24:48,574 - root - INFO - step: 28325 loss: 2.7379 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 21:24:48,575 - root - INFO - lr: 5.5921e-06 gnorm: 0.38 [2 days, 3:49:21<21:21:36] +[titan] 2025-09-09 21:25:20,513 - root - INFO - step: 28330 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.98 mfu: 49.44% global_avg_ntp_loss: 0.7656 global_avg_top_loss: 1.9542 +[titan] 
2025-09-09 21:25:20,513 - root - INFO - lr: 5.5892e-06 gnorm: 0.37 [2 days, 3:49:53<21:21:03] +[titan] 2025-09-09 21:25:52,437 - root - INFO - step: 28335 loss: 2.7184 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 21:25:52,438 - root - INFO - lr: 5.5863e-06 gnorm: 0.37 [2 days, 3:50:25<21:20:30] +[titan] 2025-09-09 21:26:24,421 - root - INFO - step: 28340 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.7835 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 21:26:24,421 - root - INFO - lr: 5.5835e-06 gnorm: 0.39 [2 days, 3:50:57<21:19:56] +[titan] 2025-09-09 21:26:56,483 - root - INFO - step: 28345 loss: 2.7355 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 21:26:56,483 - root - INFO - lr: 5.5806e-06 gnorm: 0.41 [2 days, 3:51:29<21:19:23] +[titan] 2025-09-09 21:27:21,808 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:27:28,257 - root - INFO - step: 28350 loss: 2.6791 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.50 mfu: 49.70% global_avg_ntp_loss: 0.7472 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 21:27:28,257 - root - INFO - lr: 5.5778e-06 gnorm: 0.38 [2 days, 3:52:00<21:18:50] +[titan] 2025-09-09 21:28:00,207 - root - INFO - step: 28355 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7695 global_avg_top_loss: 1.9543 +[titan] 2025-09-09 21:28:00,207 - root - INFO - lr: 5.5749e-06 gnorm: 0.38 [2 days, 3:52:32<21:18:16] +[titan] 2025-09-09 21:28:32,170 - root - INFO - step: 28360 loss: 3.3545 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 1.1257 global_avg_top_loss: 2.2288 +[titan] 2025-09-09 21:28:32,171 - root - INFO - lr: 5.5721e-06 gnorm: 0.37 [2 days, 3:53:04<21:17:43] +[titan] 2025-09-09 21:29:04,364 - root - INFO - step: 28365 loss: 2.6643 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.10 mfu: 49.05% global_avg_ntp_loss: 0.7408 global_avg_top_loss: 1.9235 +[titan] 2025-09-09 21:29:04,364 - root - INFO - lr: 5.5693e-06 gnorm: 0.37 [2 days, 3:53:37<21:17:10] +[titan] 2025-09-09 21:29:36,263 - root - INFO - step: 28370 loss: 2.7348 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 21:29:36,263 - root - INFO - lr: 5.5664e-06 gnorm: 0.44 [2 days, 3:54:09<21:16:36] +[titan] 2025-09-09 21:30:08,276 - root - INFO - step: 28375 loss: 2.5622 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8613 +[titan] 2025-09-09 21:30:08,276 - root - INFO - lr: 5.5636e-06 gnorm: 0.44 [2 days, 3:54:41<21:16:03] +[titan] 2025-09-09 21:30:40,407 - root - INFO - step: 28380 loss: 2.7838 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.04 mfu: 49.14% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 1.9770 +[titan] 
2025-09-09 21:30:40,408 - root - INFO - lr: 5.5607e-06 gnorm: 0.40 [2 days, 3:55:13<21:15:30] +[titan] 2025-09-09 21:31:12,132 - root - INFO - step: 28385 loss: 2.6302 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.28 mfu: 49.78% global_avg_ntp_loss: 0.7225 global_avg_top_loss: 1.9078 +[titan] 2025-09-09 21:31:12,132 - root - INFO - lr: 5.5579e-06 gnorm: 0.42 [2 days, 3:55:44<21:14:56] +[titan] 2025-09-09 21:31:44,055 - root - INFO - step: 28390 loss: 2.8550 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 0.8501 global_avg_top_loss: 2.0049 +[titan] 2025-09-09 21:31:44,056 - root - INFO - lr: 5.5550e-06 gnorm: 0.39 [2 days, 3:56:16<21:14:23] +[titan] 2025-09-09 21:32:15,879 - root - INFO - step: 28395 loss: 2.6919 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9360 +[titan] 2025-09-09 21:32:15,880 - root - INFO - lr: 5.5522e-06 gnorm: 0.43 [2 days, 3:56:48<21:13:50] +[titan] 2025-09-09 21:32:41,420 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:32:47,854 - root - INFO - step: 28400 loss: 2.5807 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7034 global_avg_top_loss: 1.8773 +[titan] 2025-09-09 21:32:47,854 - root - INFO - lr: 5.5493e-06 gnorm: 0.42 [2 days, 3:57:20<21:13:16] +[titan] 2025-09-09 21:33:19,784 - root - INFO - step: 28405 loss: 2.7164 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9526 +[titan] 2025-09-09 21:33:19,784 - root - INFO - lr: 5.5465e-06 gnorm: 0.37 [2 days, 3:57:52<21:12:43] +[titan] 2025-09-09 21:33:51,752 - root - INFO - step: 28410 loss: 3.0325 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.9399 global_avg_top_loss: 2.0926 +[titan] 2025-09-09 21:33:51,752 - root - INFO - lr: 5.5437e-06 gnorm: 0.40 [2 days, 3:58:24<21:12:10] +[titan] 2025-09-09 21:34:23,754 - root - INFO - step: 28415 loss: 2.7711 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9824 +[titan] 2025-09-09 21:34:23,754 - root - INFO - lr: 5.5408e-06 gnorm: 0.38 [2 days, 3:58:56<21:11:36] +[titan] 2025-09-09 21:34:55,955 - root - INFO - step: 28420 loss: 2.7472 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.99 mfu: 49.04% global_avg_ntp_loss: 0.7821 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 21:34:55,955 - root - INFO - lr: 5.5380e-06 gnorm: 0.37 [2 days, 3:59:28<21:11:03] +[titan] 2025-09-09 21:35:27,769 - root - INFO - step: 28425 loss: 2.8168 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.63% global_avg_ntp_loss: 0.8165 global_avg_top_loss: 2.0003 +[titan] 2025-09-09 21:35:27,769 - root - INFO - lr: 5.5352e-06 gnorm: 0.39 [2 days, 4:00:00<21:10:30] +[titan] 2025-09-09 21:35:59,676 - root - INFO - step: 28430 loss: 2.8166 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.8164 global_avg_top_loss: 2.0003 +[titan] 
2025-09-09 21:35:59,676 - root - INFO - lr: 5.5323e-06 gnorm: 0.38 [2 days, 4:00:32<21:09:56] +[titan] 2025-09-09 21:36:31,635 - root - INFO - step: 28435 loss: 2.6888 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7517 global_avg_top_loss: 1.9371 +[titan] 2025-09-09 21:36:31,636 - root - INFO - lr: 5.5295e-06 gnorm: 0.37 [2 days, 4:01:04<21:09:23] +[titan] 2025-09-09 21:37:03,606 - root - INFO - step: 28440 loss: 3.1246 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 1.0049 global_avg_top_loss: 2.1197 +[titan] 2025-09-09 21:37:03,607 - root - INFO - lr: 5.5266e-06 gnorm: 0.42 [2 days, 4:01:36<21:08:50] +[titan] 2025-09-09 21:37:35,461 - root - INFO - step: 28445 loss: 2.7246 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.27 mfu: 49.57% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 21:37:35,461 - root - INFO - lr: 5.5238e-06 gnorm: 0.38 [2 days, 4:02:08<21:08:16] +[titan] 2025-09-09 21:38:01,193 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:38:07,556 - root - INFO - step: 28450 loss: 2.9222 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.8822 global_avg_top_loss: 2.0400 +[titan] 2025-09-09 21:38:07,556 - root - INFO - lr: 5.5210e-06 gnorm: 0.39 [2 days, 4:02:40<21:07:43] +[titan] 2025-09-09 21:38:39,330 - root - INFO - step: 28455 loss: 2.6796 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.50 mfu: 49.70% global_avg_ntp_loss: 0.7506 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 21:38:39,330 - root - INFO - lr: 5.5182e-06 gnorm: 0.43 [2 days, 4:03:12<21:07:10] +[titan] 2025-09-09 21:39:11,175 - root - INFO - step: 28460 loss: 2.7121 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.7631 global_avg_top_loss: 1.9491 +[titan] 2025-09-09 21:39:11,176 - root - INFO - lr: 5.5153e-06 gnorm: 0.40 [2 days, 4:03:43<21:06:36] +[titan] 2025-09-09 21:39:43,199 - root - INFO - step: 28465 loss: 2.6685 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.7458 global_avg_top_loss: 1.9226 +[titan] 2025-09-09 21:39:43,200 - root - INFO - lr: 5.5125e-06 gnorm: 0.37 [2 days, 4:04:15<21:06:03] +[titan] 2025-09-09 21:40:15,359 - root - INFO - step: 28470 loss: 2.7464 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9698 +[titan] 2025-09-09 21:40:15,359 - root - INFO - lr: 5.5097e-06 gnorm: 0.38 [2 days, 4:04:48<21:05:30] +[titan] 2025-09-09 21:40:47,551 - root - INFO - step: 28475 loss: 2.6765 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.12 mfu: 49.05% global_avg_ntp_loss: 0.7455 global_avg_top_loss: 1.9310 +[titan] 2025-09-09 21:40:47,551 - root - INFO - lr: 5.5068e-06 gnorm: 0.38 [2 days, 4:05:20<21:04:57] +[titan] 2025-09-09 21:41:19,545 - root - INFO - step: 28480 loss: 2.7187 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9511 +[titan] 
2025-09-09 21:41:19,545 - root - INFO - lr: 5.5040e-06 gnorm: 0.38 [2 days, 4:05:52<21:04:23] +[titan] 2025-09-09 21:41:51,389 - root - INFO - step: 28485 loss: 2.6657 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.7411 global_avg_top_loss: 1.9245 +[titan] 2025-09-09 21:41:51,389 - root - INFO - lr: 5.5012e-06 gnorm: 0.37 [2 days, 4:06:24<21:03:50] +[titan] 2025-09-09 21:42:23,178 - root - INFO - step: 28490 loss: 3.2187 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.28 mfu: 49.67% global_avg_ntp_loss: 1.0470 global_avg_top_loss: 2.1716 +[titan] 2025-09-09 21:42:23,178 - root - INFO - lr: 5.4984e-06 gnorm: 0.38 [2 days, 4:06:55<21:03:17] +[titan] 2025-09-09 21:42:55,118 - root - INFO - step: 28495 loss: 2.6664 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9248 +[titan] 2025-09-09 21:42:55,118 - root - INFO - lr: 5.4955e-06 gnorm: 0.37 [2 days, 4:07:27<21:02:43] +[titan] 2025-09-09 21:43:20,653 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:43:27,078 - root - INFO - step: 28500 loss: 2.7225 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 21:43:27,078 - root - INFO - lr: 5.4927e-06 gnorm: 0.38 [2 days, 4:07:59<21:02:10] +[titan] 2025-09-09 21:43:59,033 - root - INFO - step: 28505 loss: 2.6893 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 21:43:59,033 - root - INFO - lr: 5.4899e-06 gnorm: 0.38 [2 days, 4:08:31<21:01:37] +[titan] 2025-09-09 21:44:30,839 - root - INFO - step: 28510 loss: 2.7577 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 21:44:30,839 - root - INFO - lr: 5.4871e-06 gnorm: 0.38 [2 days, 4:09:03<21:01:03] +[titan] 2025-09-09 21:45:03,153 - root - INFO - step: 28515 loss: 2.6288 memory: 122.03GiB(87.57%) tps: 10,141 tflops: 483.29 mfu: 48.87% global_avg_ntp_loss: 0.7264 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 21:45:03,153 - root - INFO - lr: 5.4842e-06 gnorm: 0.39 [2 days, 4:09:35<21:00:30] +[titan] 2025-09-09 21:45:35,081 - root - INFO - step: 28520 loss: 3.2211 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 1.0502 global_avg_top_loss: 2.1709 +[titan] 2025-09-09 21:45:35,081 - root - INFO - lr: 5.4814e-06 gnorm: 0.39 [2 days, 4:10:07<20:59:57] +[titan] 2025-09-09 21:46:06,973 - root - INFO - step: 28525 loss: 2.8803 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.8427 global_avg_top_loss: 2.0376 +[titan] 2025-09-09 21:46:06,973 - root - INFO - lr: 5.4786e-06 gnorm: 0.39 [2 days, 4:10:39<20:59:23] +[titan] 2025-09-09 21:46:38,919 - root - INFO - step: 28530 loss: 2.5020 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6680 global_avg_top_loss: 1.8340 +[titan] 
2025-09-09 21:46:38,919 - root - INFO - lr: 5.4758e-06 gnorm: 0.36 [2 days, 4:11:11<20:58:50] +[titan] 2025-09-09 21:47:10,823 - root - INFO - step: 28535 loss: 3.1806 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 1.0301 global_avg_top_loss: 2.1505 +[titan] 2025-09-09 21:47:10,824 - root - INFO - lr: 5.4730e-06 gnorm: 0.38 [2 days, 4:11:43<20:58:17] +[titan] 2025-09-09 21:47:42,767 - root - INFO - step: 28540 loss: 2.7551 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.7848 global_avg_top_loss: 1.9703 +[titan] 2025-09-09 21:47:42,767 - root - INFO - lr: 5.4701e-06 gnorm: 0.38 [2 days, 4:12:15<20:57:43] +[titan] 2025-09-09 21:48:14,745 - root - INFO - step: 28545 loss: 2.6349 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7264 global_avg_top_loss: 1.9085 +[titan] 2025-09-09 21:48:14,746 - root - INFO - lr: 5.4673e-06 gnorm: 0.38 [2 days, 4:12:47<20:57:10] +[titan] 2025-09-09 21:48:40,189 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:48:46,627 - root - INFO - step: 28550 loss: 2.7167 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.85 mfu: 49.53% global_avg_ntp_loss: 0.7678 global_avg_top_loss: 1.9489 +[titan] 2025-09-09 21:48:46,628 - root - INFO - lr: 5.4645e-06 gnorm: 0.39 [2 days, 4:13:19<20:56:37] +[titan] 2025-09-09 21:49:18,585 - root - INFO - step: 28555 loss: 2.6226 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7200 global_avg_top_loss: 1.9026 +[titan] 2025-09-09 21:49:18,585 - root - INFO - lr: 5.4617e-06 gnorm: 0.39 [2 days, 4:13:51<20:56:03] +[titan] 2025-09-09 21:49:50,577 - root - INFO - step: 28560 loss: 2.8661 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.8341 global_avg_top_loss: 2.0321 +[titan] 2025-09-09 21:49:50,577 - root - INFO - lr: 5.4589e-06 gnorm: 0.41 [2 days, 4:14:23<20:55:30] +[titan] 2025-09-09 21:50:22,525 - root - INFO - step: 28565 loss: 2.5915 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7055 global_avg_top_loss: 1.8859 +[titan] 2025-09-09 21:50:22,525 - root - INFO - lr: 5.4561e-06 gnorm: 0.39 [2 days, 4:14:55<20:54:57] +[titan] 2025-09-09 21:50:54,501 - root - INFO - step: 28570 loss: 3.1971 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 1.0341 global_avg_top_loss: 2.1630 +[titan] 2025-09-09 21:50:54,501 - root - INFO - lr: 5.4533e-06 gnorm: 0.51 [2 days, 4:15:27<20:54:24] +[titan] 2025-09-09 21:51:26,696 - root - INFO - step: 28575 loss: 2.6941 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9385 +[titan] 2025-09-09 21:51:26,697 - root - INFO - lr: 5.4504e-06 gnorm: 0.38 [2 days, 4:15:59<20:53:50] +[titan] 2025-09-09 21:51:58,672 - root - INFO - step: 28580 loss: 2.6588 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.38% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9174 +[titan] 
2025-09-09 21:51:58,672 - root - INFO - lr: 5.4476e-06 gnorm: 0.40 [2 days, 4:16:31<20:53:17] +[titan] 2025-09-09 21:52:30,508 - root - INFO - step: 28585 loss: 3.1974 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 1.0373 global_avg_top_loss: 2.1600 +[titan] 2025-09-09 21:52:30,508 - root - INFO - lr: 5.4448e-06 gnorm: 0.39 [2 days, 4:17:03<20:52:44] +[titan] 2025-09-09 21:53:02,463 - root - INFO - step: 28590 loss: 2.7603 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7827 global_avg_top_loss: 1.9776 +[titan] 2025-09-09 21:53:02,463 - root - INFO - lr: 5.4420e-06 gnorm: 0.40 [2 days, 4:17:35<20:52:10] +[titan] 2025-09-09 21:53:34,353 - root - INFO - step: 28595 loss: 2.7405 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 21:53:34,353 - root - INFO - lr: 5.4392e-06 gnorm: 0.40 [2 days, 4:18:07<20:51:37] +[titan] 2025-09-09 21:54:00,247 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:54:06,571 - root - INFO - step: 28600 loss: 3.0858 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.74 mfu: 49.01% global_avg_ntp_loss: 0.9849 global_avg_top_loss: 2.1009 +[titan] 2025-09-09 21:54:06,571 - root - INFO - lr: 5.4364e-06 gnorm: 0.38 [2 days, 4:18:39<20:51:04] +[titan] 2025-09-09 21:54:38,497 - root - INFO - step: 28605 loss: 2.6908 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.7542 global_avg_top_loss: 1.9367 +[titan] 2025-09-09 21:54:38,498 - root - INFO - lr: 5.4336e-06 gnorm: 0.38 [2 days, 4:19:11<20:50:31] +[titan] 2025-09-09 21:55:10,492 - root - INFO - step: 28610 loss: 2.7055 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.7618 global_avg_top_loss: 1.9437 +[titan] 2025-09-09 21:55:10,493 - root - INFO - lr: 5.4308e-06 gnorm: 0.38 [2 days, 4:19:43<20:49:57] +[titan] 2025-09-09 21:55:42,512 - root - INFO - step: 28615 loss: 3.1941 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 1.0331 global_avg_top_loss: 2.1610 +[titan] 2025-09-09 21:55:42,512 - root - INFO - lr: 5.4280e-06 gnorm: 0.40 [2 days, 4:20:15<20:49:24] +[titan] 2025-09-09 21:56:14,456 - root - INFO - step: 28620 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 21:56:14,457 - root - INFO - lr: 5.4252e-06 gnorm: 0.37 [2 days, 4:20:47<20:48:51] +[titan] 2025-09-09 21:56:46,458 - root - INFO - step: 28625 loss: 2.6829 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7492 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 21:56:46,459 - root - INFO - lr: 5.4224e-06 gnorm: 0.38 [2 days, 4:21:19<20:48:17] +[titan] 2025-09-09 21:57:18,579 - root - INFO - step: 28630 loss: 2.6641 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.20 mfu: 49.16% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9106 +[titan] 
2025-09-09 21:57:18,579 - root - INFO - lr: 5.4196e-06 gnorm: 0.41 [2 days, 4:21:51<20:47:44] +[titan] 2025-09-09 21:57:50,515 - root - INFO - step: 28635 loss: 2.6440 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 21:57:50,515 - root - INFO - lr: 5.4168e-06 gnorm: 0.37 [2 days, 4:22:23<20:47:11] +[titan] 2025-09-09 21:58:22,560 - root - INFO - step: 28640 loss: 2.6476 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7352 global_avg_top_loss: 1.9124 +[titan] 2025-09-09 21:58:22,561 - root - INFO - lr: 5.4140e-06 gnorm: 0.38 [2 days, 4:22:55<20:46:38] +[titan] 2025-09-09 21:58:54,368 - root - INFO - step: 28645 loss: 2.7637 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.99 mfu: 49.65% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9737 +[titan] 2025-09-09 21:58:54,368 - root - INFO - lr: 5.4112e-06 gnorm: 0.42 [2 days, 4:23:27<20:46:04] +[titan] 2025-09-09 21:59:20,074 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:59:26,572 - root - INFO - step: 28650 loss: 3.1641 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 1.0333 global_avg_top_loss: 2.1307 +[titan] 2025-09-09 21:59:26,572 - root - INFO - lr: 5.4084e-06 gnorm: 0.39 [2 days, 4:23:59<20:45:31] +[titan] 2025-09-09 21:59:58,660 - root - INFO - step: 28655 loss: 2.7249 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7668 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 21:59:58,660 - root - INFO - lr: 5.4056e-06 gnorm: 0.39 [2 days, 4:24:31<20:44:58] +[titan] 2025-09-09 22:00:30,606 - root - INFO - step: 28660 loss: 2.5616 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.6933 global_avg_top_loss: 1.8683 +[titan] 2025-09-09 22:00:30,606 - root - INFO - lr: 5.4028e-06 gnorm: 0.39 [2 days, 4:25:03<20:44:24] +[titan] 2025-09-09 22:01:02,609 - root - INFO - step: 28665 loss: 3.1232 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 1.0025 global_avg_top_loss: 2.1206 +[titan] 2025-09-09 22:01:02,609 - root - INFO - lr: 5.4000e-06 gnorm: 0.38 [2 days, 4:25:35<20:43:51] +[titan] 2025-09-09 22:01:34,628 - root - INFO - step: 28670 loss: 2.6952 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 22:01:34,629 - root - INFO - lr: 5.3972e-06 gnorm: 0.39 [2 days, 4:26:07<20:43:18] +[titan] 2025-09-09 22:01:47,744 - root - INFO - Dumping profiler traces at step 28672 +[titan] 2025-09-09 22:01:47,801 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 22:02:06,898 - root - INFO - step: 28675 loss: 2.6586 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.97 mfu: 48.93% global_avg_ntp_loss: 0.7408 global_avg_top_loss: 1.9179 +[titan] 2025-09-09 22:02:06,898 - root - INFO - lr: 5.3944e-06 gnorm: 0.38 [2 days, 4:26:39<20:42:45] +[titan] 2025-09-09 
22:02:38,743 - root - INFO - step: 28680 loss: 2.6551 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.7363 global_avg_top_loss: 1.9188 +[titan] 2025-09-09 22:02:38,744 - root - INFO - lr: 5.3916e-06 gnorm: 0.36 [2 days, 4:27:11<20:42:11] +[titan] 2025-09-09 22:03:10,799 - root - INFO - step: 28685 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 22:03:10,799 - root - INFO - lr: 5.3888e-06 gnorm: 0.40 [2 days, 4:27:43<20:41:38] +[titan] 2025-09-09 22:03:42,679 - root - INFO - step: 28690 loss: 2.6472 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7358 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 22:03:42,679 - root - INFO - lr: 5.3860e-06 gnorm: 0.37 [2 days, 4:28:15<20:41:05] +[titan] 2025-09-09 22:04:14,853 - root - INFO - step: 28695 loss: 3.1900 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.40 mfu: 49.08% global_avg_ntp_loss: 1.0345 global_avg_top_loss: 2.1555 +[titan] 2025-09-09 22:04:14,853 - root - INFO - lr: 5.3833e-06 gnorm: 0.37 [2 days, 4:28:47<20:40:31] +[titan] 2025-09-09 22:04:40,544 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:04:46,936 - root - INFO - step: 28700 loss: 2.7025 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7573 global_avg_top_loss: 1.9451 +[titan] 2025-09-09 22:04:46,936 - root - INFO - lr: 5.3805e-06 gnorm: 0.38 [2 days, 4:29:19<20:39:58] +[titan] 2025-09-09 22:05:19,119 - root - INFO - step: 28705 loss: 2.6819 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7505 global_avg_top_loss: 1.9314 +[titan] 2025-09-09 22:05:19,119 - root - INFO - lr: 5.3777e-06 gnorm: 0.38 [2 days, 4:29:51<20:39:25] +[titan] 2025-09-09 22:05:51,275 - root - INFO - step: 28710 loss: 2.6720 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9272 +[titan] 2025-09-09 22:05:51,276 - root - INFO - lr: 5.3749e-06 gnorm: 0.40 [2 days, 4:30:23<20:38:52] +[titan] 2025-09-09 22:06:23,208 - root - INFO - step: 28715 loss: 3.1158 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.9988 global_avg_top_loss: 2.1171 +[titan] 2025-09-09 22:06:23,209 - root - INFO - lr: 5.3721e-06 gnorm: 0.39 [2 days, 4:30:55<20:38:19] +[titan] 2025-09-09 22:06:55,018 - root - INFO - step: 28720 loss: 2.6877 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 22:06:55,018 - root - INFO - lr: 5.3693e-06 gnorm: 0.40 [2 days, 4:31:27<20:37:45] +[titan] 2025-09-09 22:07:27,132 - root - INFO - step: 28725 loss: 2.6639 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 0.7421 global_avg_top_loss: 1.9218 +[titan] 2025-09-09 22:07:27,132 - root - INFO - lr: 5.3665e-06 gnorm: 0.40 [2 days, 4:31:59<20:37:12] +[titan] 2025-09-09 22:07:58,984 - root - INFO - step: 28730 loss: 3.1616 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 1.0217 global_avg_top_loss: 2.1399 +[titan] 
2025-09-09 22:07:58,984 - root - INFO - lr: 5.3637e-06 gnorm: 0.42 [2 days, 4:32:31<20:36:39] +[titan] 2025-09-09 22:08:31,198 - root - INFO - step: 28735 loss: 2.6441 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.80 mfu: 49.02% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 22:08:31,198 - root - INFO - lr: 5.3610e-06 gnorm: 0.38 [2 days, 4:33:03<20:36:05] +[titan] 2025-09-09 22:09:03,304 - root - INFO - step: 28740 loss: 2.7205 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7682 global_avg_top_loss: 1.9522 +[titan] 2025-09-09 22:09:03,305 - root - INFO - lr: 5.3582e-06 gnorm: 0.41 [2 days, 4:33:36<20:35:32] +[titan] 2025-09-09 22:09:35,324 - root - INFO - step: 28745 loss: 3.0771 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.9833 global_avg_top_loss: 2.0938 +[titan] 2025-09-09 22:09:35,325 - root - INFO - lr: 5.3554e-06 gnorm: 0.39 [2 days, 4:34:08<20:34:59] +[titan] 2025-09-09 22:10:01,068 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:10:07,441 - root - INFO - step: 28750 loss: 2.6752 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.27 mfu: 49.17% global_avg_ntp_loss: 0.7440 global_avg_top_loss: 1.9311 +[titan] 2025-09-09 22:10:07,441 - root - INFO - lr: 5.3526e-06 gnorm: 0.38 [2 days, 4:34:40<20:34:26] +[titan] 2025-09-09 22:10:39,494 - root - INFO - step: 28755 loss: 2.6750 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.7475 global_avg_top_loss: 1.9275 +[titan] 2025-09-09 22:10:39,495 - root - INFO - lr: 5.3498e-06 gnorm: 0.37 [2 days, 4:35:12<20:33:52] +[titan] 2025-09-09 22:11:11,429 - root - INFO - step: 28760 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 22:11:11,429 - root - INFO - lr: 5.3471e-06 gnorm: 0.39 [2 days, 4:35:44<20:33:19] +[titan] 2025-09-09 22:11:43,494 - root - INFO - step: 28765 loss: 2.6364 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 22:11:43,494 - root - INFO - lr: 5.3443e-06 gnorm: 0.38 [2 days, 4:36:16<20:32:46] +[titan] 2025-09-09 22:12:15,583 - root - INFO - step: 28770 loss: 2.6527 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.7365 global_avg_top_loss: 1.9163 +[titan] 2025-09-09 22:12:15,584 - root - INFO - lr: 5.3415e-06 gnorm: 0.38 [2 days, 4:36:48<20:32:13] +[titan] 2025-09-09 22:12:47,719 - root - INFO - step: 28775 loss: 3.1524 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 1.0147 global_avg_top_loss: 2.1377 +[titan] 2025-09-09 22:12:47,720 - root - INFO - lr: 5.3387e-06 gnorm: 0.39 [2 days, 4:37:20<20:31:39] +[titan] 2025-09-09 22:13:19,821 - root - INFO - step: 28780 loss: 2.6481 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.49 mfu: 49.19% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9131 +[titan] 
2025-09-09 22:13:19,821 - root - INFO - lr: 5.3360e-06 gnorm: 0.39 [2 days, 4:37:52<20:31:06] +[titan] 2025-09-09 22:13:52,050 - root - INFO - step: 28785 loss: 2.6342 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.7244 global_avg_top_loss: 1.9098 +[titan] 2025-09-09 22:13:52,050 - root - INFO - lr: 5.3332e-06 gnorm: 0.40 [2 days, 4:38:24<20:30:33] +[titan] 2025-09-09 22:14:24,061 - root - INFO - step: 28790 loss: 2.6920 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9376 +[titan] 2025-09-09 22:14:24,061 - root - INFO - lr: 5.3304e-06 gnorm: 0.40 [2 days, 4:38:56<20:30:00] +[titan] 2025-09-09 22:14:56,308 - root - INFO - step: 28795 loss: 3.1088 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.30 mfu: 48.97% global_avg_ntp_loss: 0.9950 global_avg_top_loss: 2.1138 +[titan] 2025-09-09 22:14:56,308 - root - INFO - lr: 5.3276e-06 gnorm: 0.39 [2 days, 4:39:29<20:29:26] +[titan] 2025-09-09 22:15:21,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:15:28,411 - root - INFO - step: 28800 loss: 2.6261 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7230 global_avg_top_loss: 1.9030 +[titan] 2025-09-09 22:15:28,411 - root - INFO - lr: 5.3249e-06 gnorm: 0.38 [2 days, 4:40:01<20:28:53] +[titan] 2025-09-09 22:16:00,462 - root - INFO - step: 28805 loss: 2.5911 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.7104 global_avg_top_loss: 1.8807 +[titan] 2025-09-09 22:16:00,462 - root - INFO - lr: 5.3221e-06 gnorm: 0.37 [2 days, 4:40:33<20:28:20] +[titan] 2025-09-09 22:16:32,557 - root - INFO - step: 28810 loss: 3.1791 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 1.0293 global_avg_top_loss: 2.1497 +[titan] 2025-09-09 22:16:32,557 - root - INFO - lr: 5.3193e-06 gnorm: 0.39 [2 days, 4:41:05<20:27:47] +[titan] 2025-09-09 22:17:04,649 - root - INFO - step: 28815 loss: 2.6710 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.20% global_avg_ntp_loss: 0.7457 global_avg_top_loss: 1.9253 +[titan] 2025-09-09 22:17:04,649 - root - INFO - lr: 5.3166e-06 gnorm: 0.44 [2 days, 4:41:37<20:27:14] +[titan] 2025-09-09 22:17:36,857 - root - INFO - step: 28820 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.89 mfu: 49.03% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 22:17:36,857 - root - INFO - lr: 5.3138e-06 gnorm: 0.38 [2 days, 4:42:09<20:26:40] +[titan] 2025-09-09 22:18:08,732 - root - INFO - step: 28825 loss: 2.8850 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.8689 global_avg_top_loss: 2.0160 +[titan] 2025-09-09 22:18:08,733 - root - INFO - lr: 5.3110e-06 gnorm: 0.38 [2 days, 4:42:41<20:26:07] +[titan] 2025-09-09 22:18:40,756 - root - INFO - step: 28830 loss: 2.6594 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7398 global_avg_top_loss: 1.9196 +[titan] 
2025-09-09 22:18:40,756 - root - INFO - lr: 5.3083e-06 gnorm: 0.39 [2 days, 4:43:13<20:25:34] +[titan] 2025-09-09 22:19:12,753 - root - INFO - step: 28835 loss: 2.6925 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 22:19:12,753 - root - INFO - lr: 5.3055e-06 gnorm: 0.43 [2 days, 4:43:45<20:25:00] +[titan] 2025-09-09 22:19:44,720 - root - INFO - step: 28840 loss: 2.6651 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9235 +[titan] 2025-09-09 22:19:44,720 - root - INFO - lr: 5.3027e-06 gnorm: 0.39 [2 days, 4:44:17<20:24:27] +[titan] 2025-09-09 22:20:16,646 - root - INFO - step: 28845 loss: 2.7803 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9873 +[titan] 2025-09-09 22:20:16,646 - root - INFO - lr: 5.3000e-06 gnorm: 0.40 [2 days, 4:44:49<20:23:54] +[titan] 2025-09-09 22:20:42,121 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:20:48,597 - root - INFO - step: 28850 loss: 2.8029 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.8124 global_avg_top_loss: 1.9906 +[titan] 2025-09-09 22:20:48,597 - root - INFO - lr: 5.2972e-06 gnorm: 0.41 [2 days, 4:45:21<20:23:21] +[titan] 2025-09-09 22:21:20,776 - root - INFO - step: 28855 loss: 2.6903 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.32 mfu: 49.07% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9320 +[titan] 2025-09-09 22:21:20,777 - root - INFO - lr: 5.2944e-06 gnorm: 0.41 [2 days, 4:45:53<20:22:47] +[titan] 2025-09-09 22:21:52,887 - root - INFO - step: 28860 loss: 2.7436 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7766 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 22:21:52,887 - root - INFO - lr: 5.2917e-06 gnorm: 0.38 [2 days, 4:46:25<20:22:14] +[titan] 2025-09-09 22:22:24,857 - root - INFO - step: 28865 loss: 2.7135 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 22:22:24,858 - root - INFO - lr: 5.2889e-06 gnorm: 0.41 [2 days, 4:46:57<20:21:41] +[titan] 2025-09-09 22:22:56,994 - root - INFO - step: 28870 loss: 2.7356 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.7734 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 22:22:56,994 - root - INFO - lr: 5.2862e-06 gnorm: 0.38 [2 days, 4:47:29<20:21:08] +[titan] 2025-09-09 22:23:28,996 - root - INFO - step: 28875 loss: 3.6074 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 1.2744 global_avg_top_loss: 2.3330 +[titan] 2025-09-09 22:23:28,997 - root - INFO - lr: 5.2834e-06 gnorm: 0.39 [2 days, 4:48:01<20:20:34] +[titan] 2025-09-09 22:24:01,394 - root - INFO - step: 28880 loss: 2.6378 memory: 122.03GiB(87.57%) tps: 10,114 tflops: 482.04 mfu: 48.74% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9096 +[titan] 
2025-09-09 22:24:01,394 - root - INFO - lr: 5.2807e-06 gnorm: 0.38 [2 days, 4:48:34<20:20:01] +[titan] 2025-09-09 22:24:33,525 - root - INFO - step: 28885 loss: 2.6623 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.7431 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 22:24:33,525 - root - INFO - lr: 5.2779e-06 gnorm: 0.38 [2 days, 4:49:06<20:19:28] +[titan] 2025-09-09 22:25:05,462 - root - INFO - step: 28890 loss: 3.1742 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 1.0287 global_avg_top_loss: 2.1455 +[titan] 2025-09-09 22:25:05,463 - root - INFO - lr: 5.2751e-06 gnorm: 0.38 [2 days, 4:49:38<20:18:55] +[titan] 2025-09-09 22:25:37,496 - root - INFO - step: 28895 loss: 2.6747 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.53 mfu: 49.29% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 22:25:37,496 - root - INFO - lr: 5.2724e-06 gnorm: 0.40 [2 days, 4:50:10<20:18:22] +[titan] 2025-09-09 22:26:02,908 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:26:09,315 - root - INFO - step: 28900 loss: 2.6546 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.7368 global_avg_top_loss: 1.9178 +[titan] 2025-09-09 22:26:09,316 - root - INFO - lr: 5.2696e-06 gnorm: 0.42 [2 days, 4:50:41<20:17:48] +[titan] 2025-09-09 22:26:41,177 - root - INFO - step: 28905 loss: 2.6410 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.7279 global_avg_top_loss: 1.9131 +[titan] 2025-09-09 22:26:41,177 - root - INFO - lr: 5.2669e-06 gnorm: 0.38 [2 days, 4:51:13<20:17:15] +[titan] 2025-09-09 22:27:13,289 - root - INFO - step: 28910 loss: 2.6782 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9265 +[titan] 2025-09-09 22:27:13,290 - root - INFO - lr: 5.2641e-06 gnorm: 0.38 [2 days, 4:51:45<20:16:42] +[titan] 2025-09-09 22:27:45,302 - root - INFO - step: 28915 loss: 2.6866 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7632 global_avg_top_loss: 1.9234 +[titan] 2025-09-09 22:27:45,302 - root - INFO - lr: 5.2614e-06 gnorm: 0.37 [2 days, 4:52:17<20:16:08] +[titan] 2025-09-09 22:28:17,218 - root - INFO - step: 28920 loss: 2.7901 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.32 mfu: 49.48% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 22:28:17,219 - root - INFO - lr: 5.2586e-06 gnorm: 0.39 [2 days, 4:52:49<20:15:35] +[titan] 2025-09-09 22:28:49,228 - root - INFO - step: 28925 loss: 2.6673 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7464 global_avg_top_loss: 1.9209 +[titan] 2025-09-09 22:28:49,229 - root - INFO - lr: 5.2559e-06 gnorm: 0.38 [2 days, 4:53:21<20:15:02] +[titan] 2025-09-09 22:29:21,338 - root - INFO - step: 28930 loss: 2.7162 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 0.7631 global_avg_top_loss: 1.9531 +[titan] 
2025-09-09 22:29:21,338 - root - INFO - lr: 5.2531e-06 gnorm: 0.41 [2 days, 4:53:54<20:14:29] +[titan] 2025-09-09 22:29:53,380 - root - INFO - step: 28935 loss: 2.6847 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 22:29:53,381 - root - INFO - lr: 5.2504e-06 gnorm: 0.40 [2 days, 4:54:26<20:13:55] +[titan] 2025-09-09 22:30:25,603 - root - INFO - step: 28940 loss: 2.7387 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.67 mfu: 49.01% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9647 +[titan] 2025-09-09 22:30:25,603 - root - INFO - lr: 5.2476e-06 gnorm: 0.39 [2 days, 4:54:58<20:13:22] +[titan] 2025-09-09 22:30:57,518 - root - INFO - step: 28945 loss: 2.7068 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9480 +[titan] 2025-09-09 22:30:57,518 - root - INFO - lr: 5.2449e-06 gnorm: 0.41 [2 days, 4:55:30<20:12:49] +[titan] 2025-09-09 22:31:23,141 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:31:29,555 - root - INFO - step: 28950 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.7847 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 22:31:29,555 - root - INFO - lr: 5.2422e-06 gnorm: 0.39 [2 days, 4:56:02<20:12:16] +[titan] 2025-09-09 22:32:01,535 - root - INFO - step: 28955 loss: 3.6030 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 1.2743 global_avg_top_loss: 2.3287 +[titan] 2025-09-09 22:32:01,535 - root - INFO - lr: 5.2394e-06 gnorm: 0.42 [2 days, 4:56:34<20:11:42] +[titan] 2025-09-09 22:32:33,518 - root - INFO - step: 28960 loss: 2.6624 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 22:32:33,518 - root - INFO - lr: 5.2367e-06 gnorm: 0.38 [2 days, 4:57:06<20:11:09] +[titan] 2025-09-09 22:33:05,435 - root - INFO - step: 28965 loss: 2.6714 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7453 global_avg_top_loss: 1.9261 +[titan] 2025-09-09 22:33:05,436 - root - INFO - lr: 5.2339e-06 gnorm: 0.40 [2 days, 4:57:38<20:10:36] +[titan] 2025-09-09 22:33:37,573 - root - INFO - step: 28970 loss: 2.6994 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.95 mfu: 49.14% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 22:33:37,573 - root - INFO - lr: 5.2312e-06 gnorm: 0.39 [2 days, 4:58:10<20:10:03] +[titan] 2025-09-09 22:34:09,684 - root - INFO - step: 28975 loss: 2.5462 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.6863 global_avg_top_loss: 1.8599 +[titan] 2025-09-09 22:34:09,684 - root - INFO - lr: 5.2284e-06 gnorm: 0.39 [2 days, 4:58:42<20:09:29] +[titan] 2025-09-09 22:34:41,909 - root - INFO - step: 28980 loss: 2.6703 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.62 mfu: 49.00% global_avg_ntp_loss: 0.7429 global_avg_top_loss: 1.9274 +[titan] 
2025-09-09 22:34:41,910 - root - INFO - lr: 5.2257e-06 gnorm: 0.39 [2 days, 4:59:14<20:08:56] +[titan] 2025-09-09 22:35:13,710 - root - INFO - step: 28985 loss: 2.6346 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9052 +[titan] 2025-09-09 22:35:13,710 - root - INFO - lr: 5.2230e-06 gnorm: 0.37 [2 days, 4:59:46<20:08:23] +[titan] 2025-09-09 22:35:45,684 - root - INFO - step: 28990 loss: 2.7377 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 22:35:45,684 - root - INFO - lr: 5.2202e-06 gnorm: 0.38 [2 days, 5:00:18<20:07:50] +[titan] 2025-09-09 22:36:17,703 - root - INFO - step: 28995 loss: 2.6655 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7393 global_avg_top_loss: 1.9262 +[titan] 2025-09-09 22:36:17,704 - root - INFO - lr: 5.2175e-06 gnorm: 0.39 [2 days, 5:00:50<20:07:16] +[titan] 2025-09-09 22:36:43,428 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:36:49,789 - root - INFO - step: 29000 loss: 2.6706 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.7425 global_avg_top_loss: 1.9281 +[titan] 2025-09-09 22:36:49,790 - root - INFO - lr: 5.2148e-06 gnorm: 0.38 [2 days, 5:01:22<20:06:43] +[titan] 2025-09-09 22:37:21,733 - root - INFO - step: 29005 loss: 2.6569 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.7413 global_avg_top_loss: 1.9156 +[titan] 2025-09-09 22:37:21,733 - root - INFO - lr: 5.2120e-06 gnorm: 0.38 [2 days, 5:01:54<20:06:10] +[titan] 2025-09-09 22:37:53,908 - root - INFO - step: 29010 loss: 2.7091 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.38 mfu: 49.08% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9493 +[titan] 2025-09-09 22:37:53,908 - root - INFO - lr: 5.2093e-06 gnorm: 0.39 [2 days, 5:02:26<20:05:37] +[titan] 2025-09-09 22:38:25,933 - root - INFO - step: 29015 loss: 2.6873 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7524 global_avg_top_loss: 1.9349 +[titan] 2025-09-09 22:38:25,933 - root - INFO - lr: 5.2066e-06 gnorm: 0.39 [2 days, 5:02:58<20:05:03] +[titan] 2025-09-09 22:38:58,108 - root - INFO - step: 29020 loss: 2.6560 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.38 mfu: 49.08% global_avg_ntp_loss: 0.7429 global_avg_top_loss: 1.9131 +[titan] 2025-09-09 22:38:58,108 - root - INFO - lr: 5.2038e-06 gnorm: 0.40 [2 days, 5:03:30<20:04:30] +[titan] 2025-09-09 22:39:30,482 - root - INFO - step: 29025 loss: 2.6399 memory: 122.03GiB(87.57%) tps: 10,122 tflops: 482.40 mfu: 48.78% global_avg_ntp_loss: 0.7348 global_avg_top_loss: 1.9052 +[titan] 2025-09-09 22:39:30,482 - root - INFO - lr: 5.2011e-06 gnorm: 0.47 [2 days, 5:04:03<20:03:57] +[titan] 2025-09-09 22:40:02,311 - root - INFO - step: 29030 loss: 2.7287 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9578 +[titan] 
2025-09-09 22:40:02,311 - root - INFO - lr: 5.1984e-06 gnorm: 0.39 [2 days, 5:04:34<20:03:24] +[titan] 2025-09-09 22:40:34,212 - root - INFO - step: 29035 loss: 3.6279 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 1.2845 global_avg_top_loss: 2.3435 +[titan] 2025-09-09 22:40:34,212 - root - INFO - lr: 5.1956e-06 gnorm: 0.38 [2 days, 5:05:06<20:02:51] +[titan] 2025-09-09 22:41:06,176 - root - INFO - step: 29040 loss: 2.6558 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7384 global_avg_top_loss: 1.9174 +[titan] 2025-09-09 22:41:06,176 - root - INFO - lr: 5.1929e-06 gnorm: 0.40 [2 days, 5:05:38<20:02:17] +[titan] 2025-09-09 22:41:38,477 - root - INFO - step: 29045 loss: 2.6775 memory: 122.03GiB(87.57%) tps: 10,145 tflops: 483.48 mfu: 48.89% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 22:41:38,477 - root - INFO - lr: 5.1902e-06 gnorm: 0.38 [2 days, 5:06:11<20:01:44] +[titan] 2025-09-09 22:42:03,930 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:42:10,293 - root - INFO - step: 29050 loss: 2.6216 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.87 mfu: 49.63% global_avg_ntp_loss: 0.7192 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 22:42:10,293 - root - INFO - lr: 5.1875e-06 gnorm: 0.39 [2 days, 5:06:42<20:01:11] +[titan] 2025-09-09 22:42:42,442 - root - INFO - step: 29055 loss: 2.6030 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.77 mfu: 49.12% global_avg_ntp_loss: 0.7124 global_avg_top_loss: 1.8906 +[titan] 2025-09-09 22:42:42,442 - root - INFO - lr: 5.1847e-06 gnorm: 0.40 [2 days, 5:07:15<20:00:38] +[titan] 2025-09-09 22:43:14,432 - root - INFO - step: 29060 loss: 2.6507 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9152 +[titan] 2025-09-09 22:43:14,432 - root - INFO - lr: 5.1820e-06 gnorm: 0.39 [2 days, 5:07:47<20:00:04] +[titan] 2025-09-09 22:43:46,261 - root - INFO - step: 29065 loss: 2.6855 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7500 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 22:43:46,262 - root - INFO - lr: 5.1793e-06 gnorm: 0.39 [2 days, 5:08:18<19:59:31] +[titan] 2025-09-09 22:44:18,373 - root - INFO - step: 29070 loss: 2.9020 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.8524 global_avg_top_loss: 2.0496 +[titan] 2025-09-09 22:44:18,373 - root - INFO - lr: 5.1766e-06 gnorm: 0.42 [2 days, 5:08:51<19:58:58] +[titan] 2025-09-09 22:44:50,385 - root - INFO - step: 29075 loss: 2.7733 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9827 +[titan] 2025-09-09 22:44:50,385 - root - INFO - lr: 5.1738e-06 gnorm: 0.40 [2 days, 5:09:23<19:58:25] +[titan] 2025-09-09 22:45:22,383 - root - INFO - step: 29080 loss: 2.7112 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7622 global_avg_top_loss: 1.9490 +[titan] 
2025-09-09 22:45:22,383 - root - INFO - lr: 5.1711e-06 gnorm: 0.40 [2 days, 5:09:55<19:57:51] +[titan] 2025-09-09 22:45:54,391 - root - INFO - step: 29085 loss: 2.6443 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.7329 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 22:45:54,391 - root - INFO - lr: 5.1684e-06 gnorm: 0.37 [2 days, 5:10:27<19:57:18] +[titan] 2025-09-09 22:46:26,546 - root - INFO - step: 29090 loss: 2.6211 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.68 mfu: 49.11% global_avg_ntp_loss: 0.7190 global_avg_top_loss: 1.9021 +[titan] 2025-09-09 22:46:26,547 - root - INFO - lr: 5.1657e-06 gnorm: 0.41 [2 days, 5:10:59<19:56:45] +[titan] 2025-09-09 22:46:58,589 - root - INFO - step: 29095 loss: 3.2135 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 1.0503 global_avg_top_loss: 2.1632 +[titan] 2025-09-09 22:46:58,590 - root - INFO - lr: 5.1630e-06 gnorm: 0.38 [2 days, 5:11:31<19:56:12] +[titan] 2025-09-09 22:47:24,378 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:47:30,777 - root - INFO - step: 29100 loss: 2.5813 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.19 mfu: 49.06% global_avg_ntp_loss: 0.7079 global_avg_top_loss: 1.8733 +[titan] 2025-09-09 22:47:30,777 - root - INFO - lr: 5.1602e-06 gnorm: 0.36 [2 days, 5:12:03<19:55:39] +[titan] 2025-09-09 22:48:02,654 - root - INFO - step: 29105 loss: 2.6240 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.7261 global_avg_top_loss: 1.8979 +[titan] 2025-09-09 22:48:02,654 - root - INFO - lr: 5.1575e-06 gnorm: 0.43 [2 days, 5:12:35<19:55:05] +[titan] 2025-09-09 22:48:34,748 - root - INFO - step: 29110 loss: 2.6238 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.61 mfu: 49.20% global_avg_ntp_loss: 0.7265 global_avg_top_loss: 1.8973 +[titan] 2025-09-09 22:48:34,748 - root - INFO - lr: 5.1548e-06 gnorm: 0.39 [2 days, 5:13:07<19:54:32] +[titan] 2025-09-09 22:49:06,774 - root - INFO - step: 29115 loss: 3.0516 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.9731 global_avg_top_loss: 2.0786 +[titan] 2025-09-09 22:49:06,774 - root - INFO - lr: 5.1521e-06 gnorm: 0.37 [2 days, 5:13:39<19:53:59] +[titan] 2025-09-09 22:49:38,840 - root - INFO - step: 29120 loss: 2.7758 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 22:49:38,841 - root - INFO - lr: 5.1494e-06 gnorm: 0.41 [2 days, 5:14:11<19:53:26] +[titan] 2025-09-09 22:50:10,762 - root - INFO - step: 29125 loss: 2.6811 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9276 +[titan] 2025-09-09 22:50:10,762 - root - INFO - lr: 5.1467e-06 gnorm: 0.41 [2 days, 5:14:43<19:52:52] +[titan] 2025-09-09 22:50:42,971 - root - INFO - step: 29130 loss: 2.6952 memory: 122.03GiB(87.57%) tps: 10,174 tflops: 484.87 mfu: 49.03% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9360 +[titan] 
2025-09-09 22:50:42,971 - root - INFO - lr: 5.1440e-06 gnorm: 0.38 [2 days, 5:15:15<19:52:19] +[titan] 2025-09-09 22:51:15,062 - root - INFO - step: 29135 loss: 2.7569 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9688 +[titan] 2025-09-09 22:51:15,063 - root - INFO - lr: 5.1413e-06 gnorm: 0.39 [2 days, 5:15:47<19:51:46] +[titan] 2025-09-09 22:51:47,021 - root - INFO - step: 29140 loss: 2.7331 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7839 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 22:51:47,021 - root - INFO - lr: 5.1385e-06 gnorm: 0.39 [2 days, 5:16:19<19:51:13] +[titan] 2025-09-09 22:52:19,111 - root - INFO - step: 29145 loss: 2.7940 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9861 +[titan] 2025-09-09 22:52:19,111 - root - INFO - lr: 5.1358e-06 gnorm: 0.41 [2 days, 5:16:51<19:50:39] +[titan] 2025-09-09 22:52:44,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:52:51,044 - root - INFO - step: 29150 loss: 2.9851 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.8898 global_avg_top_loss: 2.0953 +[titan] 2025-09-09 22:52:51,045 - root - INFO - lr: 5.1331e-06 gnorm: 0.40 [2 days, 5:17:23<19:50:06] +[titan] 2025-09-09 22:53:22,988 - root - INFO - step: 29155 loss: 2.6518 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.7328 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 22:53:22,988 - root - INFO - lr: 5.1304e-06 gnorm: 0.40 [2 days, 5:17:55<19:49:33] +[titan] 2025-09-09 22:53:54,930 - root - INFO - step: 29160 loss: 2.8450 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.8424 global_avg_top_loss: 2.0026 +[titan] 2025-09-09 22:53:54,930 - root - INFO - lr: 5.1277e-06 gnorm: 0.40 [2 days, 5:18:27<19:49:00] +[titan] 2025-09-09 22:54:26,964 - root - INFO - step: 29165 loss: 2.7427 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9655 +[titan] 2025-09-09 22:54:26,964 - root - INFO - lr: 5.1250e-06 gnorm: 0.42 [2 days, 5:18:59<19:48:26] +[titan] 2025-09-09 22:54:59,095 - root - INFO - step: 29170 loss: 2.7462 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.7783 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 22:54:59,095 - root - INFO - lr: 5.1223e-06 gnorm: 0.38 [2 days, 5:19:31<19:47:53] +[titan] 2025-09-09 22:55:30,852 - root - INFO - step: 29175 loss: 2.7078 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.77 mfu: 49.72% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9478 +[titan] 2025-09-09 22:55:30,853 - root - INFO - lr: 5.1196e-06 gnorm: 0.40 [2 days, 5:20:03<19:47:20] +[titan] 2025-09-09 22:56:02,955 - root - INFO - step: 29180 loss: 2.7188 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9529 +[titan] 
2025-09-09 22:56:02,956 - root - INFO - lr: 5.1169e-06 gnorm: 0.38 [2 days, 5:20:35<19:46:47] +[titan] 2025-09-09 22:56:28,867 - root - INFO - Dumping profiler traces at step 29184 +[titan] 2025-09-09 22:56:28,925 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 22:56:35,311 - root - INFO - step: 29185 loss: 2.6094 memory: 122.03GiB(87.57%) tps: 10,128 tflops: 482.68 mfu: 48.80% global_avg_ntp_loss: 0.7306 global_avg_top_loss: 1.8788 +[titan] 2025-09-09 22:56:35,311 - root - INFO - lr: 5.1142e-06 gnorm: 0.48 [2 days, 5:21:07<19:46:14] +[titan] 2025-09-09 22:57:07,090 - root - INFO - step: 29190 loss: 2.6938 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7530 global_avg_top_loss: 1.9408 +[titan] 2025-09-09 22:57:07,091 - root - INFO - lr: 5.1115e-06 gnorm: 0.38 [2 days, 5:21:39<19:45:40] +[titan] 2025-09-09 22:57:38,985 - root - INFO - step: 29195 loss: 3.1577 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 1.0210 global_avg_top_loss: 2.1367 +[titan] 2025-09-09 22:57:38,985 - root - INFO - lr: 5.1088e-06 gnorm: 0.38 [2 days, 5:22:11<19:45:07] +[titan] 2025-09-09 22:58:04,661 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:58:11,054 - root - INFO - step: 29200 loss: 2.6686 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9263 +[titan] 2025-09-09 22:58:11,054 - root - INFO - lr: 5.1061e-06 gnorm: 0.39 [2 days, 5:22:43<19:44:34] +[titan] 2025-09-09 22:58:42,842 - root - INFO - step: 29205 loss: 2.6944 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.30 mfu: 49.68% global_avg_ntp_loss: 0.7533 global_avg_top_loss: 1.9411 +[titan] 2025-09-09 22:58:42,842 - root - INFO - lr: 5.1034e-06 gnorm: 0.39 [2 days, 5:23:15<19:44:00] +[titan] 2025-09-09 22:59:14,821 - root - INFO - step: 29210 loss: 2.6511 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7349 global_avg_top_loss: 1.9162 +[titan] 2025-09-09 22:59:14,821 - root - INFO - lr: 5.1007e-06 gnorm: 0.39 [2 days, 5:23:47<19:43:27] +[titan] 2025-09-09 22:59:46,629 - root - INFO - step: 29215 loss: 2.7013 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 0.7585 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 22:59:46,629 - root - INFO - lr: 5.0980e-06 gnorm: 0.39 [2 days, 5:24:19<19:42:54] +[titan] 2025-09-09 23:00:18,596 - root - INFO - step: 29220 loss: 2.6984 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7608 global_avg_top_loss: 1.9376 +[titan] 2025-09-09 23:00:18,596 - root - INFO - lr: 5.0953e-06 gnorm: 0.39 [2 days, 5:24:51<19:42:21] +[titan] 2025-09-09 23:00:50,663 - root - INFO - step: 29225 loss: 2.6469 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9134 +[titan] 2025-09-09 23:00:50,663 - root - INFO - lr: 5.0926e-06 gnorm: 0.38 [2 days, 5:25:23<19:41:47] +[titan] 2025-09-09 23:01:22,702 - root - INFO - step: 29230 loss: 2.6773 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7482 global_avg_top_loss: 1.9290 +[titan] 
2025-09-09 23:01:22,703 - root - INFO - lr: 5.0899e-06 gnorm: 0.40 [2 days, 5:25:55<19:41:14] +[titan] 2025-09-09 23:01:54,652 - root - INFO - step: 29235 loss: 2.7078 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7596 global_avg_top_loss: 1.9482 +[titan] 2025-09-09 23:01:54,652 - root - INFO - lr: 5.0872e-06 gnorm: 0.41 [2 days, 5:26:27<19:40:41] +[titan] 2025-09-09 23:02:26,593 - root - INFO - step: 29240 loss: 2.6131 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.8954 +[titan] 2025-09-09 23:02:26,593 - root - INFO - lr: 5.0846e-06 gnorm: 0.39 [2 days, 5:26:59<19:40:08] +[titan] 2025-09-09 23:02:58,597 - root - INFO - step: 29245 loss: 2.7600 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7883 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 23:02:58,597 - root - INFO - lr: 5.0819e-06 gnorm: 0.38 [2 days, 5:27:31<19:39:34] +[titan] 2025-09-09 23:03:24,209 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:03:30,592 - root - INFO - step: 29250 loss: 2.6628 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.7411 global_avg_top_loss: 1.9217 +[titan] 2025-09-09 23:03:30,592 - root - INFO - lr: 5.0792e-06 gnorm: 0.38 [2 days, 5:28:03<19:39:01] +[titan] 2025-09-09 23:04:02,495 - root - INFO - step: 29255 loss: 2.7456 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 23:04:02,496 - root - INFO - lr: 5.0765e-06 gnorm: 0.39 [2 days, 5:28:35<19:38:28] +[titan] 2025-09-09 23:04:34,402 - root - INFO - step: 29260 loss: 2.7025 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 23:04:34,402 - root - INFO - lr: 5.0738e-06 gnorm: 0.39 [2 days, 5:29:07<19:37:55] +[titan] 2025-09-09 23:05:06,378 - root - INFO - step: 29265 loss: 2.6506 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7370 global_avg_top_loss: 1.9136 +[titan] 2025-09-09 23:05:06,378 - root - INFO - lr: 5.0711e-06 gnorm: 0.38 [2 days, 5:29:39<19:37:21] +[titan] 2025-09-09 23:05:38,292 - root - INFO - step: 29270 loss: 2.7339 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.36 mfu: 49.48% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9604 +[titan] 2025-09-09 23:05:38,292 - root - INFO - lr: 5.0684e-06 gnorm: 0.39 [2 days, 5:30:10<19:36:48] +[titan] 2025-09-09 23:06:10,247 - root - INFO - step: 29275 loss: 2.6792 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9324 +[titan] 2025-09-09 23:06:10,247 - root - INFO - lr: 5.0657e-06 gnorm: 0.41 [2 days, 5:30:42<19:36:15] +[titan] 2025-09-09 23:06:42,443 - root - INFO - step: 29280 loss: 2.7697 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.06 mfu: 49.05% global_avg_ntp_loss: 0.7903 global_avg_top_loss: 1.9795 +[titan] 
2025-09-09 23:06:42,444 - root - INFO - lr: 5.0631e-06 gnorm: 0.38 [2 days, 5:31:15<19:35:42] +[titan] 2025-09-09 23:07:14,578 - root - INFO - step: 29285 loss: 2.7344 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.99 mfu: 49.14% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 23:07:14,578 - root - INFO - lr: 5.0604e-06 gnorm: 0.41 [2 days, 5:31:47<19:35:09] +[titan] 2025-09-09 23:07:46,617 - root - INFO - step: 29290 loss: 2.6415 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7295 global_avg_top_loss: 1.9121 +[titan] 2025-09-09 23:07:46,617 - root - INFO - lr: 5.0577e-06 gnorm: 0.38 [2 days, 5:32:19<19:34:35] +[titan] 2025-09-09 23:08:18,620 - root - INFO - step: 29295 loss: 2.6646 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9241 +[titan] 2025-09-09 23:08:18,620 - root - INFO - lr: 5.0550e-06 gnorm: 0.46 [2 days, 5:32:51<19:34:02] +[titan] 2025-09-09 23:08:44,163 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:08:50,538 - root - INFO - step: 29300 loss: 2.7990 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7976 global_avg_top_loss: 2.0014 +[titan] 2025-09-09 23:08:50,538 - root - INFO - lr: 5.0523e-06 gnorm: 0.39 [2 days, 5:33:23<19:33:29] +[titan] 2025-09-09 23:09:22,322 - root - INFO - step: 29305 loss: 2.6830 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.34 mfu: 49.68% global_avg_ntp_loss: 0.7503 global_avg_top_loss: 1.9327 +[titan] 2025-09-09 23:09:22,323 - root - INFO - lr: 5.0497e-06 gnorm: 0.39 [2 days, 5:33:54<19:32:56] +[titan] 2025-09-09 23:09:54,600 - root - INFO - step: 29310 loss: 2.7757 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.84 mfu: 48.92% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9802 +[titan] 2025-09-09 23:09:54,600 - root - INFO - lr: 5.0470e-06 gnorm: 0.40 [2 days, 5:34:27<19:32:22] +[titan] 2025-09-09 23:10:26,550 - root - INFO - step: 29315 loss: 2.7023 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 23:10:26,550 - root - INFO - lr: 5.0443e-06 gnorm: 0.38 [2 days, 5:34:59<19:31:49] +[titan] 2025-09-09 23:10:58,509 - root - INFO - step: 29320 loss: 2.5561 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.6911 global_avg_top_loss: 1.8650 +[titan] 2025-09-09 23:10:58,510 - root - INFO - lr: 5.0416e-06 gnorm: 0.38 [2 days, 5:35:31<19:31:16] +[titan] 2025-09-09 23:11:30,390 - root - INFO - step: 29325 loss: 2.6740 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.86 mfu: 49.53% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9267 +[titan] 2025-09-09 23:11:30,391 - root - INFO - lr: 5.0389e-06 gnorm: 0.39 [2 days, 5:36:03<19:30:43] +[titan] 2025-09-09 23:12:02,294 - root - INFO - step: 29330 loss: 2.7124 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7615 global_avg_top_loss: 1.9509 +[titan] 
2025-09-09 23:12:02,294 - root - INFO - lr: 5.0363e-06 gnorm: 0.38 [2 days, 5:36:34<19:30:09] +[titan] 2025-09-09 23:12:34,251 - root - INFO - step: 29335 loss: 2.6808 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7482 global_avg_top_loss: 1.9326 +[titan] 2025-09-09 23:12:34,251 - root - INFO - lr: 5.0336e-06 gnorm: 0.40 [2 days, 5:37:06<19:29:36] +[titan] 2025-09-09 23:13:06,285 - root - INFO - step: 29340 loss: 2.6380 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7297 global_avg_top_loss: 1.9083 +[titan] 2025-09-09 23:13:06,286 - root - INFO - lr: 5.0309e-06 gnorm: 0.38 [2 days, 5:37:38<19:29:03] +[titan] 2025-09-09 23:13:38,327 - root - INFO - step: 29345 loss: 2.7049 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7606 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 23:13:38,327 - root - INFO - lr: 5.0283e-06 gnorm: 0.38 [2 days, 5:38:10<19:28:30] +[titan] 2025-09-09 23:14:03,863 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:14:10,283 - root - INFO - step: 29350 loss: 2.6685 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7434 global_avg_top_loss: 1.9251 +[titan] 2025-09-09 23:14:10,283 - root - INFO - lr: 5.0256e-06 gnorm: 0.39 [2 days, 5:38:42<19:27:56] +[titan] 2025-09-09 23:14:42,166 - root - INFO - step: 29355 loss: 2.7210 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9505 +[titan] 2025-09-09 23:14:42,167 - root - INFO - lr: 5.0229e-06 gnorm: 0.39 [2 days, 5:39:14<19:27:23] +[titan] 2025-09-09 23:15:14,042 - root - INFO - step: 29360 loss: 2.6904 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.7531 global_avg_top_loss: 1.9373 +[titan] 2025-09-09 23:15:14,043 - root - INFO - lr: 5.0203e-06 gnorm: 0.38 [2 days, 5:39:46<19:26:50] +[titan] 2025-09-09 23:15:46,334 - root - INFO - step: 29365 loss: 2.6537 memory: 122.03GiB(87.57%) tps: 10,148 tflops: 483.64 mfu: 48.90% global_avg_ntp_loss: 0.7375 global_avg_top_loss: 1.9161 +[titan] 2025-09-09 23:15:46,334 - root - INFO - lr: 5.0176e-06 gnorm: 0.41 [2 days, 5:40:18<19:26:17] +[titan] 2025-09-09 23:16:18,019 - root - INFO - step: 29370 loss: 2.7244 memory: 122.03GiB(87.57%) tps: 10,342 tflops: 492.88 mfu: 49.84% global_avg_ntp_loss: 0.7680 global_avg_top_loss: 1.9564 +[titan] 2025-09-09 23:16:18,020 - root - INFO - lr: 5.0149e-06 gnorm: 0.40 [2 days, 5:40:50<19:25:43] +[titan] 2025-09-09 23:16:50,166 - root - INFO - step: 29375 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.81 mfu: 49.12% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 23:16:50,166 - root - INFO - lr: 5.0123e-06 gnorm: 0.38 [2 days, 5:41:22<19:25:10] +[titan] 2025-09-09 23:17:22,258 - root - INFO - step: 29380 loss: 2.5785 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7027 global_avg_top_loss: 1.8758 +[titan] 
2025-09-09 23:17:22,258 - root - INFO - lr: 5.0096e-06 gnorm: 0.39 [2 days, 5:41:54<19:24:37] +[titan] 2025-09-09 23:17:54,068 - root - INFO - step: 29385 loss: 2.7280 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.7727 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 23:17:54,068 - root - INFO - lr: 5.0069e-06 gnorm: 0.39 [2 days, 5:42:26<19:24:04] +[titan] 2025-09-09 23:18:25,942 - root - INFO - step: 29390 loss: 2.6867 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.7522 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 23:18:25,942 - root - INFO - lr: 5.0043e-06 gnorm: 0.39 [2 days, 5:42:58<19:23:31] +[titan] 2025-09-09 23:18:58,086 - root - INFO - step: 29395 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.85 mfu: 49.12% global_avg_ntp_loss: 0.7117 global_avg_top_loss: 1.8787 +[titan] 2025-09-09 23:18:58,086 - root - INFO - lr: 5.0016e-06 gnorm: 0.38 [2 days, 5:43:30<19:22:57] +[titan] 2025-09-09 23:19:23,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:19:30,066 - root - INFO - step: 29400 loss: 2.6546 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9192 +[titan] 2025-09-09 23:19:30,066 - root - INFO - lr: 4.9989e-06 gnorm: 0.39 [2 days, 5:44:02<19:22:24] +[titan] 2025-09-09 23:20:02,217 - root - INFO - step: 29405 loss: 2.7080 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9458 +[titan] 2025-09-09 23:20:02,218 - root - INFO - lr: 4.9963e-06 gnorm: 0.41 [2 days, 5:44:34<19:21:51] +[titan] 2025-09-09 23:20:34,210 - root - INFO - step: 29410 loss: 2.7444 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7791 global_avg_top_loss: 1.9653 +[titan] 2025-09-09 23:20:34,210 - root - INFO - lr: 4.9936e-06 gnorm: 0.41 [2 days, 5:45:06<19:21:18] +[titan] 2025-09-09 23:21:06,546 - root - INFO - step: 29415 loss: 2.7118 memory: 122.03GiB(87.57%) tps: 10,133 tflops: 482.95 mfu: 48.83% global_avg_ntp_loss: 0.7672 global_avg_top_loss: 1.9445 +[titan] 2025-09-09 23:21:06,547 - root - INFO - lr: 4.9910e-06 gnorm: 0.40 [2 days, 5:45:39<19:20:45] +[titan] 2025-09-09 23:21:38,635 - root - INFO - step: 29420 loss: 2.7307 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.7686 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 23:21:38,636 - root - INFO - lr: 4.9883e-06 gnorm: 0.40 [2 days, 5:46:11<19:20:11] +[titan] 2025-09-09 23:22:10,683 - root - INFO - step: 29425 loss: 2.7485 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.7819 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 23:22:10,683 - root - INFO - lr: 4.9856e-06 gnorm: 0.39 [2 days, 5:46:43<19:19:38] +[titan] 2025-09-09 23:22:42,613 - root - INFO - step: 29430 loss: 2.7111 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 0.7616 global_avg_top_loss: 1.9495 +[titan] 
2025-09-09 23:22:42,613 - root - INFO - lr: 4.9830e-06 gnorm: 0.39 [2 days, 5:47:15<19:19:05] +[titan] 2025-09-09 23:23:14,555 - root - INFO - step: 29435 loss: 2.7984 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 23:23:14,555 - root - INFO - lr: 4.9803e-06 gnorm: 0.44 [2 days, 5:47:47<19:18:32] +[titan] 2025-09-09 23:23:46,631 - root - INFO - step: 29440 loss: 2.7336 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.7707 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 23:23:46,631 - root - INFO - lr: 4.9777e-06 gnorm: 0.39 [2 days, 5:48:19<19:17:59] +[titan] 2025-09-09 23:24:18,630 - root - INFO - step: 29445 loss: 2.6786 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 23:24:18,630 - root - INFO - lr: 4.9750e-06 gnorm: 0.39 [2 days, 5:48:51<19:17:25] +[titan] 2025-09-09 23:24:44,350 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:24:50,723 - root - INFO - step: 29450 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:24:50,723 - root - INFO - lr: 4.9724e-06 gnorm: 0.53 [2 days, 5:49:23<19:16:52] +[titan] 2025-09-09 23:25:22,863 - root - INFO - step: 29455 loss: 2.6324 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9030 +[titan] 2025-09-09 23:25:22,863 - root - INFO - lr: 4.9697e-06 gnorm: 0.38 [2 days, 5:49:55<19:16:19] +[titan] 2025-09-09 23:25:54,872 - root - INFO - step: 29460 loss: 2.6107 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7171 global_avg_top_loss: 1.8937 +[titan] 2025-09-09 23:25:54,872 - root - INFO - lr: 4.9671e-06 gnorm: 0.50 [2 days, 5:50:27<19:15:46] +[titan] 2025-09-09 23:26:26,821 - root - INFO - step: 29465 loss: 2.7100 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 23:26:26,821 - root - INFO - lr: 4.9644e-06 gnorm: 0.39 [2 days, 5:50:59<19:15:13] +[titan] 2025-09-09 23:26:59,115 - root - INFO - step: 29470 loss: 2.8733 memory: 122.03GiB(87.57%) tps: 10,147 tflops: 483.60 mfu: 48.90% global_avg_ntp_loss: 0.8526 global_avg_top_loss: 2.0207 +[titan] 2025-09-09 23:26:59,115 - root - INFO - lr: 4.9618e-06 gnorm: 0.40 [2 days, 5:51:31<19:14:39] +[titan] 2025-09-09 23:27:31,112 - root - INFO - step: 29475 loss: 2.6146 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7173 global_avg_top_loss: 1.8973 +[titan] 2025-09-09 23:27:31,112 - root - INFO - lr: 4.9591e-06 gnorm: 0.39 [2 days, 5:52:03<19:14:06] +[titan] 2025-09-09 23:28:03,054 - root - INFO - step: 29480 loss: 2.7077 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9475 +[titan] 
2025-09-09 23:28:03,055 - root - INFO - lr: 4.9565e-06 gnorm: 0.41 [2 days, 5:52:35<19:13:33] +[titan] 2025-09-09 23:28:34,949 - root - INFO - step: 29485 loss: 2.6600 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:28:34,949 - root - INFO - lr: 4.9538e-06 gnorm: 0.39 [2 days, 5:53:07<19:13:00] +[titan] 2025-09-09 23:29:06,989 - root - INFO - step: 29490 loss: 2.7052 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9461 +[titan] 2025-09-09 23:29:06,989 - root - INFO - lr: 4.9512e-06 gnorm: 0.43 [2 days, 5:53:39<19:12:27] +[titan] 2025-09-09 23:29:39,008 - root - INFO - step: 29495 loss: 2.6494 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7352 global_avg_top_loss: 1.9142 +[titan] 2025-09-09 23:29:39,008 - root - INFO - lr: 4.9486e-06 gnorm: 0.38 [2 days, 5:54:11<19:11:53] +[titan] 2025-09-09 23:30:04,501 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:30:10,900 - root - INFO - step: 29500 loss: 2.6307 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9014 +[titan] 2025-09-09 23:30:10,900 - root - INFO - lr: 4.9459e-06 gnorm: 0.37 [2 days, 5:54:43<19:11:20] +[titan] 2025-09-09 23:30:42,669 - root - INFO - step: 29505 loss: 2.6899 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.59 mfu: 49.71% global_avg_ntp_loss: 0.7533 global_avg_top_loss: 1.9365 +[titan] 2025-09-09 23:30:42,669 - root - INFO - lr: 4.9433e-06 gnorm: 0.40 [2 days, 5:55:15<19:10:47] +[titan] 2025-09-09 23:31:14,713 - root - INFO - step: 29510 loss: 2.7504 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7848 global_avg_top_loss: 1.9656 +[titan] 2025-09-09 23:31:14,714 - root - INFO - lr: 4.9406e-06 gnorm: 0.38 [2 days, 5:55:47<19:10:14] +[titan] 2025-09-09 23:31:46,701 - root - INFO - step: 29515 loss: 2.6921 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7494 global_avg_top_loss: 1.9427 +[titan] 2025-09-09 23:31:46,701 - root - INFO - lr: 4.9380e-06 gnorm: 0.39 [2 days, 5:56:19<19:09:40] +[titan] 2025-09-09 23:32:18,822 - root - INFO - step: 29520 loss: 2.6763 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.20 mfu: 49.16% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9143 +[titan] 2025-09-09 23:32:18,822 - root - INFO - lr: 4.9354e-06 gnorm: 0.39 [2 days, 5:56:51<19:09:07] +[titan] 2025-09-09 23:32:50,838 - root - INFO - step: 29525 loss: 3.0576 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.9634 global_avg_top_loss: 2.0942 +[titan] 2025-09-09 23:32:50,838 - root - INFO - lr: 4.9327e-06 gnorm: 0.39 [2 days, 5:57:23<19:08:34] +[titan] 2025-09-09 23:33:22,841 - root - INFO - step: 29530 loss: 2.6854 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7522 global_avg_top_loss: 1.9332 +[titan] 
2025-09-09 23:33:22,841 - root - INFO - lr: 4.9301e-06 gnorm: 0.38 [2 days, 5:57:55<19:08:01] +[titan] 2025-09-09 23:33:54,771 - root - INFO - step: 29535 loss: 2.6469 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:33:54,772 - root - INFO - lr: 4.9274e-06 gnorm: 0.39 [2 days, 5:58:27<19:07:28] +[titan] 2025-09-09 23:34:26,817 - root - INFO - step: 29540 loss: 2.6060 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7195 global_avg_top_loss: 1.8864 +[titan] 2025-09-09 23:34:26,818 - root - INFO - lr: 4.9248e-06 gnorm: 0.47 [2 days, 5:58:59<19:06:54] +[titan] 2025-09-09 23:34:58,816 - root - INFO - step: 29545 loss: 2.6410 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9117 +[titan] 2025-09-09 23:34:58,816 - root - INFO - lr: 4.9222e-06 gnorm: 0.38 [2 days, 5:59:31<19:06:21] +[titan] 2025-09-09 23:35:24,537 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:35:30,940 - root - INFO - step: 29550 loss: 2.7671 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.7879 global_avg_top_loss: 1.9792 +[titan] 2025-09-09 23:35:30,940 - root - INFO - lr: 4.9195e-06 gnorm: 0.45 [2 days, 6:00:03<19:05:48] +[titan] 2025-09-09 23:36:03,162 - root - INFO - step: 29555 loss: 2.6948 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.67 mfu: 49.01% global_avg_ntp_loss: 0.7547 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 23:36:03,162 - root - INFO - lr: 4.9169e-06 gnorm: 0.40 [2 days, 6:00:35<19:05:15] +[titan] 2025-09-09 23:36:34,999 - root - INFO - step: 29560 loss: 2.7432 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9657 +[titan] 2025-09-09 23:36:35,000 - root - INFO - lr: 4.9143e-06 gnorm: 0.39 [2 days, 6:01:07<19:04:42] +[titan] 2025-09-09 23:37:06,856 - root - INFO - step: 29565 loss: 2.7192 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.24 mfu: 49.57% global_avg_ntp_loss: 0.7649 global_avg_top_loss: 1.9543 +[titan] 2025-09-09 23:37:06,856 - root - INFO - lr: 4.9117e-06 gnorm: 0.39 [2 days, 6:01:39<19:04:08] +[titan] 2025-09-09 23:37:38,967 - root - INFO - step: 29570 loss: 2.6684 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.34 mfu: 49.17% global_avg_ntp_loss: 0.7420 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 23:37:38,968 - root - INFO - lr: 4.9090e-06 gnorm: 0.38 [2 days, 6:02:11<19:03:35] +[titan] 2025-09-09 23:38:10,969 - root - INFO - step: 29575 loss: 2.6341 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.34% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9081 +[titan] 2025-09-09 23:38:10,969 - root - INFO - lr: 4.9064e-06 gnorm: 0.38 [2 days, 6:02:43<19:03:02] +[titan] 2025-09-09 23:38:43,179 - root - INFO - step: 29580 loss: 2.5590 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.86 mfu: 49.02% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8656 +[titan] 
2025-09-09 23:38:43,179 - root - INFO - lr: 4.9038e-06 gnorm: 0.40 [2 days, 6:03:15<19:02:29] +[titan] 2025-09-09 23:39:15,183 - root - INFO - step: 29585 loss: 2.7235 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7690 global_avg_top_loss: 1.9545 +[titan] 2025-09-09 23:39:15,183 - root - INFO - lr: 4.9011e-06 gnorm: 0.39 [2 days, 6:03:47<19:01:56] +[titan] 2025-09-09 23:39:47,245 - root - INFO - step: 29590 loss: 2.8727 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.09 mfu: 49.25% global_avg_ntp_loss: 0.8667 global_avg_top_loss: 2.0060 +[titan] 2025-09-09 23:39:47,246 - root - INFO - lr: 4.8985e-06 gnorm: 0.39 [2 days, 6:04:19<19:01:22] +[titan] 2025-09-09 23:40:19,391 - root - INFO - step: 29595 loss: 2.6904 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7493 global_avg_top_loss: 1.9411 +[titan] 2025-09-09 23:40:19,391 - root - INFO - lr: 4.8959e-06 gnorm: 0.39 [2 days, 6:04:51<19:00:49] +[titan] 2025-09-09 23:40:44,958 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:40:51,411 - root - INFO - step: 29600 loss: 2.6792 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7478 global_avg_top_loss: 1.9314 +[titan] 2025-09-09 23:40:51,412 - root - INFO - lr: 4.8933e-06 gnorm: 0.39 [2 days, 6:05:24<19:00:16] +[titan] 2025-09-09 23:41:23,486 - root - INFO - step: 29605 loss: 2.6825 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9341 +[titan] 2025-09-09 23:41:23,486 - root - INFO - lr: 4.8907e-06 gnorm: 0.40 [2 days, 6:05:56<18:59:43] +[titan] 2025-09-09 23:41:55,558 - root - INFO - step: 29610 loss: 2.6834 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9349 +[titan] 2025-09-09 23:41:55,559 - root - INFO - lr: 4.8880e-06 gnorm: 0.41 [2 days, 6:06:28<18:59:10] +[titan] 2025-09-09 23:42:27,592 - root - INFO - step: 29615 loss: 2.7094 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7609 global_avg_top_loss: 1.9485 +[titan] 2025-09-09 23:42:27,592 - root - INFO - lr: 4.8854e-06 gnorm: 0.39 [2 days, 6:07:00<18:58:36] +[titan] 2025-09-09 23:42:59,640 - root - INFO - step: 29620 loss: 3.0617 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.9728 global_avg_top_loss: 2.0889 +[titan] 2025-09-09 23:42:59,640 - root - INFO - lr: 4.8828e-06 gnorm: 0.39 [2 days, 6:07:32<18:58:03] +[titan] 2025-09-09 23:43:31,486 - root - INFO - step: 29625 loss: 2.8192 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.8222 global_avg_top_loss: 1.9970 +[titan] 2025-09-09 23:43:31,486 - root - INFO - lr: 4.8802e-06 gnorm: 0.39 [2 days, 6:08:04<18:57:30] +[titan] 2025-09-09 23:44:03,370 - root - INFO - step: 29630 loss: 2.6848 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9330 +[titan] 
2025-09-09 23:44:03,370 - root - INFO - lr: 4.8776e-06 gnorm: 0.38 [2 days, 6:08:35<18:56:57] +[titan] 2025-09-09 23:44:35,405 - root - INFO - step: 29635 loss: 2.6599 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7397 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 23:44:35,405 - root - INFO - lr: 4.8749e-06 gnorm: 0.38 [2 days, 6:09:08<18:56:24] +[titan] 2025-09-09 23:45:07,457 - root - INFO - step: 29640 loss: 2.7675 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 23:45:07,457 - root - INFO - lr: 4.8723e-06 gnorm: 0.39 [2 days, 6:09:40<18:55:50] +[titan] 2025-09-09 23:45:39,587 - root - INFO - step: 29645 loss: 2.6829 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7513 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 23:45:39,587 - root - INFO - lr: 4.8697e-06 gnorm: 0.38 [2 days, 6:10:12<18:55:17] +[titan] 2025-09-09 23:46:05,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:46:11,475 - root - INFO - step: 29650 loss: 2.6912 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9395 +[titan] 2025-09-09 23:46:11,475 - root - INFO - lr: 4.8671e-06 gnorm: 0.64 [2 days, 6:10:44<18:54:44] +[titan] 2025-09-09 23:46:43,410 - root - INFO - step: 29655 loss: 2.6293 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7226 global_avg_top_loss: 1.9067 +[titan] 2025-09-09 23:46:43,410 - root - INFO - lr: 4.8645e-06 gnorm: 0.38 [2 days, 6:11:16<18:54:11] +[titan] 2025-09-09 23:47:15,405 - root - INFO - step: 29660 loss: 2.7703 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.7866 global_avg_top_loss: 1.9838 +[titan] 2025-09-09 23:47:15,405 - root - INFO - lr: 4.8619e-06 gnorm: 0.39 [2 days, 6:11:48<18:53:38] +[titan] 2025-09-09 23:47:47,232 - root - INFO - step: 29665 loss: 2.7646 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.69 mfu: 49.61% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 23:47:47,232 - root - INFO - lr: 4.8593e-06 gnorm: 0.38 [2 days, 6:12:19<18:53:04] +[titan] 2025-09-09 23:48:19,241 - root - INFO - step: 29670 loss: 2.6916 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9378 +[titan] 2025-09-09 23:48:19,241 - root - INFO - lr: 4.8567e-06 gnorm: 0.38 [2 days, 6:12:51<18:52:31] +[titan] 2025-09-09 23:48:51,377 - root - INFO - step: 29675 loss: 2.7000 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 23:48:51,378 - root - INFO - lr: 4.8541e-06 gnorm: 0.38 [2 days, 6:13:23<18:51:58] +[titan] 2025-09-09 23:49:23,199 - root - INFO - step: 29680 loss: 2.7927 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9967 +[titan] 
2025-09-09 23:49:23,199 - root - INFO - lr: 4.8514e-06 gnorm: 0.40 [2 days, 6:13:55<18:51:25] +[titan] 2025-09-09 23:49:55,236 - root - INFO - step: 29685 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9647 +[titan] 2025-09-09 23:49:55,236 - root - INFO - lr: 4.8488e-06 gnorm: 0.39 [2 days, 6:14:27<18:50:52] +[titan] 2025-09-09 23:50:27,227 - root - INFO - step: 29690 loss: 2.7406 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.7764 global_avg_top_loss: 1.9642 +[titan] 2025-09-09 23:50:27,228 - root - INFO - lr: 4.8462e-06 gnorm: 0.39 [2 days, 6:14:59<18:50:18] +[titan] 2025-09-09 23:50:59,184 - root - INFO - step: 29695 loss: 2.6932 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 23:50:59,184 - root - INFO - lr: 4.8436e-06 gnorm: 0.40 [2 days, 6:15:31<18:49:45] +[titan] 2025-09-09 23:51:05,808 - root - INFO - Dumping profiler traces at step 29696 +[titan] 2025-09-09 23:51:05,880 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 23:51:24,941 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:51:31,380 - root - INFO - step: 29700 loss: 2.7486 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.07 mfu: 49.05% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9704 +[titan] 2025-09-09 23:51:31,380 - root - INFO - lr: 4.8410e-06 gnorm: 0.39 [2 days, 6:16:03<18:49:12] +[titan] 2025-09-09 23:52:03,289 - root - INFO - step: 29705 loss: 2.6199 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.43 mfu: 49.49% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.8945 +[titan] 2025-09-09 23:52:03,289 - root - INFO - lr: 4.8384e-06 gnorm: 0.39 [2 days, 6:16:35<18:48:39] +[titan] 2025-09-09 23:52:35,048 - root - INFO - step: 29710 loss: 2.6929 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.75 mfu: 49.72% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9372 +[titan] 2025-09-09 23:52:35,048 - root - INFO - lr: 4.8358e-06 gnorm: 0.40 [2 days, 6:17:07<18:48:05] +[titan] 2025-09-09 23:53:06,855 - root - INFO - step: 29715 loss: 2.6783 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7491 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 23:53:06,855 - root - INFO - lr: 4.8332e-06 gnorm: 0.38 [2 days, 6:17:39<18:47:32] +[titan] 2025-09-09 23:53:38,701 - root - INFO - step: 29720 loss: 2.7316 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9604 +[titan] 2025-09-09 23:53:38,701 - root - INFO - lr: 4.8306e-06 gnorm: 0.39 [2 days, 6:18:11<18:46:59] +[titan] 2025-09-09 23:54:10,529 - root - INFO - step: 29725 loss: 2.6654 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.68 mfu: 49.61% global_avg_ntp_loss: 0.7409 global_avg_top_loss: 1.9245 +[titan] 2025-09-09 23:54:10,529 - root - INFO - lr: 4.8280e-06 gnorm: 0.39 [2 days, 6:18:43<18:46:26] +[titan] 2025-09-09 23:54:42,620 - root - INFO - step: 29730 loss: 2.6882 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.65 mfu: 49.21% global_avg_ntp_loss: 0.7527 global_avg_top_loss: 1.9356 +[titan] 
2025-09-09 23:54:42,620 - root - INFO - lr: 4.8254e-06 gnorm: 0.38 [2 days, 6:19:15<18:45:53] +[titan] 2025-09-09 23:55:14,622 - root - INFO - step: 29735 loss: 3.1121 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 1.0012 global_avg_top_loss: 2.1108 +[titan] 2025-09-09 23:55:14,622 - root - INFO - lr: 4.8228e-06 gnorm: 0.41 [2 days, 6:19:47<18:45:19] +[titan] 2025-09-09 23:55:46,562 - root - INFO - step: 29740 loss: 2.7354 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.7733 global_avg_top_loss: 1.9620 +[titan] 2025-09-09 23:55:46,563 - root - INFO - lr: 4.8202e-06 gnorm: 0.39 [2 days, 6:20:19<18:44:46] +[titan] 2025-09-09 23:56:18,418 - root - INFO - step: 29745 loss: 2.8026 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.24 mfu: 49.57% global_avg_ntp_loss: 0.8278 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 23:56:18,419 - root - INFO - lr: 4.8176e-06 gnorm: 0.38 [2 days, 6:20:51<18:44:13] +[titan] 2025-09-09 23:56:44,039 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:56:50,340 - root - INFO - step: 29750 loss: 2.5971 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.8755 +[titan] 2025-09-09 23:56:50,341 - root - INFO - lr: 4.8150e-06 gnorm: 1.13 [2 days, 6:21:22<18:43:40] +[titan] 2025-09-09 23:57:22,520 - root - INFO - step: 29755 loss: 3.0763 memory: 122.03GiB(87.57%) tps: 10,183 tflops: 485.30 mfu: 49.07% global_avg_ntp_loss: 0.9800 global_avg_top_loss: 2.0963 +[titan] 2025-09-09 23:57:22,521 - root - INFO - lr: 4.8124e-06 gnorm: 0.37 [2 days, 6:21:55<18:43:07] +[titan] 2025-09-09 23:57:54,378 - root - INFO - step: 29760 loss: 2.6953 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 23:57:54,378 - root - INFO - lr: 4.8099e-06 gnorm: 0.43 [2 days, 6:22:26<18:42:33] +[titan] 2025-09-09 23:58:26,318 - root - INFO - step: 29765 loss: 2.6468 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7329 global_avg_top_loss: 1.9139 +[titan] 2025-09-09 23:58:26,318 - root - INFO - lr: 4.8073e-06 gnorm: 0.43 [2 days, 6:22:58<18:42:00] +[titan] 2025-09-09 23:58:58,421 - root - INFO - step: 29770 loss: 2.6920 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9385 +[titan] 2025-09-09 23:58:58,421 - root - INFO - lr: 4.8047e-06 gnorm: 0.40 [2 days, 6:23:31<18:41:27] +[titan] 2025-09-09 23:59:30,709 - root - INFO - step: 29775 loss: 2.6059 memory: 122.03GiB(87.57%) tps: 10,149 tflops: 483.68 mfu: 48.91% global_avg_ntp_loss: 0.7144 global_avg_top_loss: 1.8915 +[titan] 2025-09-09 23:59:30,710 - root - INFO - lr: 4.8021e-06 gnorm: 0.37 [2 days, 6:24:03<18:40:54] +[titan] 2025-09-10 00:00:02,497 - root - INFO - step: 29780 loss: 2.6640 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.29 mfu: 49.68% global_avg_ntp_loss: 0.7417 global_avg_top_loss: 1.9223 +[titan] 
2025-09-10 00:00:02,498 - root - INFO - lr: 4.7995e-06 gnorm: 0.40 [2 days, 6:24:35<18:40:21] +[titan] 2025-09-10 00:00:34,400 - root - INFO - step: 29785 loss: 2.5482 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.6895 global_avg_top_loss: 1.8587 +[titan] 2025-09-10 00:00:34,400 - root - INFO - lr: 4.7969e-06 gnorm: 0.37 [2 days, 6:25:06<18:39:47] +[titan] 2025-09-10 00:01:06,418 - root - INFO - step: 29790 loss: 2.7028 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9431 +[titan] 2025-09-10 00:01:06,418 - root - INFO - lr: 4.7943e-06 gnorm: 0.39 [2 days, 6:25:39<18:39:14] +[titan] 2025-09-10 00:01:38,373 - root - INFO - step: 29795 loss: 2.6177 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7188 global_avg_top_loss: 1.8989 +[titan] 2025-09-10 00:01:38,373 - root - INFO - lr: 4.7917e-06 gnorm: 0.39 [2 days, 6:26:10<18:38:41] +[titan] 2025-09-10 00:02:03,862 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:02:10,276 - root - INFO - step: 29800 loss: 2.6450 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9106 +[titan] 2025-09-10 00:02:10,277 - root - INFO - lr: 4.7892e-06 gnorm: 0.39 [2 days, 6:26:42<18:38:08] +[titan] 2025-09-10 00:02:42,323 - root - INFO - step: 29805 loss: 2.8560 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.8431 global_avg_top_loss: 2.0129 +[titan] 2025-09-10 00:02:42,324 - root - INFO - lr: 4.7866e-06 gnorm: 0.38 [2 days, 6:27:14<18:37:35] +[titan] 2025-09-10 00:03:14,335 - root - INFO - step: 29810 loss: 2.6454 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7318 global_avg_top_loss: 1.9136 +[titan] 2025-09-10 00:03:14,335 - root - INFO - lr: 4.7840e-06 gnorm: 0.38 [2 days, 6:27:46<18:37:01] +[titan] 2025-09-10 00:03:46,354 - root - INFO - step: 29815 loss: 2.5502 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.6868 global_avg_top_loss: 1.8634 +[titan] 2025-09-10 00:03:46,354 - root - INFO - lr: 4.7814e-06 gnorm: 0.38 [2 days, 6:28:18<18:36:28] +[titan] 2025-09-10 00:04:18,237 - root - INFO - step: 29820 loss: 2.6542 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7341 global_avg_top_loss: 1.9202 +[titan] 2025-09-10 00:04:18,237 - root - INFO - lr: 4.7788e-06 gnorm: 0.41 [2 days, 6:28:50<18:35:55] +[titan] 2025-09-10 00:04:50,377 - root - INFO - step: 29825 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.7089 global_avg_top_loss: 1.8816 +[titan] 2025-09-10 00:04:50,377 - root - INFO - lr: 4.7762e-06 gnorm: 0.37 [2 days, 6:29:22<18:35:22] +[titan] 2025-09-10 00:05:22,383 - root - INFO - step: 29830 loss: 2.5534 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8600 +[titan] 
2025-09-10 00:05:22,383 - root - INFO - lr: 4.7737e-06 gnorm: 0.39 [2 days, 6:29:54<18:34:49] +[titan] 2025-09-10 00:05:54,578 - root - INFO - step: 29835 loss: 2.6753 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7460 global_avg_top_loss: 1.9292 +[titan] 2025-09-10 00:05:54,578 - root - INFO - lr: 4.7711e-06 gnorm: 0.38 [2 days, 6:30:27<18:34:16] +[titan] 2025-09-10 00:06:26,565 - root - INFO - step: 29840 loss: 2.7302 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9556 +[titan] 2025-09-10 00:06:26,566 - root - INFO - lr: 4.7685e-06 gnorm: 0.38 [2 days, 6:30:59<18:33:42] +[titan] 2025-09-10 00:06:58,802 - root - INFO - step: 29845 loss: 2.6537 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.7356 global_avg_top_loss: 1.9181 +[titan] 2025-09-10 00:06:58,803 - root - INFO - lr: 4.7659e-06 gnorm: 0.38 [2 days, 6:31:31<18:33:09] +[titan] 2025-09-10 00:07:24,446 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:07:30,805 - root - INFO - step: 29850 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9617 +[titan] 2025-09-10 00:07:30,806 - root - INFO - lr: 4.7634e-06 gnorm: 0.38 [2 days, 6:32:03<18:32:36] +[titan] 2025-09-10 00:08:02,841 - root - INFO - step: 29855 loss: 2.7087 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9483 +[titan] 2025-09-10 00:08:02,841 - root - INFO - lr: 4.7608e-06 gnorm: 0.39 [2 days, 6:32:35<18:32:03] +[titan] 2025-09-10 00:08:34,683 - root - INFO - step: 29860 loss: 2.6531 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9176 +[titan] 2025-09-10 00:08:34,684 - root - INFO - lr: 4.7582e-06 gnorm: 0.39 [2 days, 6:33:07<18:31:30] +[titan] 2025-09-10 00:09:06,601 - root - INFO - step: 29865 loss: 2.7168 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7646 global_avg_top_loss: 1.9522 +[titan] 2025-09-10 00:09:06,601 - root - INFO - lr: 4.7557e-06 gnorm: 0.39 [2 days, 6:33:39<18:30:56] +[titan] 2025-09-10 00:09:38,451 - root - INFO - step: 29870 loss: 2.7116 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.34 mfu: 49.58% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9462 +[titan] 2025-09-10 00:09:38,451 - root - INFO - lr: 4.7531e-06 gnorm: 0.39 [2 days, 6:34:11<18:30:23] +[titan] 2025-09-10 00:10:10,458 - root - INFO - step: 29875 loss: 2.7238 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9559 +[titan] 2025-09-10 00:10:10,458 - root - INFO - lr: 4.7505e-06 gnorm: 0.40 [2 days, 6:34:43<18:29:50] +[titan] 2025-09-10 00:10:42,571 - root - INFO - step: 29880 loss: 2.6896 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9384 +[titan] 
2025-09-10 00:10:42,571 - root - INFO - lr: 4.7479e-06 gnorm: 0.40 [2 days, 6:35:15<18:29:17] +[titan] 2025-09-10 00:11:14,690 - root - INFO - step: 29885 loss: 3.1702 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.23 mfu: 49.16% global_avg_ntp_loss: 1.0232 global_avg_top_loss: 2.1470 +[titan] 2025-09-10 00:11:14,690 - root - INFO - lr: 4.7454e-06 gnorm: 0.39 [2 days, 6:35:47<18:28:44] +[titan] 2025-09-10 00:11:46,939 - root - INFO - step: 29890 loss: 2.6405 memory: 122.03GiB(87.57%) tps: 10,161 tflops: 484.27 mfu: 48.97% global_avg_ntp_loss: 0.7311 global_avg_top_loss: 1.9094 +[titan] 2025-09-10 00:11:46,939 - root - INFO - lr: 4.7428e-06 gnorm: 0.38 [2 days, 6:36:19<18:28:11] +[titan] 2025-09-10 00:12:19,176 - root - INFO - step: 29895 loss: 2.7750 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.7865 global_avg_top_loss: 1.9884 +[titan] 2025-09-10 00:12:19,176 - root - INFO - lr: 4.7402e-06 gnorm: 0.60 [2 days, 6:36:51<18:27:37] +[titan] 2025-09-10 00:12:44,694 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:12:51,251 - root - INFO - step: 29900 loss: 2.6360 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9101 +[titan] 2025-09-10 00:12:51,251 - root - INFO - lr: 4.7377e-06 gnorm: 0.40 [2 days, 6:37:23<18:27:04] +[titan] 2025-09-10 00:13:23,319 - root - INFO - step: 29905 loss: 2.7614 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9733 +[titan] 2025-09-10 00:13:23,319 - root - INFO - lr: 4.7351e-06 gnorm: 0.40 [2 days, 6:37:55<18:26:31] +[titan] 2025-09-10 00:13:55,301 - root - INFO - step: 29910 loss: 2.5872 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.32 mfu: 49.37% global_avg_ntp_loss: 0.7106 global_avg_top_loss: 1.8765 +[titan] 2025-09-10 00:13:55,301 - root - INFO - lr: 4.7326e-06 gnorm: 0.38 [2 days, 6:38:27<18:25:58] +[titan] 2025-09-10 00:14:27,418 - root - INFO - step: 29915 loss: 2.9859 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.9417 global_avg_top_loss: 2.0442 +[titan] 2025-09-10 00:14:27,418 - root - INFO - lr: 4.7300e-06 gnorm: 0.39 [2 days, 6:38:59<18:25:25] +[titan] 2025-09-10 00:14:59,292 - root - INFO - step: 29920 loss: 2.7048 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.7620 global_avg_top_loss: 1.9428 +[titan] 2025-09-10 00:14:59,292 - root - INFO - lr: 4.7274e-06 gnorm: 0.38 [2 days, 6:39:31<18:24:52] +[titan] 2025-09-10 00:15:31,363 - root - INFO - step: 29925 loss: 2.6873 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7512 global_avg_top_loss: 1.9362 +[titan] 2025-09-10 00:15:31,364 - root - INFO - lr: 4.7249e-06 gnorm: 0.42 [2 days, 6:40:03<18:24:18] +[titan] 2025-09-10 00:16:03,583 - root - INFO - step: 29930 loss: 2.6387 memory: 122.03GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.7322 global_avg_top_loss: 1.9065 +[titan] 
2025-09-10 00:16:03,584 - root - INFO - lr: 4.7223e-06 gnorm: 0.39 [2 days, 6:40:36<18:23:45] +[titan] 2025-09-10 00:16:35,629 - root - INFO - step: 29935 loss: 2.6520 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7334 global_avg_top_loss: 1.9186 +[titan] 2025-09-10 00:16:35,629 - root - INFO - lr: 4.7198e-06 gnorm: 0.40 [2 days, 6:41:08<18:23:12] +[titan] 2025-09-10 00:17:07,616 - root - INFO - step: 29940 loss: 2.7165 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9486 +[titan] 2025-09-10 00:17:07,617 - root - INFO - lr: 4.7172e-06 gnorm: 0.39 [2 days, 6:41:40<18:22:39] +[titan] 2025-09-10 00:17:39,815 - root - INFO - step: 29945 loss: 2.7217 memory: 122.03GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9552 +[titan] 2025-09-10 00:17:39,816 - root - INFO - lr: 4.7146e-06 gnorm: 0.39 [2 days, 6:42:12<18:22:06] +[titan] 2025-09-10 00:18:05,462 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:18:11,908 - root - INFO - step: 29950 loss: 2.6150 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.20% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.8928 +[titan] 2025-09-10 00:18:11,908 - root - INFO - lr: 4.7121e-06 gnorm: 0.38 [2 days, 6:42:44<18:21:33] +[titan] 2025-09-10 00:18:44,032 - root - INFO - step: 29955 loss: 2.7258 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.15 mfu: 49.16% global_avg_ntp_loss: 0.7661 global_avg_top_loss: 1.9597 +[titan] 2025-09-10 00:18:44,032 - root - INFO - lr: 4.7095e-06 gnorm: 0.40 [2 days, 6:43:16<18:21:00] +[titan] 2025-09-10 00:19:16,105 - root - INFO - step: 29960 loss: 2.5972 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.7099 global_avg_top_loss: 1.8873 +[titan] 2025-09-10 00:19:16,105 - root - INFO - lr: 4.7070e-06 gnorm: 0.39 [2 days, 6:43:48<18:20:26] +[titan] 2025-09-10 00:19:48,028 - root - INFO - step: 29965 loss: 3.8520 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 1.3764 global_avg_top_loss: 2.4756 +[titan] 2025-09-10 00:19:48,029 - root - INFO - lr: 4.7044e-06 gnorm: 0.41 [2 days, 6:44:20<18:19:53] +[titan] 2025-09-10 00:20:20,186 - root - INFO - step: 29970 loss: 2.6966 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.7559 global_avg_top_loss: 1.9408 +[titan] 2025-09-10 00:20:20,187 - root - INFO - lr: 4.7019e-06 gnorm: 0.41 [2 days, 6:44:52<18:19:20] +[titan] 2025-09-10 00:20:52,285 - root - INFO - step: 29975 loss: 3.1491 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 1.0164 global_avg_top_loss: 2.1327 +[titan] 2025-09-10 00:20:52,285 - root - INFO - lr: 4.6993e-06 gnorm: 0.38 [2 days, 6:45:24<18:18:47] +[titan] 2025-09-10 00:21:24,143 - root - INFO - step: 29980 loss: 2.7489 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.7802 global_avg_top_loss: 1.9688 +[titan] 
2025-09-10 00:21:24,143 - root - INFO - lr: 4.6968e-06 gnorm: 0.38 [2 days, 6:45:56<18:18:14] +[titan] 2025-09-10 00:21:56,300 - root - INFO - step: 29985 loss: 2.6586 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.65 mfu: 49.11% global_avg_ntp_loss: 0.7358 global_avg_top_loss: 1.9227 +[titan] 2025-09-10 00:21:56,300 - root - INFO - lr: 4.6942e-06 gnorm: 0.39 [2 days, 6:46:28<18:17:41] +[titan] 2025-09-10 00:22:28,294 - root - INFO - step: 29990 loss: 2.5988 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.7105 global_avg_top_loss: 1.8884 +[titan] 2025-09-10 00:22:28,294 - root - INFO - lr: 4.6917e-06 gnorm: 0.39 [2 days, 6:47:00<18:17:07] +[titan] 2025-09-10 00:23:00,308 - root - INFO - step: 29995 loss: 2.5806 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7012 global_avg_top_loss: 1.8794 +[titan] 2025-09-10 00:23:00,308 - root - INFO - lr: 4.6891e-06 gnorm: 0.38 [2 days, 6:47:32<18:16:34] +[titan] 2025-09-10 00:23:25,912 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. +[titan] 2025-09-10 00:23:32,340 - root - INFO - step: 30000 loss: 2.5194 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.6778 global_avg_top_loss: 1.8416 +[titan] 2025-09-10 00:23:32,341 - root - INFO - lr: 4.6866e-06 gnorm: 0.39 [2 days, 6:48:04<18:16:01] +[titan] 2025-09-10 00:23:32,341 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-10 00:24:07,182 - root - INFO - [GC] GC collection invoked by checkpointer. 0.02 seconds. +[titan] 2025-09-10 00:24:07,182 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 34.84 seconds. 
+[rank7]:[E910 01:14:07.962844308 ProcessGroupNCCL.cpp:629] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1842028, OpType=_ALLGATHER_BASE, NumelIn=49152512, NumelOut=393220096, Timeout(ms)=3000000) ran for 3000087 milliseconds before timing out. +[rank7]:[E910 01:14:07.962980528 ProcessGroupNCCL.cpp:2168] [PG ID 0 PG GUID 0(default_pg) Rank 7] failure detected by watchdog at work sequence id: 1842028 PG status: last enqueued work: 1842028, last completed work: 1842027 +[rank7]:[E910 01:14:07.963565989 ProcessGroupNCCL.cpp:664] Stack trace of the failed collective: +#0 all_gather_into_tensor from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:3798 +#1 wrapper from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/c10d_logger.py:81 +#2 foreach_all_gather from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py:165 +#3 decorate_context from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/utils/_contextlib.py:116 +#4 unshard from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py:263 +#5 pre_forward from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py:334 +#6 _pre_forward from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_state.py:239 +#7 _fn from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:745 +#8 fsdp_hook_wrapper from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_state.py:71 +#9 inner from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py:1772 +#10 _call_impl from 
/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py:1845 +#11 _wrapped_call_impl from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py:1739 +#12 main from /home/cvm/flame/flame/train.py:692 +#13 wrapper from /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py:355 +#14 from /home/cvm/flame/flame/train.py:896 +#15 _run_code from :88 +#16 _run_module_as_main from :198 + +[rank7]:[E910 01:14:07.347524557 ProcessGroupNCCL.cpp:1753] [PG ID 0 PG GUID 0(default_pg) Rank 7] Received a dump signal due to a collective timeout from this local rank and we will try our best to dump the debug info. Last enqueued NCCL work: 1842028, last completed NCCL work: 1842027.This is most likely caused by incorrect usages of collectives, e.g., wrong sizes used across ranks, the order of collectives is not same for all ranks or the scheduled collective, for some reason, didn't run. Additionally, this can be caused by GIL deadlock or other reasons such as network errors or bugs in the communications library (e.g. NCCL), etc. +[rank7]:[E910 01:14:07.347689231 ProcessGroupNCCL.cpp:1554] [PG ID 0 PG GUID 0(default_pg) Rank 7] ProcessGroupNCCL preparing to dump debug info. Include stack trace: 1 +[rank7]:[E910 01:14:07.676188587 ProcessGroupNCCL.cpp:681] [Rank 7] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. +[rank7]:[E910 01:14:07.676211307 ProcessGroupNCCL.cpp:695] [Rank 7] To avoid data inconsistency, we are taking the entire process down. 
+[rank7]:[E910 01:14:07.677769880 ProcessGroupNCCL.cpp:1895] [PG ID 0 PG GUID 0(default_pg) Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1842028, OpType=_ALLGATHER_BASE, NumelIn=49152512, NumelOut=393220096, Timeout(ms)=3000000) ran for 3000087 milliseconds before timing out. +Exception raised from checkTimeout at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:632 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7ac37716c1b6 in /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/lib/libc10.so) +frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x2b4 (0x7ac3251fec74 in /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x890 (0x7ac3252007d0 in /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7ac3252016ed in /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0x145c0 (0x7ac3775cd5c0 in /home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/lib/libtorch.so) +frame #5: + 0x9caa4 (0x7ac37809caa4 in /lib/x86_64-linux-gnu/libc.so.6) +frame #6: + 0x129c3c (0x7ac378129c3c in /lib/x86_64-linux-gnu/libc.so.6) + +Fatal Python error: Aborted + +Thread 0x00007aacf47f86c0 (most recent call first): + + +Thread 0x00007aacf4ff96c0 (most recent call first): + + +Thread 0x00007aacf57fa6c0 (most recent call first): + + +Thread 0x00007aacf5ffb6c0 (most recent call first): + + +Thread 0x00007aacf67fc6c0 (most recent call first): + + +Thread 0x00007aacf77fe6c0 (most recent call first): + + +Thread 0x00007aacf6ffd6c0 (most recent call first): + + +Thread 0x00007aacf7fff6c0 (most recent call first): + + +Thread 
0x00007aad24bff6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 359 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 655 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/tqdm/_monitor.py", line 60 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad356e66c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad35fe76c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad367e86c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File 
"/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad36fe96c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad377ea6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad37feb6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad387ec6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File 
"/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad38fed6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad397ee6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3aff16c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", 
line 1032 in _bootstrap + +Thread 0x00007aad3b7f26c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3bff36c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3cff56c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3d7f66c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + 
File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3dff76c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3e7f86c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3eff96c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3f7fa6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File 
"/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3fffb6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad407fc6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad40ffd6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", 
line 1032 in _bootstrap + +Thread 0x00007aad417fe6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad41fff6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfd6ffd6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfd77fe6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + 
File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfd7fff6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfe11e46c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfe19e56c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3c7f46c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File 
"/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad39fef6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007aad3a7f06c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007abfbcff96c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 355 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/multiprocessing/queues.py", line 251 in _feed + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", 
line 1032 in _bootstrap + +Thread 0x00007abf9cff96c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 359 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 655 in wait + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/tqdm/_monitor.py", line 60 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007ac07a73e6c0 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/_inductor/compile_worker/subproc_pool.py", line 53 in _recv_msg + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/_inductor/compile_worker/subproc_pool.py", line 161 in _read_thread + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1012 in run + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1075 in _bootstrap_inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/threading.py", line 1032 in _bootstrap + +Thread 0x00007ac3782f6740 (most recent call first): + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py", line 105 in split_with_sizes_copy + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/_ops.py", line 1123 in __call__ + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py", line 288 in foreach_all_gather_copy_out + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116 in decorate_context + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py", line 288 in 
wait_for_unshard + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py", line 335 in pre_forward + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_state.py", line 239 in _pre_forward + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 745 in _fn + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/fsdp/_fully_shard/_fsdp_state.py", line 71 in fsdp_hook_wrapper + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1772 in inner + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1845 in _call_impl + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739 in _wrapped_call_impl + File "/home/cvm/flame/flame/train.py", line 692 in main + File "/home/cvm/miniconda3/envs/flame-env/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355 in wrapper + File "/home/cvm/flame/flame/train.py", line 896 in + File "", line 88 in _run_code + File "", line 198 in _run_module_as_main + +Extension modules: numpy._core._multiarray_umath, numpy.linalg._umath_linalg, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, zstandard.backend_c, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._pcg64, numpy.random._mt19937, numpy.random._generator, numpy.random._philox, numpy.random._sfc64, numpy.random.mtrand, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, 
pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, numexpr.interpreter, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, charset_normalizer.md, yaml._yaml, pyarrow._parquet, pyarrow._fs, pyarrow._azurefs, pyarrow._hdfs, pyarrow._gcsfs, pyarrow._s3fs, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, frozenlist._frozenlist, xxhash._xxhash, pyarrow._acero, pyarrow._csv, pyarrow._json, pyarrow._substrait, pyarrow._dataset, pyarrow._dataset_orc, pyarrow._parquet_encryption, pyarrow._dataset_parquet_encryption, pyarrow._dataset_parquet, markupsafe._speedups, PIL._imaging, sklearn.__check_build._check_build, scipy._lib._ccallback_c, scipy.sparse._sparsetools, _csparsetools, _cyutility, scipy._cyutility, scipy.sparse._csparsetools, psutil._psutil_linux, psutil._psutil_posix, scipy.special._ufuncs_cxx, scipy.special._ellip_harm_2, scipy.special._special_ufuncs, scipy.special._gufuncs, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, 
scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_schur_sqrtm, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.spatial._ckdtree, scipy._lib.messagestream, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._hausdorff, scipy.spatial._distance_wrap, scipy.spatial.transform._rotation, scipy.spatial.transform._rigid_transform, scipy.optimize._group_columns, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._slsqplib, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy._lib._uarray._uarray, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.optimize._direct, scipy.integrate._odepack, scipy.integrate._quadpack, scipy.integrate._vode, scipy.integrate._dop, scipy.integrate._lsoda, scipy.interpolate._fitpack, scipy.interpolate._dfitpack, scipy.interpolate._dierckx, scipy.interpolate._ppoly, scipy.interpolate._interpnd, scipy.interpolate._rbfinterp_pythran, scipy.interpolate._rgi_cython, scipy.special.cython_special, scipy.stats._stats, scipy.stats._biasedurn, scipy.stats._stats_pythran, scipy.stats._levy_stable.levyst, scipy.stats._ansari_swilk_statistics, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.stats._sobol, scipy.stats._qmc_cy, scipy.stats._rcont.rcont, scipy.stats._qmvnt_cy, scipy.ndimage._nd_image, scipy.ndimage._rank_filter_1d, _ni_label, 
scipy.ndimage._ni_label, sklearn._cyutility, sklearn.utils._isfinite, sklearn.utils.sparsefuncs_fast, sklearn.utils.murmurhash, sklearn.utils._openmp_helpers, sklearn.metrics.cluster._expected_mutual_info_fast, sklearn.preprocessing._csr_polynomial_expansion, sklearn.preprocessing._target_encoder_fast, sklearn.metrics._dist_metrics, sklearn.metrics._pairwise_distances_reduction._datasets_pair, sklearn.utils._cython_blas, sklearn.metrics._pairwise_distances_reduction._base, sklearn.metrics._pairwise_distances_reduction._middle_term_computer, sklearn.utils._heap, sklearn.utils._sorting, sklearn.metrics._pairwise_distances_reduction._argkmin, sklearn.metrics._pairwise_distances_reduction._argkmin_classmode, sklearn.utils._vector_sentinel, sklearn.metrics._pairwise_distances_reduction._radius_neighbors, sklearn.metrics._pairwise_distances_reduction._radius_neighbors_classmode, sklearn.metrics._pairwise_fast, cuda_utils, google._upb._message, h5py._errors, h5py.defs, h5py._objects, h5py.h5, h5py.utils, h5py.h5t, h5py.h5s, h5py.h5ac, h5py.h5p, h5py.h5r, h5py._npystrings, h5py._proxy, h5py._conv, h5py.h5z, h5py.h5a, h5py.h5d, h5py.h5ds, h5py.h5g, h5py.h5i, h5py.h5o, h5py.h5f, h5py.h5fd, h5py.h5pl, h5py.h5l, h5py._selector, kiwisolver._cext, regex._regex, sentencepiece._sentencepiece, scipy.io.matlab._mio_utils, scipy.io.matlab._streams, scipy.io.matlab._mio5_utils, __triton_launcher (total: 237) diff --git a/logs/none_lyv0rec_/attempt_0/7/stdout.log b/logs/none_lyv0rec_/attempt_0/7/stdout.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tb/20250909-0619/wandb/debug.log b/tb/20250909-0619/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b1fe6180502b5e0c9a480097879c081c907d671c --- /dev/null +++ b/tb/20250909-0619/wandb/debug.log @@ -0,0 +1,21 @@ +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0 
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439 +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend +2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request +2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected +2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry +2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 
second timeout +2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams. +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed. +2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f4926fcbc9c41d7b06f6280f438620828a91c0ff --- /dev/null +++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log @@ -0,0 +1,4285 @@ +[titan] 2025-09-09 06:19:20,817 - root - INFO - WandB logging enabled +[titan] 2025-09-09 06:19:20,890 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory +[titan] 2025-09-09 06:19:28,408 - root - INFO - ***** Running training ***** +[titan] 2025-09-09 06:19:28,442 - root - INFO -  Training starts at step 20001 +[titan] 2025-09-09 06:19:28,442 - root - INFO -  Number of tokens per sequence = 4,096 +[titan] 2025-09-09 06:19:28,442 - root - INFO -  Gradient Accumulation steps = 2 +[titan] 2025-09-09 06:19:28,442 - root - INFO -  Instantaneous batch size (per device) = 8 +[titan] 2025-09-09 06:19:28,442 - root - INFO -  Global batch size (w. 
parallel, distributed & accumulation) = 128 (524,288 tokens) +[titan] 2025-09-09 06:19:28,443 - root - INFO -  Total optimization steps = 40,000 (20,971,520,000 tokens) +[titan] 2025-09-09 06:19:28,443 - root - INFO -  Warmup steps = 400 (209,715,200 tokens) +[titan] 2025-09-09 06:19:28,443 - root - INFO -  Number of parameters = 6,936,580,096  +[titan] 2025-09-09 06:19:28,443 - root - INFO - Profiling active. Traces will be saved at exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/profile_trace +[titan] 2025-09-09 06:20:20,235 - root - INFO - step: 20005 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 5,524 tflops: 263.28 mfu: 26.62% global_avg_ntp_loss: 0.7818 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 06:20:20,236 - root - INFO - lr: 1.1139e-05 gnorm: 0.33 [1 day, 12:44:53<1 day, 12:43:46] +[titan] 2025-09-09 06:20:50,050 - root - INFO - step: 20010 loss: 2.9600 memory: 122.03GiB(87.57%) tps: 10,991 tflops: 523.83 mfu: 52.97% global_avg_ntp_loss: 0.8794 global_avg_top_loss: 2.0806 +[titan] 2025-09-09 06:20:50,050 - root - INFO - lr: 1.1135e-05 gnorm: 0.44 [1 day, 12:45:22<1 day, 12:43:10] +[titan] 2025-09-09 06:21:19,936 - root - INFO - step: 20015 loss: 2.7626 memory: 122.03GiB(87.57%) tps: 10,965 tflops: 522.56 mfu: 52.84% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 06:21:19,937 - root - INFO - lr: 1.1132e-05 gnorm: 0.34 [1 day, 12:45:52<1 day, 12:42:34] +[titan] 2025-09-09 06:21:49,995 - root - INFO - step: 20020 loss: 2.7556 memory: 122.03GiB(87.57%) tps: 10,902 tflops: 519.56 mfu: 52.53% global_avg_ntp_loss: 0.7861 global_avg_top_loss: 1.9694 +[titan] 2025-09-09 06:21:49,996 - root - INFO - lr: 1.1128e-05 gnorm: 0.34 [1 day, 12:46:22<1 day, 12:41:58] +[titan] 2025-09-09 06:22:20,199 - root - INFO - step: 20025 loss: 2.8442 memory: 122.03GiB(87.57%) tps: 10,849 tflops: 517.07 mfu: 52.28% global_avg_ntp_loss: 0.8268 global_avg_top_loss: 2.0174 +[titan] 2025-09-09 06:22:20,200 - root - 
INFO - lr: 1.1125e-05 gnorm: 0.34 [1 day, 12:46:53<1 day, 12:41:22] +[titan] 2025-09-09 06:22:50,511 - root - INFO - step: 20030 loss: 2.7813 memory: 122.03GiB(87.57%) tps: 10,811 tflops: 515.23 mfu: 52.10% global_avg_ntp_loss: 0.7966 global_avg_top_loss: 1.9848 +[titan] 2025-09-09 06:22:50,511 - root - INFO - lr: 1.1121e-05 gnorm: 0.34 [1 day, 12:47:23<1 day, 12:40:46] +[titan] 2025-09-09 06:23:21,005 - root - INFO - step: 20035 loss: 3.3298 memory: 122.03GiB(87.57%) tps: 10,746 tflops: 512.14 mfu: 51.78% global_avg_ntp_loss: 1.1042 global_avg_top_loss: 2.2255 +[titan] 2025-09-09 06:23:21,006 - root - INFO - lr: 1.1117e-05 gnorm: 0.34 [1 day, 12:47:53<1 day, 12:40:10] +[titan] 2025-09-09 06:23:51,586 - root - INFO - step: 20040 loss: 2.8476 memory: 122.03GiB(87.57%) tps: 10,715 tflops: 510.69 mfu: 51.64% global_avg_ntp_loss: 0.8270 global_avg_top_loss: 2.0206 +[titan] 2025-09-09 06:23:51,587 - root - INFO - lr: 1.1114e-05 gnorm: 0.32 [1 day, 12:48:24<1 day, 12:39:35] +[titan] 2025-09-09 06:24:22,337 - root - INFO - step: 20045 loss: 2.7900 memory: 122.03GiB(87.57%) tps: 10,656 tflops: 507.87 mfu: 51.35% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9896 +[titan] 2025-09-09 06:24:22,338 - root - INFO - lr: 1.1110e-05 gnorm: 0.34 [1 day, 12:48:55<1 day, 12:39:00] +[titan] 2025-09-09 06:24:47,252 - root - INFO - [GC] Peforming periodical GC collection. 0.13 seconds. 
+[titan] 2025-09-09 06:24:53,455 - root - INFO - step: 20050 loss: 2.6039 memory: 122.03GiB(87.57%) tps: 10,531 tflops: 501.89 mfu: 50.75% global_avg_ntp_loss: 0.7205 global_avg_top_loss: 1.8833 +[titan] 2025-09-09 06:24:53,456 - root - INFO - lr: 1.1107e-05 gnorm: 0.33 [1 day, 12:49:26<1 day, 12:38:25] +[titan] 2025-09-09 06:25:24,419 - root - INFO - step: 20055 loss: 2.7943 memory: 122.03GiB(87.57%) tps: 10,583 tflops: 504.38 mfu: 51.00% global_avg_ntp_loss: 0.8078 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 06:25:24,419 - root - INFO - lr: 1.1103e-05 gnorm: 0.33 [1 day, 12:49:57<1 day, 12:37:49] +[titan] 2025-09-09 06:25:55,659 - root - INFO - step: 20060 loss: 2.8892 memory: 122.03GiB(87.57%) tps: 10,489 tflops: 499.92 mfu: 50.55% global_avg_ntp_loss: 0.8490 global_avg_top_loss: 2.0401 +[titan] 2025-09-09 06:25:55,659 - root - INFO - lr: 1.1100e-05 gnorm: 0.33 [1 day, 12:50:28<1 day, 12:37:15] +[titan] 2025-09-09 06:26:27,033 - root - INFO - step: 20065 loss: 2.8216 memory: 122.03GiB(87.57%) tps: 10,445 tflops: 497.78 mfu: 50.33% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0062 +[titan] 2025-09-09 06:26:27,033 - root - INFO - lr: 1.1096e-05 gnorm: 0.35 [1 day, 12:50:59<1 day, 12:36:40] +[titan] 2025-09-09 06:26:58,429 - root - INFO - step: 20070 loss: 2.7793 memory: 122.03GiB(87.57%) tps: 10,437 tflops: 497.43 mfu: 50.30% global_avg_ntp_loss: 0.7949 global_avg_top_loss: 1.9844 +[titan] 2025-09-09 06:26:58,429 - root - INFO - lr: 1.1092e-05 gnorm: 0.32 [1 day, 12:51:31<1 day, 12:36:05] +[titan] 2025-09-09 06:27:29,891 - root - INFO - step: 20075 loss: 2.7812 memory: 122.03GiB(87.57%) tps: 10,416 tflops: 496.40 mfu: 50.19% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9885 +[titan] 2025-09-09 06:27:29,891 - root - INFO - lr: 1.1089e-05 gnorm: 0.36 [1 day, 12:52:02<1 day, 12:35:30] +[titan] 2025-09-09 06:28:01,444 - root - INFO - step: 20080 loss: 2.7163 memory: 122.03GiB(87.57%) tps: 10,385 tflops: 494.96 mfu: 50.05% global_avg_ntp_loss: 0.7680 
global_avg_top_loss: 1.9483 +[titan] 2025-09-09 06:28:01,444 - root - INFO - lr: 1.1085e-05 gnorm: 0.35 [1 day, 12:52:34<1 day, 12:34:56] +[titan] 2025-09-09 06:28:32,958 - root - INFO - step: 20085 loss: 2.8179 memory: 122.03GiB(87.57%) tps: 10,398 tflops: 495.57 mfu: 50.11% global_avg_ntp_loss: 0.8203 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 06:28:32,959 - root - INFO - lr: 1.1082e-05 gnorm: 0.34 [1 day, 12:53:05<1 day, 12:34:21] +[titan] 2025-09-09 06:29:04,816 - root - INFO - step: 20090 loss: 2.7528 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7862 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 06:29:04,817 - root - INFO - lr: 1.1078e-05 gnorm: 0.33 [1 day, 12:53:37<1 day, 12:33:47] +[titan] 2025-09-09 06:29:36,424 - root - INFO - step: 20095 loss: 2.9881 memory: 122.03GiB(87.57%) tps: 10,367 tflops: 494.10 mfu: 49.96% global_avg_ntp_loss: 0.9035 global_avg_top_loss: 2.0846 +[titan] 2025-09-09 06:29:36,425 - root - INFO - lr: 1.1075e-05 gnorm: 0.36 [1 day, 12:54:09<1 day, 12:33:13] +[titan] 2025-09-09 06:30:01,842 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:30:08,210 - root - INFO - step: 20100 loss: 2.6944 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.33 mfu: 49.68% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9381 +[titan] 2025-09-09 06:30:08,211 - root - INFO - lr: 1.1071e-05 gnorm: 0.35 [1 day, 12:54:41<1 day, 12:32:38] +[titan] 2025-09-09 06:30:39,837 - root - INFO - step: 20105 loss: 2.8289 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.81 mfu: 49.93% global_avg_ntp_loss: 0.8182 global_avg_top_loss: 2.0108 +[titan] 2025-09-09 06:30:39,837 - root - INFO - lr: 1.1067e-05 gnorm: 0.33 [1 day, 12:55:12<1 day, 12:32:04] +[titan] 2025-09-09 06:31:11,551 - root - INFO - step: 20110 loss: 2.7379 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.45 mfu: 49.79% global_avg_ntp_loss: 0.7768 global_avg_top_loss: 1.9611 +[titan] 2025-09-09 06:31:11,551 - root - INFO - lr: 1.1064e-05 gnorm: 0.34 [1 day, 12:55:44<1 day, 12:31:29] +[titan] 2025-09-09 06:31:43,421 - root - INFO - step: 20115 loss: 3.2744 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.02 mfu: 49.55% global_avg_ntp_loss: 1.0718 global_avg_top_loss: 2.2025 +[titan] 2025-09-09 06:31:43,422 - root - INFO - lr: 1.1060e-05 gnorm: 0.34 [1 day, 12:56:16<1 day, 12:30:55] +[titan] 2025-09-09 06:32:15,207 - root - INFO - step: 20120 loss: 2.8005 memory: 122.03GiB(87.57%) tps: 10,309 tflops: 491.34 mfu: 49.68% global_avg_ntp_loss: 0.8055 global_avg_top_loss: 1.9950 +[titan] 2025-09-09 06:32:15,207 - root - INFO - lr: 1.1057e-05 gnorm: 0.32 [1 day, 12:56:47<1 day, 12:30:21] +[titan] 2025-09-09 06:32:47,105 - root - INFO - step: 20125 loss: 2.7452 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9654 +[titan] 2025-09-09 06:32:47,105 - root - INFO - lr: 1.1053e-05 gnorm: 0.33 [1 day, 12:57:19<1 day, 12:29:47] +[titan] 2025-09-09 06:33:18,816 - root - INFO - step: 20130 loss: 2.6240 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.49 mfu: 49.80% global_avg_ntp_loss: 0.7311 
global_avg_top_loss: 1.8929 +[titan] 2025-09-09 06:33:18,816 - root - INFO - lr: 1.1050e-05 gnorm: 0.33 [1 day, 12:57:51<1 day, 12:29:12] +[titan] 2025-09-09 06:33:50,801 - root - INFO - step: 20135 loss: 2.8719 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.26 mfu: 49.37% global_avg_ntp_loss: 0.8376 global_avg_top_loss: 2.0343 +[titan] 2025-09-09 06:33:50,802 - root - INFO - lr: 1.1046e-05 gnorm: 0.37 [1 day, 12:58:23<1 day, 12:28:38] +[titan] 2025-09-09 06:34:22,522 - root - INFO - step: 20140 loss: 2.7833 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.35 mfu: 49.78% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9881 +[titan] 2025-09-09 06:34:22,522 - root - INFO - lr: 1.1042e-05 gnorm: 0.35 [1 day, 12:58:55<1 day, 12:28:04] +[titan] 2025-09-09 06:34:54,407 - root - INFO - step: 20145 loss: 2.8903 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.8482 global_avg_top_loss: 2.0421 +[titan] 2025-09-09 06:34:54,408 - root - INFO - lr: 1.1039e-05 gnorm: 0.37 [1 day, 12:59:27<1 day, 12:27:30] +[titan] 2025-09-09 06:35:19,986 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:35:26,373 - root - INFO - step: 20150 loss: 2.9203 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.8794 global_avg_top_loss: 2.0409 +[titan] 2025-09-09 06:35:26,373 - root - INFO - lr: 1.1035e-05 gnorm: 0.37 [1 day, 12:59:59<1 day, 12:26:56] +[titan] 2025-09-09 06:35:58,179 - root - INFO - step: 20155 loss: 2.8514 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.8292 global_avg_top_loss: 2.0222 +[titan] 2025-09-09 06:35:58,180 - root - INFO - lr: 1.1032e-05 gnorm: 0.33 [1 day, 13:00:30<1 day, 12:26:21] +[titan] 2025-09-09 06:36:30,067 - root - INFO - step: 20160 loss: 2.5929 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.7032 global_avg_top_loss: 1.8897 +[titan] 2025-09-09 06:36:30,067 - root - INFO - lr: 1.1028e-05 gnorm: 0.76 [1 day, 13:01:02<1 day, 12:25:47] +[titan] 2025-09-09 06:37:01,877 - root - INFO - step: 20165 loss: 2.7879 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.8091 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 06:37:01,878 - root - INFO - lr: 1.1025e-05 gnorm: 0.41 [1 day, 13:01:34<1 day, 12:25:13] +[titan] 2025-09-09 06:37:33,811 - root - INFO - step: 20170 loss: 2.7937 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9924 +[titan] 2025-09-09 06:37:33,811 - root - INFO - lr: 1.1021e-05 gnorm: 0.52 [1 day, 13:02:06<1 day, 12:24:39] +[titan] 2025-09-09 06:38:05,708 - root - INFO - step: 20175 loss: 2.8139 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.8099 global_avg_top_loss: 2.0040 +[titan] 2025-09-09 06:38:05,709 - root - INFO - lr: 1.1017e-05 gnorm: 0.34 [1 day, 13:02:38<1 day, 12:24:04] +[titan] 2025-09-09 06:38:37,468 - root - INFO - step: 20180 loss: 2.8027 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.74 mfu: 49.72% global_avg_ntp_loss: 0.8096 
global_avg_top_loss: 1.9930 +[titan] 2025-09-09 06:38:37,468 - root - INFO - lr: 1.1014e-05 gnorm: 0.37 [1 day, 13:03:10<1 day, 12:23:30] +[titan] 2025-09-09 06:39:09,190 - root - INFO - step: 20185 loss: 2.7779 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.32 mfu: 49.78% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9826 +[titan] 2025-09-09 06:39:09,190 - root - INFO - lr: 1.1010e-05 gnorm: 0.36 [1 day, 13:03:41<1 day, 12:22:56] +[titan] 2025-09-09 06:39:41,087 - root - INFO - step: 20190 loss: 2.8220 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.8130 global_avg_top_loss: 2.0090 +[titan] 2025-09-09 06:39:41,087 - root - INFO - lr: 1.1007e-05 gnorm: 1.26 [1 day, 13:04:13<1 day, 12:22:22] +[titan] 2025-09-09 06:40:13,085 - root - INFO - step: 20195 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9636 +[titan] 2025-09-09 06:40:13,085 - root - INFO - lr: 1.1003e-05 gnorm: 0.37 [1 day, 13:04:45<1 day, 12:21:48] +[titan] 2025-09-09 06:40:38,715 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:40:45,093 - root - INFO - step: 20200 loss: 2.7984 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.8073 global_avg_top_loss: 1.9910 +[titan] 2025-09-09 06:40:45,094 - root - INFO - lr: 1.1000e-05 gnorm: 0.37 [1 day, 13:05:17<1 day, 12:21:13] +[titan] 2025-09-09 06:41:16,990 - root - INFO - step: 20205 loss: 2.8747 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.8392 global_avg_top_loss: 2.0355 +[titan] 2025-09-09 06:41:16,990 - root - INFO - lr: 1.0996e-05 gnorm: 0.35 [1 day, 13:05:49<1 day, 12:20:39] +[titan] 2025-09-09 06:41:48,710 - root - INFO - step: 20210 loss: 2.8652 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.35 mfu: 49.78% global_avg_ntp_loss: 0.8291 global_avg_top_loss: 2.0361 +[titan] 2025-09-09 06:41:48,711 - root - INFO - lr: 1.0993e-05 gnorm: 0.36 [1 day, 13:06:21<1 day, 12:20:05] +[titan] 2025-09-09 06:42:20,571 - root - INFO - step: 20215 loss: 2.9655 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.9175 global_avg_top_loss: 2.0480 +[titan] 2025-09-09 06:42:20,572 - root - INFO - lr: 1.0989e-05 gnorm: 0.35 [1 day, 13:06:53<1 day, 12:19:31] +[titan] 2025-09-09 06:42:52,277 - root - INFO - step: 20220 loss: 2.7629 memory: 122.03GiB(87.57%) tps: 10,336 tflops: 492.58 mfu: 49.81% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9764 +[titan] 2025-09-09 06:42:52,277 - root - INFO - lr: 1.0985e-05 gnorm: 0.34 [1 day, 13:07:25<1 day, 12:18:56] +[titan] 2025-09-09 06:43:24,073 - root - INFO - step: 20225 loss: 2.7892 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9857 +[titan] 2025-09-09 06:43:24,073 - root - INFO - lr: 1.0982e-05 gnorm: 0.36 [1 day, 13:07:56<1 day, 12:18:22] +[titan] 2025-09-09 06:43:56,140 - root - INFO - step: 20230 loss: 2.8211 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.8123 
global_avg_top_loss: 2.0087 +[titan] 2025-09-09 06:43:56,141 - root - INFO - lr: 1.0978e-05 gnorm: 0.34 [1 day, 13:08:28<1 day, 12:17:48] +[titan] 2025-09-09 06:44:27,809 - root - INFO - step: 20235 loss: 2.8625 memory: 122.03GiB(87.57%) tps: 10,347 tflops: 493.15 mfu: 49.86% global_avg_ntp_loss: 0.8326 global_avg_top_loss: 2.0299 +[titan] 2025-09-09 06:44:27,809 - root - INFO - lr: 1.0975e-05 gnorm: 0.33 [1 day, 13:09:00<1 day, 12:17:14] +[titan] 2025-09-09 06:44:59,720 - root - INFO - step: 20240 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 06:44:59,721 - root - INFO - lr: 1.0971e-05 gnorm: 0.49 [1 day, 13:09:32<1 day, 12:16:40] +[titan] 2025-09-09 06:45:31,632 - root - INFO - step: 20245 loss: 2.6970 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.7653 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 06:45:31,633 - root - INFO - lr: 1.0968e-05 gnorm: 0.37 [1 day, 13:10:04<1 day, 12:16:05] +[titan] 2025-09-09 06:45:57,235 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:46:03,715 - root - INFO - step: 20250 loss: 2.7878 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.8011 global_avg_top_loss: 1.9867 +[titan] 2025-09-09 06:46:03,716 - root - INFO - lr: 1.0964e-05 gnorm: 0.36 [1 day, 13:10:36<1 day, 12:15:31] +[titan] 2025-09-09 06:46:35,720 - root - INFO - step: 20255 loss: 2.8541 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.8296 global_avg_top_loss: 2.0246 +[titan] 2025-09-09 06:46:35,720 - root - INFO - lr: 1.0960e-05 gnorm: 0.37 [1 day, 13:11:08<1 day, 12:14:57] +[titan] 2025-09-09 06:47:07,630 - root - INFO - step: 20260 loss: 2.7089 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.7656 global_avg_top_loss: 1.9432 +[titan] 2025-09-09 06:47:07,630 - root - INFO - lr: 1.0957e-05 gnorm: 0.35 [1 day, 13:11:40<1 day, 12:14:23] +[titan] 2025-09-09 06:47:39,813 - root - INFO - step: 20265 loss: 2.6907 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9315 +[titan] 2025-09-09 06:47:39,813 - root - INFO - lr: 1.0953e-05 gnorm: 0.34 [1 day, 13:12:12<1 day, 12:13:49] +[titan] 2025-09-09 06:48:11,816 - root - INFO - step: 20270 loss: 2.8668 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.8402 global_avg_top_loss: 2.0267 +[titan] 2025-09-09 06:48:11,816 - root - INFO - lr: 1.0950e-05 gnorm: 0.33 [1 day, 13:12:44<1 day, 12:13:15] +[titan] 2025-09-09 06:48:44,044 - root - INFO - step: 20275 loss: 2.7622 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 0.8053 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 06:48:44,044 - root - INFO - lr: 1.0946e-05 gnorm: 0.41 [1 day, 13:13:16<1 day, 12:12:41] +[titan] 2025-09-09 06:49:16,114 - root - INFO - step: 20280 loss: 2.8485 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.8292 
global_avg_top_loss: 2.0193 +[titan] 2025-09-09 06:49:16,114 - root - INFO - lr: 1.0943e-05 gnorm: 0.39 [1 day, 13:13:48<1 day, 12:12:07] +[titan] 2025-09-09 06:49:48,067 - root - INFO - step: 20285 loss: 2.8332 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.8222 global_avg_top_loss: 2.0110 +[titan] 2025-09-09 06:49:48,068 - root - INFO - lr: 1.0939e-05 gnorm: 0.40 [1 day, 13:14:20<1 day, 12:11:33] +[titan] 2025-09-09 06:50:19,957 - root - INFO - step: 20290 loss: 2.6742 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.74 mfu: 49.52% global_avg_ntp_loss: 0.7495 global_avg_top_loss: 1.9247 +[titan] 2025-09-09 06:50:19,957 - root - INFO - lr: 1.0935e-05 gnorm: 0.37 [1 day, 13:14:52<1 day, 12:10:59] +[titan] 2025-09-09 06:50:51,969 - root - INFO - step: 20295 loss: 2.6844 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9306 +[titan] 2025-09-09 06:50:51,970 - root - INFO - lr: 1.0932e-05 gnorm: 0.37 [1 day, 13:15:24<1 day, 12:10:25] +[titan] 2025-09-09 06:51:17,413 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:51:23,905 - root - INFO - step: 20300 loss: 2.7561 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 06:51:23,905 - root - INFO - lr: 1.0928e-05 gnorm: 0.37 [1 day, 13:15:56<1 day, 12:09:51] +[titan] 2025-09-09 06:51:55,716 - root - INFO - step: 20305 loss: 2.7211 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.94 mfu: 49.64% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 06:51:55,717 - root - INFO - lr: 1.0925e-05 gnorm: 0.35 [1 day, 13:16:28<1 day, 12:09:17] +[titan] 2025-09-09 06:52:27,727 - root - INFO - step: 20310 loss: 2.7744 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7937 global_avg_top_loss: 1.9807 +[titan] 2025-09-09 06:52:27,727 - root - INFO - lr: 1.0921e-05 gnorm: 0.34 [1 day, 13:17:00<1 day, 12:08:43] +[titan] 2025-09-09 06:52:59,560 - root - INFO - step: 20315 loss: 3.2540 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 1.0623 global_avg_top_loss: 2.1917 +[titan] 2025-09-09 06:52:59,561 - root - INFO - lr: 1.0918e-05 gnorm: 0.38 [1 day, 13:17:32<1 day, 12:08:08] +[titan] 2025-09-09 06:53:31,551 - root - INFO - step: 20320 loss: 2.7771 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 06:53:31,551 - root - INFO - lr: 1.0914e-05 gnorm: 0.42 [1 day, 13:18:04<1 day, 12:07:34] +[titan] 2025-09-09 06:54:03,315 - root - INFO - step: 20325 loss: 2.8222 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.66 mfu: 49.71% global_avg_ntp_loss: 0.8125 global_avg_top_loss: 2.0098 +[titan] 2025-09-09 06:54:03,316 - root - INFO - lr: 1.0910e-05 gnorm: 0.44 [1 day, 13:18:36<1 day, 12:07:00] +[titan] 2025-09-09 06:54:35,253 - root - INFO - step: 20330 loss: 3.1607 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 1.0078 
global_avg_top_loss: 2.1529 +[titan] 2025-09-09 06:54:35,254 - root - INFO - lr: 1.0907e-05 gnorm: 0.37 [1 day, 13:19:07<1 day, 12:06:26] +[titan] 2025-09-09 06:55:07,024 - root - INFO - step: 20335 loss: 2.7996 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.58 mfu: 49.70% global_avg_ntp_loss: 0.8026 global_avg_top_loss: 1.9970 +[titan] 2025-09-09 06:55:07,024 - root - INFO - lr: 1.0903e-05 gnorm: 0.34 [1 day, 13:19:39<1 day, 12:05:52] +[titan] 2025-09-09 06:55:38,864 - root - INFO - step: 20340 loss: 2.7147 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9500 +[titan] 2025-09-09 06:55:38,864 - root - INFO - lr: 1.0900e-05 gnorm: 0.34 [1 day, 13:20:11<1 day, 12:05:18] +[titan] 2025-09-09 06:56:10,802 - root - INFO - step: 20345 loss: 2.8070 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.8076 global_avg_top_loss: 1.9994 +[titan] 2025-09-09 06:56:10,802 - root - INFO - lr: 1.0896e-05 gnorm: 0.33 [1 day, 13:20:43<1 day, 12:04:43] +[titan] 2025-09-09 06:56:36,147 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 06:56:42,482 - root - INFO - step: 20350 loss: 2.8160 memory: 122.03GiB(87.57%) tps: 10,344 tflops: 492.98 mfu: 49.85% global_avg_ntp_loss: 0.8141 global_avg_top_loss: 2.0020 +[titan] 2025-09-09 06:56:42,482 - root - INFO - lr: 1.0893e-05 gnorm: 0.34 [1 day, 13:21:15<1 day, 12:04:09] +[titan] 2025-09-09 06:57:14,496 - root - INFO - step: 20355 loss: 2.7763 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7943 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 06:57:14,496 - root - INFO - lr: 1.0889e-05 gnorm: 0.33 [1 day, 13:21:47<1 day, 12:03:35] +[titan] 2025-09-09 06:57:46,528 - root - INFO - step: 20360 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.7851 global_avg_top_loss: 1.9758 +[titan] 2025-09-09 06:57:46,529 - root - INFO - lr: 1.0885e-05 gnorm: 0.34 [1 day, 13:22:19<1 day, 12:03:01] +[titan] 2025-09-09 06:58:18,434 - root - INFO - step: 20365 loss: 2.8526 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.8304 global_avg_top_loss: 2.0222 +[titan] 2025-09-09 06:58:18,435 - root - INFO - lr: 1.0882e-05 gnorm: 0.35 [1 day, 13:22:51<1 day, 12:02:27] +[titan] 2025-09-09 06:58:50,498 - root - INFO - step: 20370 loss: 2.7777 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.07 mfu: 49.25% global_avg_ntp_loss: 0.8031 global_avg_top_loss: 1.9746 +[titan] 2025-09-09 06:58:50,499 - root - INFO - lr: 1.0878e-05 gnorm: 0.33 [1 day, 13:23:23<1 day, 12:01:53] +[titan] 2025-09-09 06:59:22,427 - root - INFO - step: 20375 loss: 2.6894 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9333 +[titan] 2025-09-09 06:59:22,427 - root - INFO - lr: 1.0875e-05 gnorm: 0.35 [1 day, 13:23:55<1 day, 12:01:19] +[titan] 2025-09-09 06:59:54,616 - root - INFO - step: 20380 loss: 2.7796 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.7999 
global_avg_top_loss: 1.9797 +[titan] 2025-09-09 06:59:54,616 - root - INFO - lr: 1.0871e-05 gnorm: 0.33 [1 day, 13:24:27<1 day, 12:00:45] +[titan] 2025-09-09 07:00:26,523 - root - INFO - step: 20385 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 07:00:26,523 - root - INFO - lr: 1.0868e-05 gnorm: 0.34 [1 day, 13:24:59<1 day, 12:00:11] +[titan] 2025-09-09 07:00:58,531 - root - INFO - step: 20390 loss: 3.0371 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.34% global_avg_ntp_loss: 0.9383 global_avg_top_loss: 2.0988 +[titan] 2025-09-09 07:00:58,531 - root - INFO - lr: 1.0864e-05 gnorm: 0.35 [1 day, 13:25:31<1 day, 11:59:37] +[titan] 2025-09-09 07:01:30,383 - root - INFO - step: 20395 loss: 3.2952 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 1.0824 global_avg_top_loss: 2.2127 +[titan] 2025-09-09 07:01:30,384 - root - INFO - lr: 1.0860e-05 gnorm: 0.39 [1 day, 13:26:03<1 day, 11:59:03] +[titan] 2025-09-09 07:01:55,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:02:02,212 - root - INFO - step: 20400 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9522 +[titan] 2025-09-09 07:02:02,212 - root - INFO - lr: 1.0857e-05 gnorm: 0.36 [1 day, 13:26:34<1 day, 11:58:28] +[titan] 2025-09-09 07:02:34,203 - root - INFO - step: 20405 loss: 2.8002 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8053 global_avg_top_loss: 1.9949 +[titan] 2025-09-09 07:02:34,204 - root - INFO - lr: 1.0853e-05 gnorm: 0.34 [1 day, 13:27:06<1 day, 11:57:54] +[titan] 2025-09-09 07:03:06,198 - root - INFO - step: 20410 loss: 2.8048 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9956 +[titan] 2025-09-09 07:03:06,198 - root - INFO - lr: 1.0850e-05 gnorm: 0.36 [1 day, 13:27:38<1 day, 11:57:20] +[titan] 2025-09-09 07:03:38,243 - root - INFO - step: 20415 loss: 2.7784 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9830 +[titan] 2025-09-09 07:03:38,244 - root - INFO - lr: 1.0846e-05 gnorm: 0.42 [1 day, 13:28:10<1 day, 11:56:46] +[titan] 2025-09-09 07:04:10,130 - root - INFO - step: 20420 loss: 2.7288 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.7734 global_avg_top_loss: 1.9554 +[titan] 2025-09-09 07:04:10,130 - root - INFO - lr: 1.0843e-05 gnorm: 0.34 [1 day, 13:28:42<1 day, 11:56:12] +[titan] 2025-09-09 07:04:42,052 - root - INFO - step: 20425 loss: 2.8111 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.8155 global_avg_top_loss: 1.9956 +[titan] 2025-09-09 07:04:42,053 - root - INFO - lr: 1.0839e-05 gnorm: 0.34 [1 day, 13:29:14<1 day, 11:55:38] +[titan] 2025-09-09 07:05:13,849 - root - INFO - step: 20430 loss: 2.7851 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.16 mfu: 49.66% global_avg_ntp_loss: 0.7994 
global_avg_top_loss: 1.9857 +[titan] 2025-09-09 07:05:13,850 - root - INFO - lr: 1.0835e-05 gnorm: 0.33 [1 day, 13:29:46<1 day, 11:55:04] +[titan] 2025-09-09 07:05:46,094 - root - INFO - step: 20435 loss: 3.1016 memory: 122.03GiB(87.57%) tps: 10,163 tflops: 484.34 mfu: 48.97% global_avg_ntp_loss: 0.9714 global_avg_top_loss: 2.1302 +[titan] 2025-09-09 07:05:46,094 - root - INFO - lr: 1.0832e-05 gnorm: 0.35 [1 day, 13:30:18<1 day, 11:54:30] +[titan] 2025-09-09 07:06:17,966 - root - INFO - step: 20440 loss: 2.8745 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 0.8395 global_avg_top_loss: 2.0351 +[titan] 2025-09-09 07:06:17,967 - root - INFO - lr: 1.0828e-05 gnorm: 0.34 [1 day, 13:30:50<1 day, 11:53:56] +[titan] 2025-09-09 07:06:50,158 - root - INFO - step: 20445 loss: 2.7692 memory: 122.03GiB(87.57%) tps: 10,179 tflops: 485.14 mfu: 49.05% global_avg_ntp_loss: 0.7921 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 07:06:50,158 - root - INFO - lr: 1.0825e-05 gnorm: 0.34 [1 day, 13:31:22<1 day, 11:53:22] +[titan] 2025-09-09 07:07:15,784 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:07:22,090 - root - INFO - step: 20450 loss: 2.7641 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.7916 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 07:07:22,091 - root - INFO - lr: 1.0821e-05 gnorm: 0.33 [1 day, 13:31:54<1 day, 11:52:48] +[titan] 2025-09-09 07:07:54,405 - root - INFO - step: 20455 loss: 2.6634 memory: 122.03GiB(87.57%) tps: 10,141 tflops: 483.29 mfu: 48.87% global_avg_ntp_loss: 0.7446 global_avg_top_loss: 1.9188 +[titan] 2025-09-09 07:07:54,405 - root - INFO - lr: 1.0818e-05 gnorm: 0.35 [1 day, 13:32:27<1 day, 11:52:14] +[titan] 2025-09-09 07:08:26,513 - root - INFO - step: 20460 loss: 2.8403 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0148 +[titan] 2025-09-09 07:08:26,513 - root - INFO - lr: 1.0814e-05 gnorm: 0.34 [1 day, 13:32:59<1 day, 11:51:40] +[titan] 2025-09-09 07:08:58,482 - root - INFO - step: 20465 loss: 2.7725 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9796 +[titan] 2025-09-09 07:08:58,483 - root - INFO - lr: 1.0810e-05 gnorm: 0.33 [1 day, 13:33:31<1 day, 11:51:06] +[titan] 2025-09-09 07:09:30,461 - root - INFO - step: 20470 loss: 2.7779 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7936 global_avg_top_loss: 1.9842 +[titan] 2025-09-09 07:09:30,462 - root - INFO - lr: 1.0807e-05 gnorm: 0.34 [1 day, 13:34:03<1 day, 11:50:32] +[titan] 2025-09-09 07:10:02,599 - root - INFO - step: 20475 loss: 3.2187 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 1.0490 global_avg_top_loss: 2.1697 +[titan] 2025-09-09 07:10:02,599 - root - INFO - lr: 1.0803e-05 gnorm: 0.36 [1 day, 13:34:35<1 day, 11:49:58] +[titan] 2025-09-09 07:10:35,023 - root - INFO - step: 20480 loss: 2.7769 memory: 122.03GiB(87.57%) tps: 10,106 tflops: 481.65 mfu: 48.70% global_avg_ntp_loss: 0.7952 
global_avg_top_loss: 1.9817 +[titan] 2025-09-09 07:10:35,024 - root - INFO - lr: 1.0800e-05 gnorm: 0.33 [1 day, 13:35:07<1 day, 11:49:25] +[titan] 2025-09-09 07:10:35,360 - root - INFO - Dumping profiler traces at step 20480 +[titan] 2025-09-09 07:10:35,414 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-09 07:11:07,257 - root - INFO - step: 20485 loss: 3.0455 memory: 122.03GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.9270 global_avg_top_loss: 2.1186 +[titan] 2025-09-09 07:11:07,258 - root - INFO - lr: 1.0796e-05 gnorm: 1.11 [1 day, 13:35:39<1 day, 11:48:51] +[titan] 2025-09-09 07:11:39,391 - root - INFO - step: 20490 loss: 3.0723 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.9626 global_avg_top_loss: 2.1097 +[titan] 2025-09-09 07:11:39,391 - root - INFO - lr: 1.0793e-05 gnorm: 0.36 [1 day, 13:36:12<1 day, 11:48:17] +[titan] 2025-09-09 07:12:11,223 - root - INFO - step: 20495 loss: 2.7660 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.63 mfu: 49.61% global_avg_ntp_loss: 0.7899 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 07:12:11,223 - root - INFO - lr: 1.0789e-05 gnorm: 0.35 [1 day, 13:36:43<1 day, 11:47:43] +[titan] 2025-09-09 07:12:36,467 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:12:42,946 - root - INFO - step: 20500 loss: 2.8090 memory: 122.03GiB(87.57%) tps: 10,330 tflops: 492.31 mfu: 49.78% global_avg_ntp_loss: 0.8068 global_avg_top_loss: 2.0023 +[titan] 2025-09-09 07:12:42,946 - root - INFO - lr: 1.0785e-05 gnorm: 0.36 [1 day, 13:37:15<1 day, 11:47:09] +[titan] 2025-09-09 07:13:14,870 - root - INFO - step: 20505 loss: 2.7942 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9908 +[titan] 2025-09-09 07:13:14,871 - root - INFO - lr: 1.0782e-05 gnorm: 0.34 [1 day, 13:37:47<1 day, 11:46:34] +[titan] 2025-09-09 07:13:46,506 - root - INFO - step: 20510 loss: 3.1287 memory: 122.03GiB(87.57%) tps: 10,358 tflops: 493.66 mfu: 49.92% global_avg_ntp_loss: 0.9606 global_avg_top_loss: 2.1681 +[titan] 2025-09-09 07:13:46,507 - root - INFO - lr: 1.0778e-05 gnorm: 0.66 [1 day, 13:38:19<1 day, 11:46:00] +[titan] 2025-09-09 07:14:18,461 - root - INFO - step: 20515 loss: 2.7824 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7989 global_avg_top_loss: 1.9835 +[titan] 2025-09-09 07:14:18,461 - root - INFO - lr: 1.0775e-05 gnorm: 0.34 [1 day, 13:38:51<1 day, 11:45:26] +[titan] 2025-09-09 07:14:50,164 - root - INFO - step: 20520 loss: 2.7884 memory: 122.03GiB(87.57%) tps: 10,336 tflops: 492.61 mfu: 49.81% global_avg_ntp_loss: 0.7993 global_avg_top_loss: 1.9891 +[titan] 2025-09-09 07:14:50,164 - root - INFO - lr: 1.0771e-05 gnorm: 0.38 [1 day, 13:39:22<1 day, 11:44:52] +[titan] 2025-09-09 07:15:22,156 - root - INFO - step: 20525 loss: 2.7545 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7855 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 07:15:22,157 - root - INFO - lr: 1.0768e-05 gnorm: 0.36 [1 day, 13:39:54<1 day, 11:44:18] +[titan] 2025-09-09 07:15:53,886 - root - INFO - step: 20530 loss: 2.8623 memory: 122.03GiB(87.57%) tps: 10,328 tflops: 492.21 mfu: 49.77% global_avg_ntp_loss: 0.8337 
global_avg_top_loss: 2.0287 +[titan] 2025-09-09 07:15:53,886 - root - INFO - lr: 1.0764e-05 gnorm: 0.37 [1 day, 13:40:26<1 day, 11:43:43] +[titan] 2025-09-09 07:16:25,766 - root - INFO - step: 20535 loss: 2.6397 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7316 global_avg_top_loss: 1.9082 +[titan] 2025-09-09 07:16:25,767 - root - INFO - lr: 1.0760e-05 gnorm: 0.40 [1 day, 13:40:58<1 day, 11:43:09] +[titan] 2025-09-09 07:16:57,719 - root - INFO - step: 20540 loss: 2.7389 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 07:16:57,720 - root - INFO - lr: 1.0757e-05 gnorm: 0.34 [1 day, 13:41:30<1 day, 11:42:35] +[titan] 2025-09-09 07:17:29,790 - root - INFO - step: 20545 loss: 2.7142 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.98 mfu: 49.24% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 07:17:29,790 - root - INFO - lr: 1.0753e-05 gnorm: 0.34 [1 day, 13:42:02<1 day, 11:42:01] +[titan] 2025-09-09 07:17:55,293 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:18:01,633 - root - INFO - step: 20550 loss: 2.7682 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.45 mfu: 49.59% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 07:18:01,633 - root - INFO - lr: 1.0750e-05 gnorm: 0.34 [1 day, 13:42:34<1 day, 11:41:27] +[titan] 2025-09-09 07:18:33,495 - root - INFO - step: 20555 loss: 3.2850 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 1.0778 global_avg_top_loss: 2.2072 +[titan] 2025-09-09 07:18:33,495 - root - INFO - lr: 1.0746e-05 gnorm: 0.39 [1 day, 13:43:06<1 day, 11:40:53] +[titan] 2025-09-09 07:19:05,267 - root - INFO - step: 20560 loss: 2.8272 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.54 mfu: 49.70% global_avg_ntp_loss: 0.8203 global_avg_top_loss: 2.0069 +[titan] 2025-09-09 07:19:05,267 - root - INFO - lr: 1.0743e-05 gnorm: 0.37 [1 day, 13:43:37<1 day, 11:40:19] +[titan] 2025-09-09 07:19:37,135 - root - INFO - step: 20565 loss: 2.7941 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.8025 global_avg_top_loss: 1.9916 +[titan] 2025-09-09 07:19:37,135 - root - INFO - lr: 1.0739e-05 gnorm: 0.35 [1 day, 13:44:09<1 day, 11:39:45] +[titan] 2025-09-09 07:20:09,312 - root - INFO - step: 20570 loss: 2.9126 memory: 122.03GiB(87.57%) tps: 10,184 tflops: 485.36 mfu: 49.08% global_avg_ntp_loss: 0.8736 global_avg_top_loss: 2.0389 +[titan] 2025-09-09 07:20:09,312 - root - INFO - lr: 1.0736e-05 gnorm: 0.38 [1 day, 13:44:42<1 day, 11:39:11] +[titan] 2025-09-09 07:20:41,308 - root - INFO - step: 20575 loss: 2.7576 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 07:20:41,309 - root - INFO - lr: 1.0732e-05 gnorm: 0.36 [1 day, 13:45:14<1 day, 11:38:37] +[titan] 2025-09-09 07:21:13,132 - root - INFO - step: 20580 loss: 2.8376 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.75 mfu: 49.62% global_avg_ntp_loss: 0.8256 
global_avg_top_loss: 2.0120 +[titan] 2025-09-09 07:21:13,133 - root - INFO - lr: 1.0728e-05 gnorm: 0.34 [1 day, 13:45:45<1 day, 11:38:03] +[titan] 2025-09-09 07:21:45,124 - root - INFO - step: 20585 loss: 2.8509 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.8310 global_avg_top_loss: 2.0199 +[titan] 2025-09-09 07:21:45,124 - root - INFO - lr: 1.0725e-05 gnorm: 0.34 [1 day, 13:46:17<1 day, 11:37:29] +[titan] 2025-09-09 07:22:16,997 - root - INFO - step: 20590 loss: 3.1311 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.9648 global_avg_top_loss: 2.1662 +[titan] 2025-09-09 07:22:16,998 - root - INFO - lr: 1.0721e-05 gnorm: 0.69 [1 day, 13:46:49<1 day, 11:36:55] +[titan] 2025-09-09 07:22:49,017 - root - INFO - step: 20595 loss: 2.7975 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.8041 global_avg_top_loss: 1.9933 +[titan] 2025-09-09 07:22:49,017 - root - INFO - lr: 1.0718e-05 gnorm: 0.36 [1 day, 13:47:21<1 day, 11:36:21] +[titan] 2025-09-09 07:23:14,280 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:23:20,736 - root - INFO - step: 20600 loss: 2.7198 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.37 mfu: 49.78% global_avg_ntp_loss: 0.7719 global_avg_top_loss: 1.9479 +[titan] 2025-09-09 07:23:20,737 - root - INFO - lr: 1.0714e-05 gnorm: 0.35 [1 day, 13:47:53<1 day, 11:35:46] +[titan] 2025-09-09 07:23:52,653 - root - INFO - step: 20605 loss: 2.6982 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.48% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 07:23:52,654 - root - INFO - lr: 1.0711e-05 gnorm: 0.36 [1 day, 13:48:25<1 day, 11:35:12] +[titan] 2025-09-09 07:24:24,493 - root - INFO - step: 20610 loss: 2.6344 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7300 global_avg_top_loss: 1.9045 +[titan] 2025-09-09 07:24:24,493 - root - INFO - lr: 1.0707e-05 gnorm: 0.35 [1 day, 13:48:57<1 day, 11:34:38] +[titan] 2025-09-09 07:24:56,361 - root - INFO - step: 20615 loss: 2.8545 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.8316 global_avg_top_loss: 2.0229 +[titan] 2025-09-09 07:24:56,361 - root - INFO - lr: 1.0703e-05 gnorm: 0.36 [1 day, 13:49:29<1 day, 11:34:04] +[titan] 2025-09-09 07:25:28,166 - root - INFO - step: 20620 loss: 2.7952 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.8033 global_avg_top_loss: 1.9919 +[titan] 2025-09-09 07:25:28,167 - root - INFO - lr: 1.0700e-05 gnorm: 0.35 [1 day, 13:50:00<1 day, 11:33:30] +[titan] 2025-09-09 07:26:00,181 - root - INFO - step: 20625 loss: 2.7567 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 07:26:00,182 - root - INFO - lr: 1.0696e-05 gnorm: 0.36 [1 day, 13:50:32<1 day, 11:32:56] +[titan] 2025-09-09 07:26:32,202 - root - INFO - step: 20630 loss: 3.6945 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 1.3172 
global_avg_top_loss: 2.3773 +[titan] 2025-09-09 07:26:32,202 - root - INFO - lr: 1.0693e-05 gnorm: 0.34 [1 day, 13:51:04<1 day, 11:32:22] +[titan] 2025-09-09 07:27:04,024 - root - INFO - step: 20635 loss: 3.1787 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.77 mfu: 49.62% global_avg_ntp_loss: 1.0293 global_avg_top_loss: 2.1493 +[titan] 2025-09-09 07:27:04,024 - root - INFO - lr: 1.0689e-05 gnorm: 0.49 [1 day, 13:51:36<1 day, 11:31:48] +[titan] 2025-09-09 07:27:35,971 - root - INFO - step: 20640 loss: 2.8221 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.8122 global_avg_top_loss: 2.0099 +[titan] 2025-09-09 07:27:35,972 - root - INFO - lr: 1.0686e-05 gnorm: 0.33 [1 day, 13:52:08<1 day, 11:31:14] +[titan] 2025-09-09 07:28:08,044 - root - INFO - step: 20645 loss: 2.7387 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 07:28:08,044 - root - INFO - lr: 1.0682e-05 gnorm: 0.33 [1 day, 13:52:40<1 day, 11:30:40] +[titan] 2025-09-09 07:28:33,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:28:39,993 - root - INFO - step: 20650 loss: 2.8459 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0204 +[titan] 2025-09-09 07:28:39,993 - root - INFO - lr: 1.0678e-05 gnorm: 0.38 [1 day, 13:53:12<1 day, 11:30:06] +[titan] 2025-09-09 07:29:11,711 - root - INFO - step: 20655 loss: 2.9157 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.37 mfu: 49.79% global_avg_ntp_loss: 0.8784 global_avg_top_loss: 2.0373 +[titan] 2025-09-09 07:29:11,712 - root - INFO - lr: 1.0675e-05 gnorm: 0.38 [1 day, 13:53:44<1 day, 11:29:31] +[titan] 2025-09-09 07:29:43,746 - root - INFO - step: 20660 loss: 2.7643 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 07:29:43,746 - root - INFO - lr: 1.0671e-05 gnorm: 0.38 [1 day, 13:54:16<1 day, 11:28:58] +[titan] 2025-09-09 07:30:15,581 - root - INFO - step: 20665 loss: 2.8318 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.8232 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 07:30:15,581 - root - INFO - lr: 1.0668e-05 gnorm: 0.35 [1 day, 13:54:48<1 day, 11:28:23] +[titan] 2025-09-09 07:30:47,813 - root - INFO - step: 20670 loss: 2.7860 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.53 mfu: 48.99% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9870 +[titan] 2025-09-09 07:30:47,813 - root - INFO - lr: 1.0664e-05 gnorm: 0.40 [1 day, 13:55:20<1 day, 11:27:50] +[titan] 2025-09-09 07:31:20,089 - root - INFO - step: 20675 loss: 3.0578 memory: 122.03GiB(87.57%) tps: 10,153 tflops: 483.87 mfu: 48.93% global_avg_ntp_loss: 0.9446 global_avg_top_loss: 2.1133 +[titan] 2025-09-09 07:31:20,089 - root - INFO - lr: 1.0661e-05 gnorm: 0.38 [1 day, 13:55:52<1 day, 11:27:16] +[titan] 2025-09-09 07:31:52,030 - root - INFO - step: 20680 loss: 2.8441 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.8248 
global_avg_top_loss: 2.0194 +[titan] 2025-09-09 07:31:52,030 - root - INFO - lr: 1.0657e-05 gnorm: 0.36 [1 day, 13:56:24<1 day, 11:26:42] +[titan] 2025-09-09 07:32:23,838 - root - INFO - step: 20685 loss: 2.8750 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 0.8328 global_avg_top_loss: 2.0422 +[titan] 2025-09-09 07:32:23,839 - root - INFO - lr: 1.0653e-05 gnorm: 1.13 [1 day, 13:56:56<1 day, 11:26:08] +[titan] 2025-09-09 07:32:55,829 - root - INFO - step: 20690 loss: 2.7702 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.19 mfu: 49.36% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9712 +[titan] 2025-09-09 07:32:55,829 - root - INFO - lr: 1.0650e-05 gnorm: 0.39 [1 day, 13:57:28<1 day, 11:25:34] +[titan] 2025-09-09 07:33:27,566 - root - INFO - step: 20695 loss: 2.7366 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.09 mfu: 49.76% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 07:33:27,566 - root - INFO - lr: 1.0646e-05 gnorm: 0.36 [1 day, 13:58:00<1 day, 11:24:59] +[titan] 2025-09-09 07:33:53,012 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:33:59,395 - root - INFO - step: 20700 loss: 2.9222 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.8573 global_avg_top_loss: 2.0649 +[titan] 2025-09-09 07:33:59,396 - root - INFO - lr: 1.0643e-05 gnorm: 0.33 [1 day, 13:58:32<1 day, 11:24:25] +[titan] 2025-09-09 07:34:31,149 - root - INFO - step: 20705 loss: 2.7591 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.84 mfu: 49.73% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 07:34:31,149 - root - INFO - lr: 1.0639e-05 gnorm: 0.47 [1 day, 13:59:03<1 day, 11:23:51] +[titan] 2025-09-09 07:35:03,021 - root - INFO - step: 20710 loss: 3.3606 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 1.1187 global_avg_top_loss: 2.2419 +[titan] 2025-09-09 07:35:03,021 - root - INFO - lr: 1.0636e-05 gnorm: 0.34 [1 day, 13:59:35<1 day, 11:23:17] +[titan] 2025-09-09 07:35:35,006 - root - INFO - step: 20715 loss: 2.7429 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.26 mfu: 49.37% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 07:35:35,007 - root - INFO - lr: 1.0632e-05 gnorm: 0.33 [1 day, 14:00:07<1 day, 11:22:43] +[titan] 2025-09-09 07:36:06,716 - root - INFO - step: 20720 loss: 2.8326 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.52 mfu: 49.80% global_avg_ntp_loss: 0.8175 global_avg_top_loss: 2.0150 +[titan] 2025-09-09 07:36:06,716 - root - INFO - lr: 1.0628e-05 gnorm: 0.34 [1 day, 14:00:39<1 day, 11:22:09] +[titan] 2025-09-09 07:36:38,798 - root - INFO - step: 20725 loss: 2.8353 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.8213 global_avg_top_loss: 2.0140 +[titan] 2025-09-09 07:36:38,798 - root - INFO - lr: 1.0625e-05 gnorm: 0.33 [1 day, 14:01:11<1 day, 11:21:35] +[titan] 2025-09-09 07:37:10,652 - root - INFO - step: 20730 loss: 2.8775 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.28 mfu: 49.57% global_avg_ntp_loss: 0.8411 
global_avg_top_loss: 2.0364 +[titan] 2025-09-09 07:37:10,652 - root - INFO - lr: 1.0621e-05 gnorm: 0.36 [1 day, 14:01:43<1 day, 11:21:01] +[titan] 2025-09-09 07:37:42,842 - root - INFO - step: 20735 loss: 2.9359 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.8688 global_avg_top_loss: 2.0671 +[titan] 2025-09-09 07:37:42,842 - root - INFO - lr: 1.0618e-05 gnorm: 0.44 [1 day, 14:02:15<1 day, 11:20:27] +[titan] 2025-09-09 07:38:14,599 - root - INFO - step: 20740 loss: 2.7190 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.77 mfu: 49.72% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9513 +[titan] 2025-09-09 07:38:14,599 - root - INFO - lr: 1.0614e-05 gnorm: 0.45 [1 day, 14:02:47<1 day, 11:19:53] +[titan] 2025-09-09 07:38:46,380 - root - INFO - step: 20745 loss: 2.8439 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.41 mfu: 49.69% global_avg_ntp_loss: 0.8215 global_avg_top_loss: 2.0224 +[titan] 2025-09-09 07:38:46,381 - root - INFO - lr: 1.0611e-05 gnorm: 0.35 [1 day, 14:03:19<1 day, 11:19:19] +[titan] 2025-09-09 07:39:11,961 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:39:18,414 - root - INFO - step: 20750 loss: 2.7640 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.53 mfu: 49.29% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 07:39:18,415 - root - INFO - lr: 1.0607e-05 gnorm: 0.35 [1 day, 14:03:51<1 day, 11:18:45] +[titan] 2025-09-09 07:39:50,347 - root - INFO - step: 20755 loss: 2.7815 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 07:39:50,348 - root - INFO - lr: 1.0604e-05 gnorm: 0.44 [1 day, 14:04:23<1 day, 11:18:11] +[titan] 2025-09-09 07:40:22,425 - root - INFO - step: 20760 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 1.9975 +[titan] 2025-09-09 07:40:22,426 - root - INFO - lr: 1.0600e-05 gnorm: 0.34 [1 day, 14:04:55<1 day, 11:17:37] +[titan] 2025-09-09 07:40:54,274 - root - INFO - step: 20765 loss: 2.8028 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.37 mfu: 49.58% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9949 +[titan] 2025-09-09 07:40:54,274 - root - INFO - lr: 1.0596e-05 gnorm: 0.34 [1 day, 14:05:26<1 day, 11:17:03] +[titan] 2025-09-09 07:41:26,612 - root - INFO - step: 20770 loss: 2.8227 memory: 122.03GiB(87.57%) tps: 10,133 tflops: 482.94 mfu: 48.83% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0073 +[titan] 2025-09-09 07:41:26,612 - root - INFO - lr: 1.0593e-05 gnorm: 0.37 [1 day, 14:05:59<1 day, 11:16:29] +[titan] 2025-09-09 07:41:58,615 - root - INFO - step: 20775 loss: 2.8022 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 07:41:58,615 - root - INFO - lr: 1.0589e-05 gnorm: 0.35 [1 day, 14:06:31<1 day, 11:15:55] +[titan] 2025-09-09 07:42:30,425 - root - INFO - step: 20780 loss: 2.9815 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.9007 
global_avg_top_loss: 2.0808 +[titan] 2025-09-09 07:42:30,426 - root - INFO - lr: 1.0586e-05 gnorm: 0.34 [1 day, 14:07:03<1 day, 11:15:21] +[titan] 2025-09-09 07:43:02,471 - root - INFO - step: 20785 loss: 2.6697 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9141 +[titan] 2025-09-09 07:43:02,471 - root - INFO - lr: 1.0582e-05 gnorm: 0.39 [1 day, 14:07:35<1 day, 11:14:47] +[titan] 2025-09-09 07:43:34,584 - root - INFO - step: 20790 loss: 3.1640 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 1.0273 global_avg_top_loss: 2.1367 +[titan] 2025-09-09 07:43:34,584 - root - INFO - lr: 1.0579e-05 gnorm: 0.37 [1 day, 14:08:07<1 day, 11:14:13] +[titan] 2025-09-09 07:44:06,509 - root - INFO - step: 20795 loss: 3.0651 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.9439 global_avg_top_loss: 2.1211 +[titan] 2025-09-09 07:44:06,509 - root - INFO - lr: 1.0575e-05 gnorm: 0.34 [1 day, 14:08:39<1 day, 11:13:39] +[titan] 2025-09-09 07:44:32,015 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:44:38,368 - root - INFO - step: 20800 loss: 2.6540 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.20 mfu: 49.57% global_avg_ntp_loss: 0.7441 global_avg_top_loss: 1.9100 +[titan] 2025-09-09 07:44:38,368 - root - INFO - lr: 1.0571e-05 gnorm: 0.34 [1 day, 14:09:11<1 day, 11:13:05] +[titan] 2025-09-09 07:45:10,207 - root - INFO - step: 20805 loss: 2.7972 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.51 mfu: 49.60% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 07:45:10,207 - root - INFO - lr: 1.0568e-05 gnorm: 0.35 [1 day, 14:09:42<1 day, 11:12:31] +[titan] 2025-09-09 07:45:41,946 - root - INFO - step: 20810 loss: 2.7258 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.06 mfu: 49.75% global_avg_ntp_loss: 0.7687 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 07:45:41,946 - root - INFO - lr: 1.0564e-05 gnorm: 0.36 [1 day, 14:10:14<1 day, 11:11:57] +[titan] 2025-09-09 07:46:13,891 - root - INFO - step: 20815 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 07:46:13,891 - root - INFO - lr: 1.0561e-05 gnorm: 0.40 [1 day, 14:10:46<1 day, 11:11:23] +[titan] 2025-09-09 07:46:46,093 - root - INFO - step: 20820 loss: 2.7721 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.98 mfu: 49.04% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9811 +[titan] 2025-09-09 07:46:46,093 - root - INFO - lr: 1.0557e-05 gnorm: 0.36 [1 day, 14:11:18<1 day, 11:10:49] +[titan] 2025-09-09 07:47:18,042 - root - INFO - step: 20825 loss: 2.7175 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9481 +[titan] 2025-09-09 07:47:18,043 - root - INFO - lr: 1.0554e-05 gnorm: 0.35 [1 day, 14:11:50<1 day, 11:10:15] +[titan] 2025-09-09 07:47:50,006 - root - INFO - step: 20830 loss: 2.8856 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.8411 
global_avg_top_loss: 2.0445 +[titan] 2025-09-09 07:47:50,007 - root - INFO - lr: 1.0550e-05 gnorm: 0.35 [1 day, 14:12:22<1 day, 11:09:41] +[titan] 2025-09-09 07:48:21,884 - root - INFO - step: 20835 loss: 2.7359 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9587 +[titan] 2025-09-09 07:48:21,884 - root - INFO - lr: 1.0546e-05 gnorm: 0.35 [1 day, 14:12:54<1 day, 11:09:07] +[titan] 2025-09-09 07:48:53,834 - root - INFO - step: 20840 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7969 global_avg_top_loss: 1.9825 +[titan] 2025-09-09 07:48:53,835 - root - INFO - lr: 1.0543e-05 gnorm: 0.33 [1 day, 14:13:26<1 day, 11:08:33] +[titan] 2025-09-09 07:49:25,664 - root - INFO - step: 20845 loss: 2.7689 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7916 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 07:49:25,665 - root - INFO - lr: 1.0539e-05 gnorm: 0.33 [1 day, 14:13:58<1 day, 11:07:59] +[titan] 2025-09-09 07:49:51,116 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:49:57,539 - root - INFO - step: 20850 loss: 2.9739 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.8858 global_avg_top_loss: 2.0881 +[titan] 2025-09-09 07:49:57,539 - root - INFO - lr: 1.0536e-05 gnorm: 1.04 [1 day, 14:14:30<1 day, 11:07:25] +[titan] 2025-09-09 07:50:29,472 - root - INFO - step: 20855 loss: 2.9238 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.8636 global_avg_top_loss: 2.0602 +[titan] 2025-09-09 07:50:29,473 - root - INFO - lr: 1.0532e-05 gnorm: 0.77 [1 day, 14:15:02<1 day, 11:06:51] +[titan] 2025-09-09 07:51:01,615 - root - INFO - step: 20860 loss: 2.7706 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.87 mfu: 49.13% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9753 +[titan] 2025-09-09 07:51:01,616 - root - INFO - lr: 1.0529e-05 gnorm: 0.37 [1 day, 14:15:34<1 day, 11:06:17] +[titan] 2025-09-09 07:51:33,435 - root - INFO - step: 20865 loss: 2.7579 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 07:51:33,435 - root - INFO - lr: 1.0525e-05 gnorm: 0.34 [1 day, 14:16:06<1 day, 11:05:43] +[titan] 2025-09-09 07:52:05,439 - root - INFO - step: 20870 loss: 2.9183 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.8903 global_avg_top_loss: 2.0279 +[titan] 2025-09-09 07:52:05,440 - root - INFO - lr: 1.0522e-05 gnorm: 0.41 [1 day, 14:16:38<1 day, 11:05:09] +[titan] 2025-09-09 07:52:37,550 - root - INFO - step: 20875 loss: 2.6673 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9211 +[titan] 2025-09-09 07:52:37,550 - root - INFO - lr: 1.0518e-05 gnorm: 0.49 [1 day, 14:17:10<1 day, 11:04:35] +[titan] 2025-09-09 07:53:09,605 - root - INFO - step: 20880 loss: 2.6185 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.21 mfu: 49.26% global_avg_ntp_loss: 0.7242 
global_avg_top_loss: 1.8943 +[titan] 2025-09-09 07:53:09,605 - root - INFO - lr: 1.0514e-05 gnorm: 0.34 [1 day, 14:17:42<1 day, 11:04:01] +[titan] 2025-09-09 07:53:41,434 - root - INFO - step: 20885 loss: 2.7658 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9751 +[titan] 2025-09-09 07:53:41,435 - root - INFO - lr: 1.0511e-05 gnorm: 0.33 [1 day, 14:18:14<1 day, 11:03:27] +[titan] 2025-09-09 07:54:13,278 - root - INFO - step: 20890 loss: 2.6896 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9340 +[titan] 2025-09-09 07:54:13,279 - root - INFO - lr: 1.0507e-05 gnorm: 0.40 [1 day, 14:18:45<1 day, 11:02:53] +[titan] 2025-09-09 07:54:45,332 - root - INFO - step: 20895 loss: 2.7992 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.26% global_avg_ntp_loss: 0.8024 global_avg_top_loss: 1.9968 +[titan] 2025-09-09 07:54:45,332 - root - INFO - lr: 1.0504e-05 gnorm: 0.48 [1 day, 14:19:17<1 day, 11:02:19] +[titan] 2025-09-09 07:55:10,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 07:55:17,281 - root - INFO - step: 20900 loss: 2.7383 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7758 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 07:55:17,281 - root - INFO - lr: 1.0500e-05 gnorm: 0.34 [1 day, 14:19:49<1 day, 11:01:45] +[titan] 2025-09-09 07:55:49,358 - root - INFO - step: 20905 loss: 2.8631 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.8373 global_avg_top_loss: 2.0258 +[titan] 2025-09-09 07:55:49,359 - root - INFO - lr: 1.0497e-05 gnorm: 0.37 [1 day, 14:20:21<1 day, 11:01:11] +[titan] 2025-09-09 07:56:21,199 - root - INFO - step: 20910 loss: 2.8960 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.8515 global_avg_top_loss: 2.0445 +[titan] 2025-09-09 07:56:21,199 - root - INFO - lr: 1.0493e-05 gnorm: 0.36 [1 day, 14:20:53<1 day, 11:00:37] +[titan] 2025-09-09 07:56:53,363 - root - INFO - step: 20915 loss: 2.8280 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.55 mfu: 49.10% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 2.0103 +[titan] 2025-09-09 07:56:53,364 - root - INFO - lr: 1.0489e-05 gnorm: 0.35 [1 day, 14:21:25<1 day, 11:00:03] +[titan] 2025-09-09 07:57:25,296 - root - INFO - step: 20920 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.7890 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 07:57:25,297 - root - INFO - lr: 1.0486e-05 gnorm: 0.34 [1 day, 14:21:57<1 day, 10:59:29] +[titan] 2025-09-09 07:57:57,248 - root - INFO - step: 20925 loss: 2.8015 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.8027 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 07:57:57,248 - root - INFO - lr: 1.0482e-05 gnorm: 0.35 [1 day, 14:22:29<1 day, 10:58:55] +[titan] 2025-09-09 07:58:29,155 - root - INFO - step: 20930 loss: 2.9265 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.8656 
global_avg_top_loss: 2.0609 +[titan] 2025-09-09 07:58:29,156 - root - INFO - lr: 1.0479e-05 gnorm: 0.76 [1 day, 14:23:01<1 day, 10:58:21] +[titan] 2025-09-09 07:59:01,038 - root - INFO - step: 20935 loss: 3.2675 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 1.0719 global_avg_top_loss: 2.1956 +[titan] 2025-09-09 07:59:01,039 - root - INFO - lr: 1.0475e-05 gnorm: 0.36 [1 day, 14:23:33<1 day, 10:57:47] +[titan] 2025-09-09 07:59:32,951 - root - INFO - step: 20940 loss: 2.8089 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 2.0018 +[titan] 2025-09-09 07:59:32,951 - root - INFO - lr: 1.0472e-05 gnorm: 0.33 [1 day, 14:24:05<1 day, 10:57:13] +[titan] 2025-09-09 08:00:04,740 - root - INFO - step: 20945 loss: 2.7405 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.28 mfu: 49.67% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 08:00:04,740 - root - INFO - lr: 1.0468e-05 gnorm: 0.35 [1 day, 14:24:37<1 day, 10:56:39] +[titan] 2025-09-09 08:00:30,312 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:00:36,668 - root - INFO - step: 20950 loss: 2.6918 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9341 +[titan] 2025-09-09 08:00:36,668 - root - INFO - lr: 1.0464e-05 gnorm: 0.41 [1 day, 14:25:09<1 day, 10:56:05] +[titan] 2025-09-09 08:01:08,427 - root - INFO - step: 20955 loss: 2.6915 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.74 mfu: 49.72% global_avg_ntp_loss: 0.7568 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 08:01:08,428 - root - INFO - lr: 1.0461e-05 gnorm: 0.41 [1 day, 14:25:41<1 day, 10:55:31] +[titan] 2025-09-09 08:01:40,171 - root - INFO - step: 20960 loss: 2.8383 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.99 mfu: 49.75% global_avg_ntp_loss: 0.8246 global_avg_top_loss: 2.0137 +[titan] 2025-09-09 08:01:40,172 - root - INFO - lr: 1.0457e-05 gnorm: 0.42 [1 day, 14:26:12<1 day, 10:54:57] +[titan] 2025-09-09 08:02:11,879 - root - INFO - step: 20965 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,335 tflops: 492.54 mfu: 49.80% global_avg_ntp_loss: 0.8007 global_avg_top_loss: 1.9876 +[titan] 2025-09-09 08:02:11,880 - root - INFO - lr: 1.0454e-05 gnorm: 0.35 [1 day, 14:26:44<1 day, 10:54:23] +[titan] 2025-09-09 08:02:43,657 - root - INFO - step: 20970 loss: 2.7243 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.46 mfu: 49.69% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9527 +[titan] 2025-09-09 08:02:43,658 - root - INFO - lr: 1.0450e-05 gnorm: 0.35 [1 day, 14:27:16<1 day, 10:53:49] +[titan] 2025-09-09 08:03:15,701 - root - INFO - step: 20975 loss: 2.8051 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.8077 global_avg_top_loss: 1.9973 +[titan] 2025-09-09 08:03:15,702 - root - INFO - lr: 1.0447e-05 gnorm: 0.61 [1 day, 14:27:48<1 day, 10:53:15] +[titan] 2025-09-09 08:03:47,824 - root - INFO - step: 20980 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.19 mfu: 49.16% global_avg_ntp_loss: 0.8093 
global_avg_top_loss: 1.9988 +[titan] 2025-09-09 08:03:47,824 - root - INFO - lr: 1.0443e-05 gnorm: 0.34 [1 day, 14:28:20<1 day, 10:52:41] +[titan] 2025-09-09 08:04:19,780 - root - INFO - step: 20985 loss: 2.7822 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.8019 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 08:04:19,780 - root - INFO - lr: 1.0440e-05 gnorm: 0.33 [1 day, 14:28:52<1 day, 10:52:07] +[titan] 2025-09-09 08:04:51,586 - root - INFO - step: 20990 loss: 2.7827 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 08:04:51,586 - root - INFO - lr: 1.0436e-05 gnorm: 0.34 [1 day, 14:29:24<1 day, 10:51:33] +[titan] 2025-09-09 08:05:04,595 - root - INFO - Dumping profiler traces at step 20992 +[titan] 2025-09-09 08:05:04,661 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 08:05:23,732 - root - INFO - step: 20995 loss: 2.7803 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7956 global_avg_top_loss: 1.9847 +[titan] 2025-09-09 08:05:23,732 - root - INFO - lr: 1.0432e-05 gnorm: 0.39 [1 day, 14:29:56<1 day, 10:50:59] +[titan] 2025-09-09 08:05:49,142 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:05:55,532 - root - INFO - step: 21000 loss: 2.7204 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.7681 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 08:05:55,533 - root - INFO - lr: 1.0429e-05 gnorm: 0.33 [1 day, 14:30:28<1 day, 10:50:25] +[titan] 2025-09-09 08:06:27,528 - root - INFO - step: 21005 loss: 2.9063 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.8637 global_avg_top_loss: 2.0426 +[titan] 2025-09-09 08:06:27,529 - root - INFO - lr: 1.0425e-05 gnorm: 0.39 [1 day, 14:31:00<1 day, 10:49:51] +[titan] 2025-09-09 08:06:59,334 - root - INFO - step: 21010 loss: 2.8169 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.02 mfu: 49.65% global_avg_ntp_loss: 0.8137 global_avg_top_loss: 2.0032 +[titan] 2025-09-09 08:06:59,335 - root - INFO - lr: 1.0422e-05 gnorm: 0.37 [1 day, 14:31:31<1 day, 10:49:17] +[titan] 2025-09-09 08:07:31,218 - root - INFO - step: 21015 loss: 3.2978 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 1.0833 global_avg_top_loss: 2.2144 +[titan] 2025-09-09 08:07:31,219 - root - INFO - lr: 1.0418e-05 gnorm: 0.34 [1 day, 14:32:03<1 day, 10:48:43] +[titan] 2025-09-09 08:08:03,278 - root - INFO - step: 21020 loss: 2.8163 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.8144 global_avg_top_loss: 2.0019 +[titan] 2025-09-09 08:08:03,278 - root - INFO - lr: 1.0415e-05 gnorm: 0.34 [1 day, 14:32:35<1 day, 10:48:09] +[titan] 2025-09-09 08:08:35,227 - root - INFO - step: 21025 loss: 2.8080 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.8089 global_avg_top_loss: 1.9991 +[titan] 2025-09-09 08:08:35,227 - root - INFO - lr: 1.0411e-05 gnorm: 0.34 [1 day, 14:33:07<1 day, 10:47:35] +[titan] 2025-09-09 08:09:07,117 - root - INFO - step: 21030 loss: 2.7775 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.8056 
global_avg_top_loss: 1.9720 +[titan] 2025-09-09 08:09:07,117 - root - INFO - lr: 1.0407e-05 gnorm: 0.34 [1 day, 14:33:39<1 day, 10:47:01] +[titan] 2025-09-09 08:09:39,062 - root - INFO - step: 21035 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9757 +[titan] 2025-09-09 08:09:39,062 - root - INFO - lr: 1.0404e-05 gnorm: 0.34 [1 day, 14:34:11<1 day, 10:46:27] +[titan] 2025-09-09 08:10:11,008 - root - INFO - step: 21040 loss: 2.7821 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.7995 global_avg_top_loss: 1.9826 +[titan] 2025-09-09 08:10:11,009 - root - INFO - lr: 1.0400e-05 gnorm: 0.35 [1 day, 14:34:43<1 day, 10:45:53] +[titan] 2025-09-09 08:10:42,829 - root - INFO - step: 21045 loss: 2.7440 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.80 mfu: 49.63% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 08:10:42,829 - root - INFO - lr: 1.0397e-05 gnorm: 0.33 [1 day, 14:35:15<1 day, 10:45:19] +[titan] 2025-09-09 08:11:08,409 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:11:14,825 - root - INFO - step: 21050 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 0.7832 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 08:11:14,826 - root - INFO - lr: 1.0393e-05 gnorm: 0.33 [1 day, 14:35:47<1 day, 10:44:45] +[titan] 2025-09-09 08:11:46,977 - root - INFO - step: 21055 loss: 2.6952 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 0.7572 global_avg_top_loss: 1.9380 +[titan] 2025-09-09 08:11:46,977 - root - INFO - lr: 1.0390e-05 gnorm: 0.34 [1 day, 14:36:19<1 day, 10:44:11] +[titan] 2025-09-09 08:12:18,872 - root - INFO - step: 21060 loss: 2.6235 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7295 global_avg_top_loss: 1.8941 +[titan] 2025-09-09 08:12:18,873 - root - INFO - lr: 1.0386e-05 gnorm: 0.45 [1 day, 14:36:51<1 day, 10:43:37] +[titan] 2025-09-09 08:12:50,792 - root - INFO - step: 21065 loss: 2.7581 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9709 +[titan] 2025-09-09 08:12:50,792 - root - INFO - lr: 1.0383e-05 gnorm: 0.34 [1 day, 14:37:23<1 day, 10:43:03] +[titan] 2025-09-09 08:13:22,704 - root - INFO - step: 21070 loss: 2.6939 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9391 +[titan] 2025-09-09 08:13:22,705 - root - INFO - lr: 1.0379e-05 gnorm: 0.34 [1 day, 14:37:55<1 day, 10:42:29] +[titan] 2025-09-09 08:13:54,590 - root - INFO - step: 21075 loss: 2.7878 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.8016 global_avg_top_loss: 1.9862 +[titan] 2025-09-09 08:13:54,590 - root - INFO - lr: 1.0375e-05 gnorm: 0.34 [1 day, 14:38:27<1 day, 10:41:55] +[titan] 2025-09-09 08:14:26,459 - root - INFO - step: 21080 loss: 2.7931 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.05 mfu: 49.55% global_avg_ntp_loss: 0.8034 
global_avg_top_loss: 1.9896 +[titan] 2025-09-09 08:14:26,460 - root - INFO - lr: 1.0372e-05 gnorm: 0.34 [1 day, 14:38:59<1 day, 10:41:21] +[titan] 2025-09-09 08:14:58,295 - root - INFO - step: 21085 loss: 3.1784 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.56 mfu: 49.60% global_avg_ntp_loss: 0.9944 global_avg_top_loss: 2.1841 +[titan] 2025-09-09 08:14:58,296 - root - INFO - lr: 1.0368e-05 gnorm: 0.37 [1 day, 14:39:30<1 day, 10:40:47] +[titan] 2025-09-09 08:15:30,318 - root - INFO - step: 21090 loss: 2.8480 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.8296 global_avg_top_loss: 2.0184 +[titan] 2025-09-09 08:15:30,318 - root - INFO - lr: 1.0365e-05 gnorm: 0.32 [1 day, 14:40:02<1 day, 10:40:13] +[titan] 2025-09-09 08:16:02,066 - root - INFO - step: 21095 loss: 3.1556 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.91 mfu: 49.74% global_avg_ntp_loss: 1.0224 global_avg_top_loss: 2.1332 +[titan] 2025-09-09 08:16:02,066 - root - INFO - lr: 1.0361e-05 gnorm: 0.33 [1 day, 14:40:34<1 day, 10:39:39] +[titan] 2025-09-09 08:16:27,563 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:16:33,905 - root - INFO - step: 21100 loss: 2.8049 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.51 mfu: 49.60% global_avg_ntp_loss: 0.8074 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 08:16:33,906 - root - INFO - lr: 1.0358e-05 gnorm: 0.36 [1 day, 14:41:06<1 day, 10:39:05] +[titan] 2025-09-09 08:17:05,759 - root - INFO - step: 21105 loss: 2.6375 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.28 mfu: 49.57% global_avg_ntp_loss: 0.7327 global_avg_top_loss: 1.9048 +[titan] 2025-09-09 08:17:05,759 - root - INFO - lr: 1.0354e-05 gnorm: 0.33 [1 day, 14:41:38<1 day, 10:38:31] +[titan] 2025-09-09 08:17:37,475 - root - INFO - step: 21110 loss: 2.7934 memory: 122.03GiB(87.57%) tps: 10,332 tflops: 492.42 mfu: 49.79% global_avg_ntp_loss: 0.7993 global_avg_top_loss: 1.9942 +[titan] 2025-09-09 08:17:37,475 - root - INFO - lr: 1.0350e-05 gnorm: 0.34 [1 day, 14:42:10<1 day, 10:37:57] +[titan] 2025-09-09 08:18:09,514 - root - INFO - step: 21115 loss: 2.7802 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9850 +[titan] 2025-09-09 08:18:09,515 - root - INFO - lr: 1.0347e-05 gnorm: 0.33 [1 day, 14:42:42<1 day, 10:37:23] +[titan] 2025-09-09 08:18:41,343 - root - INFO - step: 21120 loss: 2.7539 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 08:18:41,344 - root - INFO - lr: 1.0343e-05 gnorm: 0.35 [1 day, 14:43:13<1 day, 10:36:49] +[titan] 2025-09-09 08:19:13,081 - root - INFO - step: 21125 loss: 3.1028 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.76% global_avg_ntp_loss: 0.9473 global_avg_top_loss: 2.1555 +[titan] 2025-09-09 08:19:13,081 - root - INFO - lr: 1.0340e-05 gnorm: 0.38 [1 day, 14:43:45<1 day, 10:36:15] +[titan] 2025-09-09 08:19:45,334 - root - INFO - step: 21130 loss: 2.7767 memory: 122.03GiB(87.57%) tps: 10,160 tflops: 484.21 mfu: 48.96% global_avg_ntp_loss: 0.7932 
global_avg_top_loss: 1.9835 +[titan] 2025-09-09 08:19:45,335 - root - INFO - lr: 1.0336e-05 gnorm: 0.34 [1 day, 14:44:17<1 day, 10:35:41] +[titan] 2025-09-09 08:20:17,484 - root - INFO - step: 21135 loss: 2.7999 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.77 mfu: 49.12% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9965 +[titan] 2025-09-09 08:20:17,484 - root - INFO - lr: 1.0333e-05 gnorm: 0.33 [1 day, 14:44:50<1 day, 10:35:08] +[titan] 2025-09-09 08:20:49,386 - root - INFO - step: 21140 loss: 2.5905 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7072 global_avg_top_loss: 1.8832 +[titan] 2025-09-09 08:20:49,386 - root - INFO - lr: 1.0329e-05 gnorm: 0.43 [1 day, 14:45:21<1 day, 10:34:34] +[titan] 2025-09-09 08:21:21,367 - root - INFO - step: 21145 loss: 3.1811 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 1.0347 global_avg_top_loss: 2.1463 +[titan] 2025-09-09 08:21:21,368 - root - INFO - lr: 1.0326e-05 gnorm: 0.34 [1 day, 14:45:53<1 day, 10:34:00] +[titan] 2025-09-09 08:21:46,894 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:21:53,350 - root - INFO - step: 21150 loss: 2.8332 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.8192 global_avg_top_loss: 2.0140 +[titan] 2025-09-09 08:21:53,351 - root - INFO - lr: 1.0322e-05 gnorm: 0.37 [1 day, 14:46:25<1 day, 10:33:26] +[titan] 2025-09-09 08:22:25,343 - root - INFO - step: 21155 loss: 2.7156 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9487 +[titan] 2025-09-09 08:22:25,343 - root - INFO - lr: 1.0318e-05 gnorm: 0.37 [1 day, 14:46:57<1 day, 10:32:52] +[titan] 2025-09-09 08:22:57,111 - root - INFO - step: 21160 loss: 2.7876 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.60 mfu: 49.71% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9863 +[titan] 2025-09-09 08:22:57,112 - root - INFO - lr: 1.0315e-05 gnorm: 0.34 [1 day, 14:47:29<1 day, 10:32:18] +[titan] 2025-09-09 08:23:29,275 - root - INFO - step: 21165 loss: 3.0356 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.9197 global_avg_top_loss: 2.1159 +[titan] 2025-09-09 08:23:29,276 - root - INFO - lr: 1.0311e-05 gnorm: 0.39 [1 day, 14:48:01<1 day, 10:31:44] +[titan] 2025-09-09 08:24:01,373 - root - INFO - step: 21170 loss: 2.7337 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 08:24:01,373 - root - INFO - lr: 1.0308e-05 gnorm: 0.34 [1 day, 14:48:33<1 day, 10:31:10] +[titan] 2025-09-09 08:24:33,246 - root - INFO - step: 21175 loss: 3.2832 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.99 mfu: 49.54% global_avg_ntp_loss: 1.0789 global_avg_top_loss: 2.2042 +[titan] 2025-09-09 08:24:33,246 - root - INFO - lr: 1.0304e-05 gnorm: 0.38 [1 day, 14:49:05<1 day, 10:30:36] +[titan] 2025-09-09 08:25:05,150 - root - INFO - step: 21180 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.8007 
global_avg_top_loss: 1.9879 +[titan] 2025-09-09 08:25:05,151 - root - INFO - lr: 1.0301e-05 gnorm: 0.38 [1 day, 14:49:37<1 day, 10:30:02] +[titan] 2025-09-09 08:25:37,105 - root - INFO - step: 21185 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 08:25:37,105 - root - INFO - lr: 1.0297e-05 gnorm: 0.32 [1 day, 14:50:09<1 day, 10:29:28] +[titan] 2025-09-09 08:26:09,014 - root - INFO - step: 21190 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7990 global_avg_top_loss: 1.9894 +[titan] 2025-09-09 08:26:09,014 - root - INFO - lr: 1.0294e-05 gnorm: 0.37 [1 day, 14:50:41<1 day, 10:28:54] +[titan] 2025-09-09 08:26:40,951 - root - INFO - step: 21195 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.7866 global_avg_top_loss: 1.9743 +[titan] 2025-09-09 08:26:40,952 - root - INFO - lr: 1.0290e-05 gnorm: 0.35 [1 day, 14:51:13<1 day, 10:28:21] +[titan] 2025-09-09 08:27:06,456 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:27:12,857 - root - INFO - step: 21200 loss: 2.8088 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.8095 global_avg_top_loss: 1.9993 +[titan] 2025-09-09 08:27:12,857 - root - INFO - lr: 1.0286e-05 gnorm: 0.34 [1 day, 14:51:45<1 day, 10:27:47] +[titan] 2025-09-09 08:27:44,761 - root - INFO - step: 21205 loss: 2.8327 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.8227 global_avg_top_loss: 2.0100 +[titan] 2025-09-09 08:27:44,761 - root - INFO - lr: 1.0283e-05 gnorm: 0.36 [1 day, 14:52:17<1 day, 10:27:13] +[titan] 2025-09-09 08:28:16,646 - root - INFO - step: 21210 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 08:28:16,646 - root - INFO - lr: 1.0279e-05 gnorm: 0.37 [1 day, 14:52:49<1 day, 10:26:39] +[titan] 2025-09-09 08:28:48,498 - root - INFO - step: 21215 loss: 2.7985 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.8044 global_avg_top_loss: 1.9941 +[titan] 2025-09-09 08:28:48,498 - root - INFO - lr: 1.0276e-05 gnorm: 0.41 [1 day, 14:53:21<1 day, 10:26:05] +[titan] 2025-09-09 08:29:20,626 - root - INFO - step: 21220 loss: 2.7330 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.11 mfu: 49.15% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9564 +[titan] 2025-09-09 08:29:20,626 - root - INFO - lr: 1.0272e-05 gnorm: 0.64 [1 day, 14:53:53<1 day, 10:25:31] +[titan] 2025-09-09 08:29:52,377 - root - INFO - step: 21225 loss: 3.2566 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.87 mfu: 49.73% global_avg_ntp_loss: 1.0652 global_avg_top_loss: 2.1914 +[titan] 2025-09-09 08:29:52,377 - root - INFO - lr: 1.0269e-05 gnorm: 0.35 [1 day, 14:54:24<1 day, 10:24:57] +[titan] 2025-09-09 08:30:24,267 - root - INFO - step: 21230 loss: 2.7619 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.7897 
global_avg_top_loss: 1.9723 +[titan] 2025-09-09 08:30:24,268 - root - INFO - lr: 1.0265e-05 gnorm: 0.38 [1 day, 14:54:56<1 day, 10:24:23] +[titan] 2025-09-09 08:30:56,119 - root - INFO - step: 21235 loss: 2.7596 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9753 +[titan] 2025-09-09 08:30:56,120 - root - INFO - lr: 1.0262e-05 gnorm: 0.36 [1 day, 14:55:28<1 day, 10:23:49] +[titan] 2025-09-09 08:31:27,996 - root - INFO - step: 21240 loss: 2.6339 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.7300 global_avg_top_loss: 1.9039 +[titan] 2025-09-09 08:31:27,997 - root - INFO - lr: 1.0258e-05 gnorm: 0.37 [1 day, 14:56:00<1 day, 10:23:15] +[titan] 2025-09-09 08:31:59,922 - root - INFO - step: 21245 loss: 2.8927 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.8453 global_avg_top_loss: 2.0474 +[titan] 2025-09-09 08:31:59,922 - root - INFO - lr: 1.0254e-05 gnorm: 0.36 [1 day, 14:56:32<1 day, 10:22:41] +[titan] 2025-09-09 08:32:25,411 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:32:31,783 - root - INFO - step: 21250 loss: 2.6490 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.7374 global_avg_top_loss: 1.9116 +[titan] 2025-09-09 08:32:31,783 - root - INFO - lr: 1.0251e-05 gnorm: 0.37 [1 day, 14:57:04<1 day, 10:22:07] +[titan] 2025-09-09 08:33:03,883 - root - INFO - step: 21255 loss: 2.9289 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.52 mfu: 49.19% global_avg_ntp_loss: 0.8814 global_avg_top_loss: 2.0475 +[titan] 2025-09-09 08:33:03,884 - root - INFO - lr: 1.0247e-05 gnorm: 0.39 [1 day, 14:57:36<1 day, 10:21:33] +[titan] 2025-09-09 08:33:35,861 - root - INFO - step: 21260 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9663 +[titan] 2025-09-09 08:33:35,862 - root - INFO - lr: 1.0244e-05 gnorm: 0.41 [1 day, 14:58:08<1 day, 10:20:59] +[titan] 2025-09-09 08:34:07,792 - root - INFO - step: 21265 loss: 2.7694 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 08:34:07,792 - root - INFO - lr: 1.0240e-05 gnorm: 0.35 [1 day, 14:58:40<1 day, 10:20:25] +[titan] 2025-09-09 08:34:39,695 - root - INFO - step: 21270 loss: 3.2825 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 1.0775 global_avg_top_loss: 2.2050 +[titan] 2025-09-09 08:34:39,696 - root - INFO - lr: 1.0237e-05 gnorm: 0.36 [1 day, 14:59:12<1 day, 10:19:51] +[titan] 2025-09-09 08:35:11,643 - root - INFO - step: 21275 loss: 2.7718 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 08:35:11,643 - root - INFO - lr: 1.0233e-05 gnorm: 0.34 [1 day, 14:59:44<1 day, 10:19:17] +[titan] 2025-09-09 08:35:43,579 - root - INFO - step: 21280 loss: 2.7849 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.8014 
global_avg_top_loss: 1.9835 +[titan] 2025-09-09 08:35:43,579 - root - INFO - lr: 1.0229e-05 gnorm: 0.38 [1 day, 15:00:16<1 day, 10:18:43] +[titan] 2025-09-09 08:36:15,843 - root - INFO - step: 21285 loss: 2.8401 memory: 122.03GiB(87.57%) tps: 10,157 tflops: 484.06 mfu: 48.94% global_avg_ntp_loss: 0.8238 global_avg_top_loss: 2.0163 +[titan] 2025-09-09 08:36:15,843 - root - INFO - lr: 1.0226e-05 gnorm: 0.42 [1 day, 15:00:48<1 day, 10:18:10] +[titan] 2025-09-09 08:36:47,470 - root - INFO - step: 21290 loss: 3.1267 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.80 mfu: 49.93% global_avg_ntp_loss: 0.9904 global_avg_top_loss: 2.1363 +[titan] 2025-09-09 08:36:47,470 - root - INFO - lr: 1.0222e-05 gnorm: 0.48 [1 day, 15:01:20<1 day, 10:17:36] +[titan] 2025-09-09 08:37:19,400 - root - INFO - step: 21295 loss: 2.7485 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.7810 global_avg_top_loss: 1.9674 +[titan] 2025-09-09 08:37:19,400 - root - INFO - lr: 1.0219e-05 gnorm: 0.34 [1 day, 15:01:51<1 day, 10:17:02] +[titan] 2025-09-09 08:37:44,909 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:37:51,307 - root - INFO - step: 21300 loss: 2.6898 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7528 global_avg_top_loss: 1.9370 +[titan] 2025-09-09 08:37:51,308 - root - INFO - lr: 1.0215e-05 gnorm: 0.63 [1 day, 15:02:23<1 day, 10:16:28] +[titan] 2025-09-09 08:38:23,328 - root - INFO - step: 21305 loss: 3.2138 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 1.0453 global_avg_top_loss: 2.1684 +[titan] 2025-09-09 08:38:23,328 - root - INFO - lr: 1.0212e-05 gnorm: 0.33 [1 day, 15:02:55<1 day, 10:15:54] +[titan] 2025-09-09 08:38:55,457 - root - INFO - step: 21310 loss: 2.6523 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.08 mfu: 49.15% global_avg_ntp_loss: 0.7390 global_avg_top_loss: 1.9133 +[titan] 2025-09-09 08:38:55,458 - root - INFO - lr: 1.0208e-05 gnorm: 0.33 [1 day, 15:03:28<1 day, 10:15:20] +[titan] 2025-09-09 08:39:27,268 - root - INFO - step: 21315 loss: 2.7725 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.7902 global_avg_top_loss: 1.9823 +[titan] 2025-09-09 08:39:27,268 - root - INFO - lr: 1.0205e-05 gnorm: 0.38 [1 day, 15:03:59<1 day, 10:14:46] +[titan] 2025-09-09 08:39:59,304 - root - INFO - step: 21320 loss: 3.2155 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 1.0474 global_avg_top_loss: 2.1681 +[titan] 2025-09-09 08:39:59,305 - root - INFO - lr: 1.0201e-05 gnorm: 0.37 [1 day, 15:04:31<1 day, 10:14:12] +[titan] 2025-09-09 08:40:31,203 - root - INFO - step: 21325 loss: 2.7910 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.8014 global_avg_top_loss: 1.9895 +[titan] 2025-09-09 08:40:31,204 - root - INFO - lr: 1.0197e-05 gnorm: 0.35 [1 day, 15:05:03<1 day, 10:13:38] +[titan] 2025-09-09 08:41:02,867 - root - INFO - step: 21330 loss: 2.7792 memory: 122.03GiB(87.57%) tps: 10,349 tflops: 493.22 mfu: 49.87% global_avg_ntp_loss: 0.7908 
global_avg_top_loss: 1.9884 +[titan] 2025-09-09 08:41:02,868 - root - INFO - lr: 1.0194e-05 gnorm: 0.75 [1 day, 15:05:35<1 day, 10:13:04] +[titan] 2025-09-09 08:41:34,736 - root - INFO - step: 21335 loss: 2.8769 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.05 mfu: 49.55% global_avg_ntp_loss: 0.8415 global_avg_top_loss: 2.0354 +[titan] 2025-09-09 08:41:34,737 - root - INFO - lr: 1.0190e-05 gnorm: 0.36 [1 day, 15:06:07<1 day, 10:12:30] +[titan] 2025-09-09 08:42:06,671 - root - INFO - step: 21340 loss: 2.8258 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.8173 global_avg_top_loss: 2.0085 +[titan] 2025-09-09 08:42:06,672 - root - INFO - lr: 1.0187e-05 gnorm: 0.35 [1 day, 15:06:39<1 day, 10:11:56] +[titan] 2025-09-09 08:42:39,089 - root - INFO - step: 21345 loss: 2.7732 memory: 122.03GiB(87.57%) tps: 10,109 tflops: 481.76 mfu: 48.71% global_avg_ntp_loss: 0.7933 global_avg_top_loss: 1.9799 +[titan] 2025-09-09 08:42:39,089 - root - INFO - lr: 1.0183e-05 gnorm: 0.35 [1 day, 15:07:11<1 day, 10:11:23] +[titan] 2025-09-09 08:43:04,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:43:11,000 - root - INFO - step: 21350 loss: 3.1695 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 1.0284 global_avg_top_loss: 2.1411 +[titan] 2025-09-09 08:43:11,000 - root - INFO - lr: 1.0180e-05 gnorm: 0.44 [1 day, 15:07:43<1 day, 10:10:49] +[titan] 2025-09-09 08:43:43,087 - root - INFO - step: 21355 loss: 2.8161 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.8140 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 08:43:43,088 - root - INFO - lr: 1.0176e-05 gnorm: 0.34 [1 day, 15:08:15<1 day, 10:10:15] +[titan] 2025-09-09 08:44:15,025 - root - INFO - step: 21360 loss: 2.8946 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.8617 global_avg_top_loss: 2.0329 +[titan] 2025-09-09 08:44:15,026 - root - INFO - lr: 1.0173e-05 gnorm: 0.53 [1 day, 15:08:47<1 day, 10:09:41] +[titan] 2025-09-09 08:44:47,038 - root - INFO - step: 21365 loss: 2.8272 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.8180 global_avg_top_loss: 2.0092 +[titan] 2025-09-09 08:44:47,038 - root - INFO - lr: 1.0169e-05 gnorm: 0.37 [1 day, 15:09:19<1 day, 10:09:07] +[titan] 2025-09-09 08:45:19,179 - root - INFO - step: 21370 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 08:45:19,179 - root - INFO - lr: 1.0165e-05 gnorm: 0.58 [1 day, 15:09:51<1 day, 10:08:34] +[titan] 2025-09-09 08:45:51,007 - root - INFO - step: 21375 loss: 2.6926 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9364 +[titan] 2025-09-09 08:45:51,008 - root - INFO - lr: 1.0162e-05 gnorm: 0.35 [1 day, 15:10:23<1 day, 10:08:00] +[titan] 2025-09-09 08:46:22,977 - root - INFO - step: 21380 loss: 3.1665 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 1.0248 
global_avg_top_loss: 2.1417 +[titan] 2025-09-09 08:46:22,978 - root - INFO - lr: 1.0158e-05 gnorm: 0.38 [1 day, 15:10:55<1 day, 10:07:26] +[titan] 2025-09-09 08:46:54,724 - root - INFO - step: 21385 loss: 3.1693 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.94 mfu: 49.74% global_avg_ntp_loss: 1.0278 global_avg_top_loss: 2.1416 +[titan] 2025-09-09 08:46:54,725 - root - INFO - lr: 1.0155e-05 gnorm: 0.38 [1 day, 15:11:27<1 day, 10:06:52] +[titan] 2025-09-09 08:47:26,715 - root - INFO - step: 21390 loss: 2.7132 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9475 +[titan] 2025-09-09 08:47:26,716 - root - INFO - lr: 1.0151e-05 gnorm: 0.35 [1 day, 15:11:59<1 day, 10:06:18] +[titan] 2025-09-09 08:47:58,747 - root - INFO - step: 21395 loss: 2.8333 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.8238 global_avg_top_loss: 2.0096 +[titan] 2025-09-09 08:47:58,747 - root - INFO - lr: 1.0148e-05 gnorm: 0.38 [1 day, 15:12:31<1 day, 10:05:44] +[titan] 2025-09-09 08:48:24,378 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:48:30,698 - root - INFO - step: 21400 loss: 3.2217 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.79 mfu: 49.42% global_avg_ntp_loss: 1.0531 global_avg_top_loss: 2.1686 +[titan] 2025-09-09 08:48:30,698 - root - INFO - lr: 1.0144e-05 gnorm: 0.49 [1 day, 15:13:03<1 day, 10:05:10] +[titan] 2025-09-09 08:49:02,771 - root - INFO - step: 21405 loss: 2.7985 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.8028 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 08:49:02,772 - root - INFO - lr: 1.0141e-05 gnorm: 0.38 [1 day, 15:13:35<1 day, 10:04:36] +[titan] 2025-09-09 08:49:34,717 - root - INFO - step: 21410 loss: 2.7025 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7627 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 08:49:34,717 - root - INFO - lr: 1.0137e-05 gnorm: 0.39 [1 day, 15:14:07<1 day, 10:04:02] +[titan] 2025-09-09 08:50:06,822 - root - INFO - step: 21415 loss: 3.2043 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.19% global_avg_ntp_loss: 1.0499 global_avg_top_loss: 2.1544 +[titan] 2025-09-09 08:50:06,823 - root - INFO - lr: 1.0134e-05 gnorm: 0.41 [1 day, 15:14:39<1 day, 10:03:29] +[titan] 2025-09-09 08:50:38,964 - root - INFO - step: 21420 loss: 2.7892 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9887 +[titan] 2025-09-09 08:50:38,964 - root - INFO - lr: 1.0130e-05 gnorm: 0.41 [1 day, 15:15:11<1 day, 10:02:55] +[titan] 2025-09-09 08:51:10,696 - root - INFO - step: 21425 loss: 2.8082 memory: 122.03GiB(87.57%) tps: 10,327 tflops: 492.17 mfu: 49.76% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9990 +[titan] 2025-09-09 08:51:10,696 - root - INFO - lr: 1.0126e-05 gnorm: 0.33 [1 day, 15:15:43<1 day, 10:02:21] +[titan] 2025-09-09 08:51:42,687 - root - INFO - step: 21430 loss: 3.2729 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 1.0723 
global_avg_top_loss: 2.2005 +[titan] 2025-09-09 08:51:42,687 - root - INFO - lr: 1.0123e-05 gnorm: 0.44 [1 day, 15:16:15<1 day, 10:01:47] +[titan] 2025-09-09 08:52:14,594 - root - INFO - step: 21435 loss: 2.9409 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.8806 global_avg_top_loss: 2.0603 +[titan] 2025-09-09 08:52:14,595 - root - INFO - lr: 1.0119e-05 gnorm: 0.36 [1 day, 15:16:47<1 day, 10:01:13] +[titan] 2025-09-09 08:52:46,674 - root - INFO - step: 21440 loss: 2.7742 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7939 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 08:52:46,675 - root - INFO - lr: 1.0116e-05 gnorm: 0.34 [1 day, 15:17:19<1 day, 10:00:39] +[titan] 2025-09-09 08:53:18,594 - root - INFO - step: 21445 loss: 2.9764 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.9063 global_avg_top_loss: 2.0701 +[titan] 2025-09-09 08:53:18,594 - root - INFO - lr: 1.0112e-05 gnorm: 0.37 [1 day, 15:17:51<1 day, 10:00:05] +[titan] 2025-09-09 08:53:44,001 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:53:50,615 - root - INFO - step: 21450 loss: 2.8811 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.8518 global_avg_top_loss: 2.0293 +[titan] 2025-09-09 08:53:50,616 - root - INFO - lr: 1.0109e-05 gnorm: 0.59 [1 day, 15:18:23<1 day, 9:59:32] +[titan] 2025-09-09 08:54:23,030 - root - INFO - step: 21455 loss: 2.7564 memory: 122.03GiB(87.57%) tps: 10,109 tflops: 481.79 mfu: 48.72% global_avg_ntp_loss: 0.7852 global_avg_top_loss: 1.9713 +[titan] 2025-09-09 08:54:23,031 - root - INFO - lr: 1.0105e-05 gnorm: 0.35 [1 day, 15:18:55<1 day, 9:58:58] +[titan] 2025-09-09 08:54:55,019 - root - INFO - step: 21460 loss: 2.7226 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.36% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 08:54:55,019 - root - INFO - lr: 1.0102e-05 gnorm: 0.34 [1 day, 15:19:27<1 day, 9:58:24] +[titan] 2025-09-09 08:55:27,041 - root - INFO - step: 21465 loss: 2.7373 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 08:55:27,041 - root - INFO - lr: 1.0098e-05 gnorm: 0.33 [1 day, 15:19:59<1 day, 9:57:51] +[titan] 2025-09-09 08:55:59,128 - root - INFO - step: 21470 loss: 2.7267 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 08:55:59,128 - root - INFO - lr: 1.0094e-05 gnorm: 0.34 [1 day, 15:20:31<1 day, 9:57:17] +[titan] 2025-09-09 08:56:30,924 - root - INFO - step: 21475 loss: 2.7791 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7971 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 08:56:30,924 - root - INFO - lr: 1.0091e-05 gnorm: 0.33 [1 day, 15:21:03<1 day, 9:56:43] +[titan] 2025-09-09 08:57:03,162 - root - INFO - step: 21480 loss: 3.2365 memory: 122.03GiB(87.57%) tps: 10,165 tflops: 484.44 mfu: 48.98% global_avg_ntp_loss: 1.0599 
global_avg_top_loss: 2.1767 +[titan] 2025-09-09 08:57:03,162 - root - INFO - lr: 1.0087e-05 gnorm: 0.36 [1 day, 15:21:35<1 day, 9:56:09] +[titan] 2025-09-09 08:57:35,295 - root - INFO - step: 21485 loss: 2.8410 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.02 mfu: 49.14% global_avg_ntp_loss: 0.8233 global_avg_top_loss: 2.0177 +[titan] 2025-09-09 08:57:35,295 - root - INFO - lr: 1.0084e-05 gnorm: 0.33 [1 day, 15:22:07<1 day, 9:55:35] +[titan] 2025-09-09 08:58:07,572 - root - INFO - step: 21490 loss: 2.7830 memory: 122.03GiB(87.57%) tps: 10,152 tflops: 483.85 mfu: 48.92% global_avg_ntp_loss: 0.8010 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 08:58:07,573 - root - INFO - lr: 1.0080e-05 gnorm: 0.39 [1 day, 15:22:40<1 day, 9:55:02] +[titan] 2025-09-09 08:58:39,731 - root - INFO - step: 21495 loss: 3.1936 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 1.0459 global_avg_top_loss: 2.1477 +[titan] 2025-09-09 08:58:39,731 - root - INFO - lr: 1.0077e-05 gnorm: 0.45 [1 day, 15:23:12<1 day, 9:54:28] +[titan] 2025-09-09 08:59:05,215 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 08:59:11,695 - root - INFO - step: 21500 loss: 2.7727 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9801 +[titan] 2025-09-09 08:59:11,695 - root - INFO - lr: 1.0073e-05 gnorm: 0.34 [1 day, 15:23:44<1 day, 9:53:54] +[titan] 2025-09-09 08:59:37,553 - root - INFO - Dumping profiler traces at step 21504 +[titan] 2025-09-09 08:59:37,604 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-09 08:59:43,999 - root - INFO - step: 21505 loss: 2.8412 memory: 122.03GiB(87.57%) tps: 10,144 tflops: 483.45 mfu: 48.88% global_avg_ntp_loss: 0.8224 global_avg_top_loss: 2.0188 +[titan] 2025-09-09 08:59:43,999 - root - INFO - lr: 1.0070e-05 gnorm: 0.35 [1 day, 15:24:16<1 day, 9:53:21] +[titan] 2025-09-09 09:00:16,226 - root - INFO - step: 21510 loss: 3.1718 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 1.0284 global_avg_top_loss: 2.1434 +[titan] 2025-09-09 09:00:16,227 - root - INFO - lr: 1.0066e-05 gnorm: 0.53 [1 day, 15:24:48<1 day, 9:52:47] +[titan] 2025-09-09 09:00:48,288 - root - INFO - step: 21515 loss: 2.8465 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.8217 global_avg_top_loss: 2.0248 +[titan] 2025-09-09 09:00:48,289 - root - INFO - lr: 1.0062e-05 gnorm: 0.36 [1 day, 15:25:20<1 day, 9:52:13] +[titan] 2025-09-09 09:01:20,370 - root - INFO - step: 21520 loss: 2.7681 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9780 +[titan] 2025-09-09 09:01:20,370 - root - INFO - lr: 1.0059e-05 gnorm: 0.36 [1 day, 15:25:52<1 day, 9:51:40] +[titan] 2025-09-09 09:01:52,291 - root - INFO - step: 21525 loss: 2.8162 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.8160 global_avg_top_loss: 2.0002 +[titan] 2025-09-09 09:01:52,292 - root - INFO - lr: 1.0055e-05 gnorm: 0.38 [1 day, 15:26:24<1 day, 
9:51:06] +[titan] 2025-09-09 09:02:24,640 - root - INFO - step: 21530 loss: 3.1600 memory: 122.03GiB(87.57%) tps: 10,130 tflops: 482.78 mfu: 48.82% global_avg_ntp_loss: 1.0201 global_avg_top_loss: 2.1399 +[titan] 2025-09-09 09:02:24,641 - root - INFO - lr: 1.0052e-05 gnorm: 0.34 [1 day, 15:26:57<1 day, 9:50:32] +[titan] 2025-09-09 09:02:56,773 - root - INFO - step: 21535 loss: 2.8205 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.8149 global_avg_top_loss: 2.0056 +[titan] 2025-09-09 09:02:56,773 - root - INFO - lr: 1.0048e-05 gnorm: 0.34 [1 day, 15:27:29<1 day, 9:49:58] +[titan] 2025-09-09 09:03:28,861 - root - INFO - step: 21540 loss: 2.8180 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.8195 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 09:03:28,862 - root - INFO - lr: 1.0045e-05 gnorm: 0.34 [1 day, 15:28:01<1 day, 9:49:25] +[titan] 2025-09-09 09:04:00,701 - root - INFO - step: 21545 loss: 2.6384 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7275 global_avg_top_loss: 1.9108 +[titan] 2025-09-09 09:04:00,701 - root - INFO - lr: 1.0041e-05 gnorm: 0.46 [1 day, 15:28:33<1 day, 9:48:51] +[titan] 2025-09-09 09:04:26,260 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:04:32,679 - root - INFO - step: 21550 loss: 2.8645 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.8352 global_avg_top_loss: 2.0293 +[titan] 2025-09-09 09:04:32,679 - root - INFO - lr: 1.0038e-05 gnorm: 0.34 [1 day, 15:29:05<1 day, 9:48:17] +[titan] 2025-09-09 09:05:04,604 - root - INFO - step: 21555 loss: 2.7747 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 09:05:04,604 - root - INFO - lr: 1.0034e-05 gnorm: 0.34 [1 day, 15:29:37<1 day, 9:47:43] +[titan] 2025-09-09 09:05:36,652 - root - INFO - step: 21560 loss: 3.2656 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 1.0721 global_avg_top_loss: 2.1935 +[titan] 2025-09-09 09:05:36,652 - root - INFO - lr: 1.0031e-05 gnorm: 0.37 [1 day, 15:30:09<1 day, 9:47:09] +[titan] 2025-09-09 09:06:08,695 - root - INFO - step: 21565 loss: 2.8106 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 2.0091 +[titan] 2025-09-09 09:06:08,695 - root - INFO - lr: 1.0027e-05 gnorm: 0.47 [1 day, 15:30:41<1 day, 9:46:35] +[titan] 2025-09-09 09:06:40,846 - root - INFO - step: 21570 loss: 2.7767 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 0.7938 global_avg_top_loss: 1.9830 +[titan] 2025-09-09 09:06:40,846 - root - INFO - lr: 1.0023e-05 gnorm: 0.33 [1 day, 15:31:13<1 day, 9:46:02] +[titan] 2025-09-09 09:07:12,820 - root - INFO - step: 21575 loss: 3.2102 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 1.0477 global_avg_top_loss: 2.1625 +[titan] 2025-09-09 09:07:12,821 - root - INFO - lr: 1.0020e-05 gnorm: 0.36 [1 day, 15:31:45<1 day, 9:45:28] +[titan] 2025-09-09 09:07:44,834 - root - INFO - step: 21580 loss: 2.8018 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7931 
global_avg_top_loss: 2.0087 +[titan] 2025-09-09 09:07:44,834 - root - INFO - lr: 1.0016e-05 gnorm: 1.13 [1 day, 15:32:17<1 day, 9:44:54] +[titan] 2025-09-09 09:08:16,906 - root - INFO - step: 21585 loss: 2.8776 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.8386 global_avg_top_loss: 2.0390 +[titan] 2025-09-09 09:08:16,906 - root - INFO - lr: 1.0013e-05 gnorm: 0.37 [1 day, 15:32:49<1 day, 9:44:20] +[titan] 2025-09-09 09:08:48,956 - root - INFO - step: 21590 loss: 2.7611 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 09:08:48,957 - root - INFO - lr: 1.0009e-05 gnorm: 0.35 [1 day, 15:33:21<1 day, 9:43:47] +[titan] 2025-09-09 09:09:20,964 - root - INFO - step: 21595 loss: 2.6858 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9303 +[titan] 2025-09-09 09:09:20,964 - root - INFO - lr: 1.0006e-05 gnorm: 0.34 [1 day, 15:33:53<1 day, 9:43:13] +[titan] 2025-09-09 09:09:46,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:09:53,062 - root - INFO - step: 21600 loss: 2.7319 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9565 +[titan] 2025-09-09 09:09:53,063 - root - INFO - lr: 1.0002e-05 gnorm: 0.34 [1 day, 15:34:25<1 day, 9:42:39] +[titan] 2025-09-09 09:10:25,171 - root - INFO - step: 21605 loss: 2.8199 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.8112 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 09:10:25,171 - root - INFO - lr: 9.9986e-06 gnorm: 0.34 [1 day, 15:34:57<1 day, 9:42:05] +[titan] 2025-09-09 09:10:57,199 - root - INFO - step: 21610 loss: 3.2078 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 1.0454 global_avg_top_loss: 2.1623 +[titan] 2025-09-09 09:10:57,199 - root - INFO - lr: 9.9950e-06 gnorm: 0.34 [1 day, 15:35:29<1 day, 9:41:32] +[titan] 2025-09-09 09:11:29,325 - root - INFO - step: 21615 loss: 2.8016 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.13 mfu: 49.15% global_avg_ntp_loss: 0.8056 global_avg_top_loss: 1.9959 +[titan] 2025-09-09 09:11:29,325 - root - INFO - lr: 9.9915e-06 gnorm: 0.36 [1 day, 15:36:01<1 day, 9:40:58] +[titan] 2025-09-09 09:12:01,269 - root - INFO - step: 21620 loss: 2.7403 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.7775 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 09:12:01,270 - root - INFO - lr: 9.9879e-06 gnorm: 0.36 [1 day, 15:36:33<1 day, 9:40:24] +[titan] 2025-09-09 09:12:33,188 - root - INFO - step: 21625 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.29 mfu: 49.47% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 09:12:33,188 - root - INFO - lr: 9.9844e-06 gnorm: 0.35 [1 day, 15:37:05<1 day, 9:39:50] +[titan] 2025-09-09 09:13:04,993 - root - INFO - step: 21630 loss: 2.7973 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.04 mfu: 49.65% global_avg_ntp_loss: 0.8051 
global_avg_top_loss: 1.9922 +[titan] 2025-09-09 09:13:04,993 - root - INFO - lr: 9.9808e-06 gnorm: 0.34 [1 day, 15:37:37<1 day, 9:39:16] +[titan] 2025-09-09 09:13:37,258 - root - INFO - step: 21635 loss: 2.8207 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.04 mfu: 48.94% global_avg_ntp_loss: 0.8149 global_avg_top_loss: 2.0058 +[titan] 2025-09-09 09:13:37,258 - root - INFO - lr: 9.9773e-06 gnorm: 0.33 [1 day, 15:38:09<1 day, 9:38:43] +[titan] 2025-09-09 09:14:09,247 - root - INFO - step: 21640 loss: 3.2058 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 1.0431 global_avg_top_loss: 2.1627 +[titan] 2025-09-09 09:14:09,247 - root - INFO - lr: 9.9737e-06 gnorm: 0.35 [1 day, 15:38:41<1 day, 9:38:09] +[titan] 2025-09-09 09:14:41,094 - root - INFO - step: 21645 loss: 2.7891 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9886 +[titan] 2025-09-09 09:14:41,094 - root - INFO - lr: 9.9702e-06 gnorm: 0.33 [1 day, 15:39:13<1 day, 9:37:35] +[titan] 2025-09-09 09:15:06,673 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:15:13,183 - root - INFO - step: 21650 loss: 2.8429 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.8245 global_avg_top_loss: 2.0184 +[titan] 2025-09-09 09:15:13,183 - root - INFO - lr: 9.9667e-06 gnorm: 0.34 [1 day, 15:39:45<1 day, 9:37:01] +[titan] 2025-09-09 09:15:45,108 - root - INFO - step: 21655 loss: 2.7378 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 09:15:45,109 - root - INFO - lr: 9.9631e-06 gnorm: 0.34 [1 day, 15:40:17<1 day, 9:36:27] +[titan] 2025-09-09 09:16:17,071 - root - INFO - step: 21660 loss: 2.7385 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.62 mfu: 49.41% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 09:16:17,071 - root - INFO - lr: 9.9596e-06 gnorm: 0.33 [1 day, 15:40:49<1 day, 9:35:53] +[titan] 2025-09-09 09:16:48,949 - root - INFO - step: 21665 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9919 +[titan] 2025-09-09 09:16:48,950 - root - INFO - lr: 9.9560e-06 gnorm: 0.35 [1 day, 15:41:21<1 day, 9:35:19] +[titan] 2025-09-09 09:17:20,890 - root - INFO - step: 21670 loss: 2.8056 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.8157 global_avg_top_loss: 1.9900 +[titan] 2025-09-09 09:17:20,890 - root - INFO - lr: 9.9525e-06 gnorm: 0.33 [1 day, 15:41:53<1 day, 9:34:46] +[titan] 2025-09-09 09:17:53,055 - root - INFO - step: 21675 loss: 3.1147 memory: 122.03GiB(87.57%) tps: 10,188 tflops: 485.54 mfu: 49.09% global_avg_ntp_loss: 0.9845 global_avg_top_loss: 2.1303 +[titan] 2025-09-09 09:17:53,055 - root - INFO - lr: 9.9489e-06 gnorm: 0.42 [1 day, 15:42:25<1 day, 9:34:12] +[titan] 2025-09-09 09:18:25,060 - root - INFO - step: 21680 loss: 2.8162 memory: 122.03GiB(87.57%) tps: 10,239 tflops: 487.97 mfu: 49.34% global_avg_ntp_loss: 0.8134 
global_avg_top_loss: 2.0027 +[titan] 2025-09-09 09:18:25,060 - root - INFO - lr: 9.9454e-06 gnorm: 0.33 [1 day, 15:42:57<1 day, 9:33:38] +[titan] 2025-09-09 09:18:57,218 - root - INFO - step: 21685 loss: 2.7674 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.7897 global_avg_top_loss: 1.9777 +[titan] 2025-09-09 09:18:57,219 - root - INFO - lr: 9.9418e-06 gnorm: 0.34 [1 day, 15:43:29<1 day, 9:33:04] +[titan] 2025-09-09 09:19:29,260 - root - INFO - step: 21690 loss: 3.2513 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 1.0636 global_avg_top_loss: 2.1877 +[titan] 2025-09-09 09:19:29,261 - root - INFO - lr: 9.9383e-06 gnorm: 0.36 [1 day, 15:44:01<1 day, 9:32:31] +[titan] 2025-09-09 09:20:01,389 - root - INFO - step: 21695 loss: 2.7729 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.09 mfu: 49.15% global_avg_ntp_loss: 0.7945 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 09:20:01,390 - root - INFO - lr: 9.9347e-06 gnorm: 0.35 [1 day, 15:44:33<1 day, 9:31:57] +[titan] 2025-09-09 09:20:27,089 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:20:33,537 - root - INFO - step: 21700 loss: 3.2302 memory: 122.03GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 1.0528 global_avg_top_loss: 2.1775 +[titan] 2025-09-09 09:20:33,538 - root - INFO - lr: 9.9312e-06 gnorm: 0.39 [1 day, 15:45:06<1 day, 9:31:23] +[titan] 2025-09-09 09:21:05,525 - root - INFO - step: 21705 loss: 2.8747 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.8379 global_avg_top_loss: 2.0368 +[titan] 2025-09-09 09:21:05,525 - root - INFO - lr: 9.9277e-06 gnorm: 0.35 [1 day, 15:45:37<1 day, 9:30:50] +[titan] 2025-09-09 09:21:37,445 - root - INFO - step: 21710 loss: 2.7781 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7988 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 09:21:37,445 - root - INFO - lr: 9.9241e-06 gnorm: 0.34 [1 day, 15:46:09<1 day, 9:30:16] +[titan] 2025-09-09 09:22:09,456 - root - INFO - step: 21715 loss: 2.7318 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 09:22:09,457 - root - INFO - lr: 9.9206e-06 gnorm: 0.34 [1 day, 15:46:41<1 day, 9:29:42] +[titan] 2025-09-09 09:22:41,500 - root - INFO - step: 21720 loss: 2.9010 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.8719 global_avg_top_loss: 2.0291 +[titan] 2025-09-09 09:22:41,500 - root - INFO - lr: 9.9170e-06 gnorm: 0.35 [1 day, 15:47:13<1 day, 9:29:08] +[titan] 2025-09-09 09:23:13,407 - root - INFO - step: 21725 loss: 2.8291 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.8200 global_avg_top_loss: 2.0091 +[titan] 2025-09-09 09:23:13,407 - root - INFO - lr: 9.9135e-06 gnorm: 0.34 [1 day, 15:47:45<1 day, 9:28:34] +[titan] 2025-09-09 09:23:45,374 - root - INFO - step: 21730 loss: 2.8107 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.8108 
global_avg_top_loss: 2.0000 +[titan] 2025-09-09 09:23:45,375 - root - INFO - lr: 9.9099e-06 gnorm: 0.46 [1 day, 15:48:17<1 day, 9:28:00] +[titan] 2025-09-09 09:24:17,342 - root - INFO - step: 21735 loss: 2.7608 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7838 global_avg_top_loss: 1.9770 +[titan] 2025-09-09 09:24:17,342 - root - INFO - lr: 9.9064e-06 gnorm: 0.33 [1 day, 15:48:49<1 day, 9:27:27] +[titan] 2025-09-09 09:24:49,459 - root - INFO - step: 21740 loss: 2.6843 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.7508 global_avg_top_loss: 1.9335 +[titan] 2025-09-09 09:24:49,460 - root - INFO - lr: 9.9028e-06 gnorm: 0.33 [1 day, 15:49:21<1 day, 9:26:53] +[titan] 2025-09-09 09:25:21,362 - root - INFO - step: 21745 loss: 2.7867 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9883 +[titan] 2025-09-09 09:25:21,362 - root - INFO - lr: 9.8993e-06 gnorm: 0.33 [1 day, 15:49:53<1 day, 9:26:19] +[titan] 2025-09-09 09:25:46,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:25:53,228 - root - INFO - step: 21750 loss: 2.7778 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9848 +[titan] 2025-09-09 09:25:53,228 - root - INFO - lr: 9.8958e-06 gnorm: 0.35 [1 day, 15:50:25<1 day, 9:25:45] +[titan] 2025-09-09 09:26:25,445 - root - INFO - step: 21755 loss: 3.2527 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 1.0648 global_avg_top_loss: 2.1878 +[titan] 2025-09-09 09:26:25,445 - root - INFO - lr: 9.8922e-06 gnorm: 0.37 [1 day, 15:50:57<1 day, 9:25:12] +[titan] 2025-09-09 09:26:57,489 - root - INFO - step: 21760 loss: 2.8124 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.8115 global_avg_top_loss: 2.0009 +[titan] 2025-09-09 09:26:57,489 - root - INFO - lr: 9.8887e-06 gnorm: 0.34 [1 day, 15:51:29<1 day, 9:24:38] +[titan] 2025-09-09 09:27:29,539 - root - INFO - step: 21765 loss: 2.7269 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9549 +[titan] 2025-09-09 09:27:29,539 - root - INFO - lr: 9.8851e-06 gnorm: 0.33 [1 day, 15:52:01<1 day, 9:24:04] +[titan] 2025-09-09 09:28:01,435 - root - INFO - step: 21770 loss: 3.2764 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 1.0767 global_avg_top_loss: 2.1997 +[titan] 2025-09-09 09:28:01,435 - root - INFO - lr: 9.8816e-06 gnorm: 0.36 [1 day, 15:52:33<1 day, 9:23:30] +[titan] 2025-09-09 09:28:33,547 - root - INFO - step: 21775 loss: 2.8499 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.34 mfu: 49.18% global_avg_ntp_loss: 0.8312 global_avg_top_loss: 2.0187 +[titan] 2025-09-09 09:28:33,547 - root - INFO - lr: 9.8781e-06 gnorm: 0.34 [1 day, 15:53:05<1 day, 9:22:57] +[titan] 2025-09-09 09:29:05,222 - root - INFO - step: 21780 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.05 mfu: 49.85% global_avg_ntp_loss: 0.8028 
global_avg_top_loss: 1.9935 +[titan] 2025-09-09 09:29:05,223 - root - INFO - lr: 9.8745e-06 gnorm: 0.35 [1 day, 15:53:37<1 day, 9:22:22] +[titan] 2025-09-09 09:29:37,193 - root - INFO - step: 21785 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 0.8093 global_avg_top_loss: 1.9969 +[titan] 2025-09-09 09:29:37,193 - root - INFO - lr: 9.8710e-06 gnorm: 0.34 [1 day, 15:54:09<1 day, 9:21:49] +[titan] 2025-09-09 09:30:08,984 - root - INFO - step: 21790 loss: 2.7962 memory: 122.03GiB(87.57%) tps: 10,308 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.8009 global_avg_top_loss: 1.9953 +[titan] 2025-09-09 09:30:08,984 - root - INFO - lr: 9.8674e-06 gnorm: 0.35 [1 day, 15:54:41<1 day, 9:21:15] +[titan] 2025-09-09 09:30:40,810 - root - INFO - step: 21795 loss: 2.7956 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.8059 global_avg_top_loss: 1.9897 +[titan] 2025-09-09 09:30:40,810 - root - INFO - lr: 9.8639e-06 gnorm: 0.32 [1 day, 15:55:13<1 day, 9:20:41] +[titan] 2025-09-09 09:31:06,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:31:12,680 - root - INFO - step: 21800 loss: 2.7703 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.04 mfu: 49.55% global_avg_ntp_loss: 0.7934 global_avg_top_loss: 1.9769 +[titan] 2025-09-09 09:31:12,680 - root - INFO - lr: 9.8603e-06 gnorm: 0.43 [1 day, 15:55:45<1 day, 9:20:07] +[titan] 2025-09-09 09:31:44,624 - root - INFO - step: 21805 loss: 2.7569 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 09:31:44,624 - root - INFO - lr: 9.8568e-06 gnorm: 0.32 [1 day, 15:56:17<1 day, 9:19:33] +[titan] 2025-09-09 09:32:16,374 - root - INFO - step: 21810 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,321 tflops: 491.88 mfu: 49.74% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 09:32:16,375 - root - INFO - lr: 9.8533e-06 gnorm: 0.35 [1 day, 15:56:48<1 day, 9:18:59] +[titan] 2025-09-09 09:32:48,303 - root - INFO - step: 21815 loss: 2.6446 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 09:32:48,304 - root - INFO - lr: 9.8497e-06 gnorm: 0.33 [1 day, 15:57:20<1 day, 9:18:25] +[titan] 2025-09-09 09:33:20,058 - root - INFO - step: 21820 loss: 2.8282 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.81 mfu: 49.73% global_avg_ntp_loss: 0.8196 global_avg_top_loss: 2.0086 +[titan] 2025-09-09 09:33:20,058 - root - INFO - lr: 9.8462e-06 gnorm: 0.35 [1 day, 15:57:52<1 day, 9:17:51] +[titan] 2025-09-09 09:33:52,089 - root - INFO - step: 21825 loss: 2.8589 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.8374 global_avg_top_loss: 2.0214 +[titan] 2025-09-09 09:33:52,090 - root - INFO - lr: 9.8426e-06 gnorm: 0.35 [1 day, 15:58:24<1 day, 9:17:18] +[titan] 2025-09-09 09:34:23,933 - root - INFO - step: 21830 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7878 
global_avg_top_loss: 1.9732 +[titan] 2025-09-09 09:34:23,934 - root - INFO - lr: 9.8391e-06 gnorm: 0.33 [1 day, 15:58:56<1 day, 9:16:44] +[titan] 2025-09-09 09:34:56,009 - root - INFO - step: 21835 loss: 3.2460 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 1.0598 global_avg_top_loss: 2.1862 +[titan] 2025-09-09 09:34:56,010 - root - INFO - lr: 9.8356e-06 gnorm: 0.36 [1 day, 15:59:28<1 day, 9:16:10] +[titan] 2025-09-09 09:35:27,984 - root - INFO - step: 21840 loss: 2.8116 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.8206 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 09:35:27,984 - root - INFO - lr: 9.8320e-06 gnorm: 0.33 [1 day, 16:00:00<1 day, 9:15:36] +[titan] 2025-09-09 09:35:59,850 - root - INFO - step: 21845 loss: 2.8150 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.56% global_avg_ntp_loss: 0.8118 global_avg_top_loss: 2.0031 +[titan] 2025-09-09 09:35:59,850 - root - INFO - lr: 9.8285e-06 gnorm: 0.34 [1 day, 16:00:32<1 day, 9:15:02] +[titan] 2025-09-09 09:36:25,266 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:36:31,658 - root - INFO - step: 21850 loss: 3.1993 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.98 mfu: 49.64% global_avg_ntp_loss: 1.0417 global_avg_top_loss: 2.1576 +[titan] 2025-09-09 09:36:31,658 - root - INFO - lr: 9.8249e-06 gnorm: 0.40 [1 day, 16:01:04<1 day, 9:14:28] +[titan] 2025-09-09 09:37:03,729 - root - INFO - step: 21855 loss: 2.8371 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0146 +[titan] 2025-09-09 09:37:03,729 - root - INFO - lr: 9.8214e-06 gnorm: 0.33 [1 day, 16:01:36<1 day, 9:13:55] +[titan] 2025-09-09 09:37:35,660 - root - INFO - step: 21860 loss: 2.8166 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.8123 global_avg_top_loss: 2.0043 +[titan] 2025-09-09 09:37:35,660 - root - INFO - lr: 9.8179e-06 gnorm: 0.35 [1 day, 16:02:08<1 day, 9:13:21] +[titan] 2025-09-09 09:38:07,597 - root - INFO - step: 21865 loss: 3.2084 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 1.0441 global_avg_top_loss: 2.1643 +[titan] 2025-09-09 09:38:07,597 - root - INFO - lr: 9.8143e-06 gnorm: 0.41 [1 day, 16:02:40<1 day, 9:12:47] +[titan] 2025-09-09 09:38:39,439 - root - INFO - step: 21870 loss: 2.7741 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9797 +[titan] 2025-09-09 09:38:39,439 - root - INFO - lr: 9.8108e-06 gnorm: 0.33 [1 day, 16:03:11<1 day, 9:12:13] +[titan] 2025-09-09 09:39:11,570 - root - INFO - step: 21875 loss: 2.7826 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.7949 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 09:39:11,570 - root - INFO - lr: 9.8072e-06 gnorm: 0.36 [1 day, 16:03:43<1 day, 9:11:39] +[titan] 2025-09-09 09:39:43,333 - root - INFO - step: 21880 loss: 2.7948 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.68 mfu: 49.71% global_avg_ntp_loss: 0.8035 
global_avg_top_loss: 1.9913 +[titan] 2025-09-09 09:39:43,334 - root - INFO - lr: 9.8037e-06 gnorm: 0.35 [1 day, 16:04:15<1 day, 9:11:05] +[titan] 2025-09-09 09:40:15,140 - root - INFO - step: 21885 loss: 2.8201 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.8154 global_avg_top_loss: 2.0047 +[titan] 2025-09-09 09:40:15,141 - root - INFO - lr: 9.8002e-06 gnorm: 0.38 [1 day, 16:04:47<1 day, 9:10:31] +[titan] 2025-09-09 09:40:46,740 - root - INFO - step: 21890 loss: 2.7615 memory: 122.03GiB(87.57%) tps: 10,370 tflops: 494.23 mfu: 49.97% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 09:40:46,740 - root - INFO - lr: 9.7966e-06 gnorm: 0.34 [1 day, 16:05:19<1 day, 9:09:57] +[titan] 2025-09-09 09:41:18,569 - root - INFO - step: 21895 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 09:41:18,569 - root - INFO - lr: 9.7931e-06 gnorm: 0.32 [1 day, 16:05:50<1 day, 9:09:23] +[titan] 2025-09-09 09:41:43,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:41:50,384 - root - INFO - step: 21900 loss: 2.7632 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 09:41:50,384 - root - INFO - lr: 9.7896e-06 gnorm: 0.36 [1 day, 16:06:22<1 day, 9:08:50] +[titan] 2025-09-09 09:42:22,286 - root - INFO - step: 21905 loss: 2.6902 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7564 global_avg_top_loss: 1.9338 +[titan] 2025-09-09 09:42:22,286 - root - INFO - lr: 9.7860e-06 gnorm: 0.34 [1 day, 16:06:54<1 day, 9:08:16] +[titan] 2025-09-09 09:42:54,215 - root - INFO - step: 21910 loss: 2.8081 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 2.0017 +[titan] 2025-09-09 09:42:54,215 - root - INFO - lr: 9.7825e-06 gnorm: 0.32 [1 day, 16:07:26<1 day, 9:07:42] +[titan] 2025-09-09 09:43:26,138 - root - INFO - step: 21915 loss: 3.2417 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 1.0618 global_avg_top_loss: 2.1799 +[titan] 2025-09-09 09:43:26,139 - root - INFO - lr: 9.7789e-06 gnorm: 0.38 [1 day, 16:07:58<1 day, 9:07:08] +[titan] 2025-09-09 09:43:57,814 - root - INFO - step: 21920 loss: 2.8170 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.04 mfu: 49.85% global_avg_ntp_loss: 0.8148 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 09:43:57,815 - root - INFO - lr: 9.7754e-06 gnorm: 0.33 [1 day, 16:08:30<1 day, 9:06:34] +[titan] 2025-09-09 09:44:29,578 - root - INFO - step: 21925 loss: 2.7314 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 09:44:29,579 - root - INFO - lr: 9.7719e-06 gnorm: 0.32 [1 day, 16:09:01<1 day, 9:06:00] +[titan] 2025-09-09 09:45:01,467 - root - INFO - step: 21930 loss: 3.2231 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 1.0525 
global_avg_top_loss: 2.1706 +[titan] 2025-09-09 09:45:01,468 - root - INFO - lr: 9.7683e-06 gnorm: 0.37 [1 day, 16:09:33<1 day, 9:05:26] +[titan] 2025-09-09 09:45:33,163 - root - INFO - step: 21935 loss: 2.6797 memory: 122.03GiB(87.57%) tps: 10,339 tflops: 492.73 mfu: 49.82% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9263 +[titan] 2025-09-09 09:45:33,164 - root - INFO - lr: 9.7648e-06 gnorm: 0.32 [1 day, 16:10:05<1 day, 9:04:52] +[titan] 2025-09-09 09:46:04,882 - root - INFO - step: 21940 loss: 2.6642 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.37 mfu: 49.79% global_avg_ntp_loss: 0.7409 global_avg_top_loss: 1.9233 +[titan] 2025-09-09 09:46:04,882 - root - INFO - lr: 9.7613e-06 gnorm: 0.36 [1 day, 16:10:37<1 day, 9:04:18] +[titan] 2025-09-09 09:46:36,664 - root - INFO - step: 21945 loss: 3.2113 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.40 mfu: 49.69% global_avg_ntp_loss: 1.0472 global_avg_top_loss: 2.1641 +[titan] 2025-09-09 09:46:36,664 - root - INFO - lr: 9.7577e-06 gnorm: 0.42 [1 day, 16:11:09<1 day, 9:03:44] +[titan] 2025-09-09 09:47:02,172 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:47:08,577 - root - INFO - step: 21950 loss: 2.8186 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.8139 global_avg_top_loss: 2.0048 +[titan] 2025-09-09 09:47:08,577 - root - INFO - lr: 9.7542e-06 gnorm: 0.33 [1 day, 16:11:40<1 day, 9:03:11] +[titan] 2025-09-09 09:47:40,404 - root - INFO - step: 21955 loss: 2.8415 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.69 mfu: 49.61% global_avg_ntp_loss: 0.8217 global_avg_top_loss: 2.0198 +[titan] 2025-09-09 09:47:40,405 - root - INFO - lr: 9.7507e-06 gnorm: 0.35 [1 day, 16:12:12<1 day, 9:02:37] +[titan] 2025-09-09 09:48:12,411 - root - INFO - step: 21960 loss: 2.7756 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9810 +[titan] 2025-09-09 09:48:12,411 - root - INFO - lr: 9.7471e-06 gnorm: 0.40 [1 day, 16:12:44<1 day, 9:02:03] +[titan] 2025-09-09 09:48:44,356 - root - INFO - step: 21965 loss: 2.7766 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9814 +[titan] 2025-09-09 09:48:44,357 - root - INFO - lr: 9.7436e-06 gnorm: 0.34 [1 day, 16:13:16<1 day, 9:01:29] +[titan] 2025-09-09 09:49:16,244 - root - INFO - step: 21970 loss: 2.8347 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.77 mfu: 49.52% global_avg_ntp_loss: 0.8210 global_avg_top_loss: 2.0137 +[titan] 2025-09-09 09:49:16,244 - root - INFO - lr: 9.7401e-06 gnorm: 0.33 [1 day, 16:13:48<1 day, 9:00:55] +[titan] 2025-09-09 09:49:48,135 - root - INFO - step: 21975 loss: 2.8245 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8236 global_avg_top_loss: 2.0009 +[titan] 2025-09-09 09:49:48,135 - root - INFO - lr: 9.7365e-06 gnorm: 0.41 [1 day, 16:14:20<1 day, 9:00:21] +[titan] 2025-09-09 09:50:19,972 - root - INFO - step: 21980 loss: 2.7653 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.54 mfu: 49.60% global_avg_ntp_loss: 0.7893 
global_avg_top_loss: 1.9760 +[titan] 2025-09-09 09:50:19,973 - root - INFO - lr: 9.7330e-06 gnorm: 0.32 [1 day, 16:14:52<1 day, 8:59:48] +[titan] 2025-09-09 09:50:51,599 - root - INFO - step: 21985 loss: 2.7592 memory: 122.03GiB(87.57%) tps: 10,361 tflops: 493.80 mfu: 49.93% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 09:50:51,600 - root - INFO - lr: 9.7294e-06 gnorm: 0.34 [1 day, 16:15:24<1 day, 8:59:13] +[titan] 2025-09-09 09:51:23,425 - root - INFO - step: 21990 loss: 2.7158 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.7660 global_avg_top_loss: 1.9497 +[titan] 2025-09-09 09:51:23,426 - root - INFO - lr: 9.7259e-06 gnorm: 0.34 [1 day, 16:15:55<1 day, 8:58:40] +[titan] 2025-09-09 09:51:55,246 - root - INFO - step: 21995 loss: 3.1904 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.79 mfu: 49.63% global_avg_ntp_loss: 1.0343 global_avg_top_loss: 2.1560 +[titan] 2025-09-09 09:51:55,247 - root - INFO - lr: 9.7224e-06 gnorm: 0.40 [1 day, 16:16:27<1 day, 8:58:06] +[titan] 2025-09-09 09:52:20,945 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:52:27,529 - root - INFO - step: 22000 loss: 2.7668 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.77 mfu: 48.91% global_avg_ntp_loss: 0.7902 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 09:52:27,530 - root - INFO - lr: 9.7188e-06 gnorm: 0.35 [1 day, 16:16:59<1 day, 8:57:32] +[titan] 2025-09-09 09:52:59,568 - root - INFO - step: 22005 loss: 2.7883 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.8005 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 09:52:59,568 - root - INFO - lr: 9.7153e-06 gnorm: 0.33 [1 day, 16:17:31<1 day, 8:56:58] +[titan] 2025-09-09 09:53:31,448 - root - INFO - step: 22010 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.8015 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 09:53:31,449 - root - INFO - lr: 9.7118e-06 gnorm: 0.35 [1 day, 16:18:03<1 day, 8:56:25] +[titan] 2025-09-09 09:54:03,731 - root - INFO - step: 22015 loss: 2.7549 memory: 122.03GiB(87.57%) tps: 10,151 tflops: 483.77 mfu: 48.91% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 09:54:03,732 - root - INFO - lr: 9.7082e-06 gnorm: 0.33 [1 day, 16:18:36<1 day, 8:55:51] +[titan] 2025-09-09 09:54:10,340 - root - INFO - Dumping profiler traces at step 22016 +[titan] 2025-09-09 09:54:10,391 - root - INFO - Finished dumping profiler traces in 0.05 seconds +[titan] 2025-09-09 09:54:35,687 - root - INFO - step: 22020 loss: 2.7375 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9605 +[titan] 2025-09-09 09:54:35,687 - root - INFO - lr: 9.7047e-06 gnorm: 0.35 [1 day, 16:19:08<1 day, 8:55:17] +[titan] 2025-09-09 09:55:07,657 - root - INFO - step: 22025 loss: 3.2441 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 1.0641 global_avg_top_loss: 2.1800 +[titan] 2025-09-09 09:55:07,657 - root - INFO - lr: 9.7012e-06 gnorm: 0.34 [1 day, 16:19:40<1 day, 
8:54:44] +[titan] 2025-09-09 09:55:39,861 - root - INFO - step: 22030 loss: 2.7467 memory: 122.03GiB(87.57%) tps: 10,176 tflops: 484.96 mfu: 49.04% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9603 +[titan] 2025-09-09 09:55:39,861 - root - INFO - lr: 9.6976e-06 gnorm: 0.33 [1 day, 16:20:12<1 day, 8:54:10] +[titan] 2025-09-09 09:56:11,601 - root - INFO - step: 22035 loss: 2.8318 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.04 mfu: 49.75% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 2.0142 +[titan] 2025-09-09 09:56:11,601 - root - INFO - lr: 9.6941e-06 gnorm: 0.33 [1 day, 16:20:43<1 day, 8:53:36] +[titan] 2025-09-09 09:56:43,556 - root - INFO - step: 22040 loss: 2.7866 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7991 global_avg_top_loss: 1.9874 +[titan] 2025-09-09 09:56:43,556 - root - INFO - lr: 9.6906e-06 gnorm: 0.34 [1 day, 16:21:15<1 day, 8:53:02] +[titan] 2025-09-09 09:57:15,359 - root - INFO - step: 22045 loss: 2.7100 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9426 +[titan] 2025-09-09 09:57:15,360 - root - INFO - lr: 9.6871e-06 gnorm: 0.34 [1 day, 16:21:47<1 day, 8:52:28] +[titan] 2025-09-09 09:57:40,940 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 09:57:47,275 - root - INFO - step: 22050 loss: 2.7349 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 09:57:47,275 - root - INFO - lr: 9.6835e-06 gnorm: 0.35 [1 day, 16:22:19<1 day, 8:51:55] +[titan] 2025-09-09 09:58:19,127 - root - INFO - step: 22055 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.7710 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 09:58:19,128 - root - INFO - lr: 9.6800e-06 gnorm: 0.37 [1 day, 16:22:51<1 day, 8:51:21] +[titan] 2025-09-09 09:58:50,998 - root - INFO - step: 22060 loss: 2.7477 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9634 +[titan] 2025-09-09 09:58:50,998 - root - INFO - lr: 9.6765e-06 gnorm: 0.35 [1 day, 16:23:23<1 day, 8:50:47] +[titan] 2025-09-09 09:59:22,838 - root - INFO - step: 22065 loss: 2.7699 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9789 +[titan] 2025-09-09 09:59:22,838 - root - INFO - lr: 9.6729e-06 gnorm: 0.34 [1 day, 16:23:55<1 day, 8:50:13] +[titan] 2025-09-09 09:59:54,827 - root - INFO - step: 22070 loss: 2.7495 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9624 +[titan] 2025-09-09 09:59:54,827 - root - INFO - lr: 9.6694e-06 gnorm: 0.38 [1 day, 16:24:27<1 day, 8:49:39] +[titan] 2025-09-09 10:00:26,935 - root - INFO - step: 22075 loss: 3.2809 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 1.0810 global_avg_top_loss: 2.1999 +[titan] 2025-09-09 10:00:26,935 - root - INFO - lr: 9.6659e-06 gnorm: 0.33 [1 day, 16:24:59<1 day, 8:49:06] +[titan] 2025-09-09 10:00:58,596 - root - INFO - step: 22080 loss: 2.8153 memory: 122.03GiB(87.57%) tps: 10,350 tflops: 493.27 mfu: 49.88% global_avg_ntp_loss: 0.8120 
global_avg_top_loss: 2.0034 +[titan] 2025-09-09 10:00:58,596 - root - INFO - lr: 9.6623e-06 gnorm: 0.35 [1 day, 16:25:30<1 day, 8:48:32] +[titan] 2025-09-09 10:01:30,590 - root - INFO - step: 22085 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9649 +[titan] 2025-09-09 10:01:30,590 - root - INFO - lr: 9.6588e-06 gnorm: 0.33 [1 day, 16:26:02<1 day, 8:47:58] +[titan] 2025-09-09 10:02:02,362 - root - INFO - step: 22090 loss: 2.7389 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.55 mfu: 49.70% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9635 +[titan] 2025-09-09 10:02:02,362 - root - INFO - lr: 9.6553e-06 gnorm: 0.34 [1 day, 16:26:34<1 day, 8:47:24] +[titan] 2025-09-09 10:02:34,290 - root - INFO - step: 22095 loss: 2.7602 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9711 +[titan] 2025-09-09 10:02:34,291 - root - INFO - lr: 9.6517e-06 gnorm: 0.34 [1 day, 16:27:06<1 day, 8:46:50] +[titan] 2025-09-09 10:02:59,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:03:06,074 - root - INFO - step: 22100 loss: 2.7562 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9690 +[titan] 2025-09-09 10:03:06,075 - root - INFO - lr: 9.6482e-06 gnorm: 0.36 [1 day, 16:27:38<1 day, 8:46:16] +[titan] 2025-09-09 10:03:37,842 - root - INFO - step: 22105 loss: 3.1691 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.61 mfu: 49.71% global_avg_ntp_loss: 1.0253 global_avg_top_loss: 2.1439 +[titan] 2025-09-09 10:03:37,842 - root - INFO - lr: 9.6447e-06 gnorm: 0.37 [1 day, 16:28:10<1 day, 8:45:42] +[titan] 2025-09-09 10:04:09,641 - root - INFO - step: 22110 loss: 2.8558 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.13 mfu: 49.66% global_avg_ntp_loss: 0.8291 global_avg_top_loss: 2.0267 +[titan] 2025-09-09 10:04:09,641 - root - INFO - lr: 9.6412e-06 gnorm: 0.33 [1 day, 16:28:42<1 day, 8:45:08] +[titan] 2025-09-09 10:04:41,365 - root - INFO - step: 22115 loss: 2.6868 memory: 122.03GiB(87.57%) tps: 10,329 tflops: 492.28 mfu: 49.78% global_avg_ntp_loss: 0.7539 global_avg_top_loss: 1.9328 +[titan] 2025-09-09 10:04:41,366 - root - INFO - lr: 9.6376e-06 gnorm: 0.37 [1 day, 16:29:13<1 day, 8:44:35] +[titan] 2025-09-09 10:05:13,222 - root - INFO - step: 22120 loss: 3.3268 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 1.1003 global_avg_top_loss: 2.2265 +[titan] 2025-09-09 10:05:13,222 - root - INFO - lr: 9.6341e-06 gnorm: 0.64 [1 day, 16:29:45<1 day, 8:44:01] +[titan] 2025-09-09 10:05:45,320 - root - INFO - step: 22125 loss: 2.7736 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7928 global_avg_top_loss: 1.9808 +[titan] 2025-09-09 10:05:45,320 - root - INFO - lr: 9.6306e-06 gnorm: 0.33 [1 day, 16:30:17<1 day, 8:43:27] +[titan] 2025-09-09 10:06:17,222 - root - INFO - step: 22130 loss: 2.8568 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.8254 
global_avg_top_loss: 2.0314 +[titan] 2025-09-09 10:06:17,222 - root - INFO - lr: 9.6270e-06 gnorm: 0.43 [1 day, 16:30:49<1 day, 8:42:53] +[titan] 2025-09-09 10:06:48,942 - root - INFO - step: 22135 loss: 2.7616 memory: 122.03GiB(87.57%) tps: 10,331 tflops: 492.35 mfu: 49.78% global_avg_ntp_loss: 0.7892 global_avg_top_loss: 1.9724 +[titan] 2025-09-09 10:06:48,943 - root - INFO - lr: 9.6235e-06 gnorm: 0.35 [1 day, 16:31:21<1 day, 8:42:19] +[titan] 2025-09-09 10:07:20,631 - root - INFO - step: 22140 loss: 2.7316 memory: 122.03GiB(87.57%) tps: 10,341 tflops: 492.85 mfu: 49.83% global_avg_ntp_loss: 0.7714 global_avg_top_loss: 1.9602 +[titan] 2025-09-09 10:07:20,631 - root - INFO - lr: 9.6200e-06 gnorm: 0.34 [1 day, 16:31:53<1 day, 8:41:45] +[titan] 2025-09-09 10:07:52,503 - root - INFO - step: 22145 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.54% global_avg_ntp_loss: 0.8221 global_avg_top_loss: 2.0190 +[titan] 2025-09-09 10:07:52,504 - root - INFO - lr: 9.6165e-06 gnorm: 0.33 [1 day, 16:32:24<1 day, 8:41:11] +[titan] 2025-09-09 10:08:18,032 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:08:24,541 - root - INFO - step: 22150 loss: 2.6662 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.7439 global_avg_top_loss: 1.9223 +[titan] 2025-09-09 10:08:24,541 - root - INFO - lr: 9.6129e-06 gnorm: 0.33 [1 day, 16:32:56<1 day, 8:40:38] +[titan] 2025-09-09 10:08:56,313 - root - INFO - step: 22155 loss: 2.7910 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.55 mfu: 49.70% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 10:08:56,314 - root - INFO - lr: 9.6094e-06 gnorm: 0.34 [1 day, 16:33:28<1 day, 8:40:04] +[titan] 2025-09-09 10:09:28,146 - root - INFO - step: 22160 loss: 2.7642 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9755 +[titan] 2025-09-09 10:09:28,146 - root - INFO - lr: 9.6059e-06 gnorm: 0.34 [1 day, 16:34:00<1 day, 8:39:30] +[titan] 2025-09-09 10:10:00,154 - root - INFO - step: 22165 loss: 2.7905 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.8073 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 10:10:00,154 - root - INFO - lr: 9.6024e-06 gnorm: 0.33 [1 day, 16:34:32<1 day, 8:38:56] +[titan] 2025-09-09 10:10:31,835 - root - INFO - step: 22170 loss: 2.7554 memory: 122.03GiB(87.57%) tps: 10,343 tflops: 492.95 mfu: 49.84% global_avg_ntp_loss: 0.7862 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 10:10:31,835 - root - INFO - lr: 9.5988e-06 gnorm: 0.32 [1 day, 16:35:04<1 day, 8:38:22] +[titan] 2025-09-09 10:11:03,637 - root - INFO - step: 22175 loss: 2.7585 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.08 mfu: 49.65% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 10:11:03,638 - root - INFO - lr: 9.5953e-06 gnorm: 0.34 [1 day, 16:35:36<1 day, 8:37:48] +[titan] 2025-09-09 10:11:35,566 - root - INFO - step: 22180 loss: 2.8919 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.8649 
global_avg_top_loss: 2.0270 +[titan] 2025-09-09 10:11:35,566 - root - INFO - lr: 9.5918e-06 gnorm: 0.34 [1 day, 16:36:07<1 day, 8:37:15] +[titan] 2025-09-09 10:12:07,435 - root - INFO - step: 22185 loss: 2.6708 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.05 mfu: 49.55% global_avg_ntp_loss: 0.7526 global_avg_top_loss: 1.9182 +[titan] 2025-09-09 10:12:07,435 - root - INFO - lr: 9.5882e-06 gnorm: 0.32 [1 day, 16:36:39<1 day, 8:36:41] +[titan] 2025-09-09 10:12:39,290 - root - INFO - step: 22190 loss: 2.7907 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.26 mfu: 49.57% global_avg_ntp_loss: 0.8036 global_avg_top_loss: 1.9871 +[titan] 2025-09-09 10:12:39,291 - root - INFO - lr: 9.5847e-06 gnorm: 0.36 [1 day, 16:37:11<1 day, 8:36:07] +[titan] 2025-09-09 10:13:11,256 - root - INFO - step: 22195 loss: 2.8944 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.8586 global_avg_top_loss: 2.0359 +[titan] 2025-09-09 10:13:11,256 - root - INFO - lr: 9.5812e-06 gnorm: 0.34 [1 day, 16:37:43<1 day, 8:35:33] +[titan] 2025-09-09 10:13:36,672 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:13:43,040 - root - INFO - step: 22200 loss: 2.7918 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.37 mfu: 49.68% global_avg_ntp_loss: 0.8020 global_avg_top_loss: 1.9898 +[titan] 2025-09-09 10:13:43,040 - root - INFO - lr: 9.5777e-06 gnorm: 0.36 [1 day, 16:38:15<1 day, 8:34:59] +[titan] 2025-09-09 10:14:14,772 - root - INFO - step: 22205 loss: 2.7796 memory: 122.03GiB(87.57%) tps: 10,327 tflops: 492.16 mfu: 49.76% global_avg_ntp_loss: 0.7992 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 10:14:14,773 - root - INFO - lr: 9.5741e-06 gnorm: 0.39 [1 day, 16:38:47<1 day, 8:34:25] +[titan] 2025-09-09 10:14:46,616 - root - INFO - step: 22210 loss: 2.7982 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.8021 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 10:14:46,617 - root - INFO - lr: 9.5706e-06 gnorm: 0.34 [1 day, 16:39:18<1 day, 8:33:52] +[titan] 2025-09-09 10:15:18,345 - root - INFO - step: 22215 loss: 2.6928 memory: 122.03GiB(87.57%) tps: 10,328 tflops: 492.22 mfu: 49.77% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9379 +[titan] 2025-09-09 10:15:18,346 - root - INFO - lr: 9.5671e-06 gnorm: 0.33 [1 day, 16:39:50<1 day, 8:33:18] +[titan] 2025-09-09 10:15:50,299 - root - INFO - step: 22220 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 1.9861 +[titan] 2025-09-09 10:15:50,299 - root - INFO - lr: 9.5636e-06 gnorm: 0.34 [1 day, 16:40:22<1 day, 8:32:44] +[titan] 2025-09-09 10:16:22,115 - root - INFO - step: 22225 loss: 2.8799 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.87 mfu: 49.63% global_avg_ntp_loss: 0.8419 global_avg_top_loss: 2.0380 +[titan] 2025-09-09 10:16:22,115 - root - INFO - lr: 9.5600e-06 gnorm: 0.86 [1 day, 16:40:54<1 day, 8:32:10] +[titan] 2025-09-09 10:16:53,932 - root - INFO - step: 22230 loss: 2.7991 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.86 mfu: 49.63% global_avg_ntp_loss: 0.8039 
global_avg_top_loss: 1.9952 +[titan] 2025-09-09 10:16:53,932 - root - INFO - lr: 9.5565e-06 gnorm: 0.34 [1 day, 16:41:26<1 day, 8:31:36] +[titan] 2025-09-09 10:17:25,893 - root - INFO - step: 22235 loss: 3.1268 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.9946 global_avg_top_loss: 2.1322 +[titan] 2025-09-09 10:17:25,893 - root - INFO - lr: 9.5530e-06 gnorm: 0.40 [1 day, 16:41:58<1 day, 8:31:03] +[titan] 2025-09-09 10:17:57,780 - root - INFO - step: 22240 loss: 2.7001 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 10:17:57,780 - root - INFO - lr: 9.5495e-06 gnorm: 0.40 [1 day, 16:42:30<1 day, 8:30:29] +[titan] 2025-09-09 10:18:29,767 - root - INFO - step: 22245 loss: 2.8551 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.8313 global_avg_top_loss: 2.0238 +[titan] 2025-09-09 10:18:29,767 - root - INFO - lr: 9.5460e-06 gnorm: 0.36 [1 day, 16:43:02<1 day, 8:29:55] +[titan] 2025-09-09 10:18:55,315 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:19:01,713 - root - INFO - step: 22250 loss: 2.8191 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.8132 global_avg_top_loss: 2.0059 +[titan] 2025-09-09 10:19:01,713 - root - INFO - lr: 9.5424e-06 gnorm: 0.36 [1 day, 16:43:34<1 day, 8:29:21] +[titan] 2025-09-09 10:19:33,612 - root - INFO - step: 22255 loss: 2.7885 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.8026 global_avg_top_loss: 1.9859 +[titan] 2025-09-09 10:19:33,613 - root - INFO - lr: 9.5389e-06 gnorm: 0.33 [1 day, 16:44:05<1 day, 8:28:47] +[titan] 2025-09-09 10:20:05,773 - root - INFO - step: 22260 loss: 2.7400 memory: 122.03GiB(87.57%) tps: 10,189 tflops: 485.60 mfu: 49.10% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 10:20:05,774 - root - INFO - lr: 9.5354e-06 gnorm: 0.34 [1 day, 16:44:38<1 day, 8:28:14] +[titan] 2025-09-09 10:20:37,760 - root - INFO - step: 22265 loss: 2.7462 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7826 global_avg_top_loss: 1.9636 +[titan] 2025-09-09 10:20:37,761 - root - INFO - lr: 9.5319e-06 gnorm: 0.34 [1 day, 16:45:10<1 day, 8:27:40] +[titan] 2025-09-09 10:21:09,496 - root - INFO - step: 22270 loss: 2.7766 memory: 122.03GiB(87.57%) tps: 10,326 tflops: 492.11 mfu: 49.76% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9823 +[titan] 2025-09-09 10:21:09,496 - root - INFO - lr: 9.5283e-06 gnorm: 0.36 [1 day, 16:45:41<1 day, 8:27:06] +[titan] 2025-09-09 10:21:41,427 - root - INFO - step: 22275 loss: 2.8120 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.8139 global_avg_top_loss: 1.9981 +[titan] 2025-09-09 10:21:41,427 - root - INFO - lr: 9.5248e-06 gnorm: 0.33 [1 day, 16:46:13<1 day, 8:26:33] +[titan] 2025-09-09 10:22:13,141 - root - INFO - step: 22280 loss: 2.7896 memory: 122.03GiB(87.57%) tps: 10,333 tflops: 492.45 mfu: 49.79% global_avg_ntp_loss: 0.8030 
global_avg_top_loss: 1.9865 +[titan] 2025-09-09 10:22:13,141 - root - INFO - lr: 9.5213e-06 gnorm: 0.34 [1 day, 16:46:45<1 day, 8:25:59] +[titan] 2025-09-09 10:22:44,944 - root - INFO - step: 22285 loss: 3.1378 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.9973 global_avg_top_loss: 2.1405 +[titan] 2025-09-09 10:22:44,945 - root - INFO - lr: 9.5178e-06 gnorm: 0.34 [1 day, 16:47:17<1 day, 8:25:25] +[titan] 2025-09-09 10:23:16,856 - root - INFO - step: 22290 loss: 2.7761 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9813 +[titan] 2025-09-09 10:23:16,857 - root - INFO - lr: 9.5143e-06 gnorm: 0.37 [1 day, 16:47:49<1 day, 8:24:51] +[titan] 2025-09-09 10:23:48,837 - root - INFO - step: 22295 loss: 3.2144 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 1.0470 global_avg_top_loss: 2.1675 +[titan] 2025-09-09 10:23:48,838 - root - INFO - lr: 9.5107e-06 gnorm: 0.35 [1 day, 16:48:21<1 day, 8:24:17] +[titan] 2025-09-09 10:24:14,316 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:24:20,731 - root - INFO - step: 22300 loss: 2.7452 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 10:24:20,732 - root - INFO - lr: 9.5072e-06 gnorm: 1.10 [1 day, 16:48:53<1 day, 8:23:44] +[titan] 2025-09-09 10:24:52,942 - root - INFO - step: 22305 loss: 2.8242 memory: 122.03GiB(87.57%) tps: 10,173 tflops: 484.85 mfu: 49.02% global_avg_ntp_loss: 0.8184 global_avg_top_loss: 2.0058 +[titan] 2025-09-09 10:24:52,942 - root - INFO - lr: 9.5037e-06 gnorm: 0.39 [1 day, 16:49:25<1 day, 8:23:10] +[titan] 2025-09-09 10:25:24,735 - root - INFO - step: 22310 loss: 3.0296 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.22 mfu: 49.67% global_avg_ntp_loss: 0.9360 global_avg_top_loss: 2.0937 +[titan] 2025-09-09 10:25:24,736 - root - INFO - lr: 9.5002e-06 gnorm: 0.35 [1 day, 16:49:57<1 day, 8:22:36] +[titan] 2025-09-09 10:25:56,775 - root - INFO - step: 22315 loss: 2.7146 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9417 +[titan] 2025-09-09 10:25:56,775 - root - INFO - lr: 9.4967e-06 gnorm: 0.38 [1 day, 16:50:29<1 day, 8:22:02] +[titan] 2025-09-09 10:26:28,848 - root - INFO - step: 22320 loss: 2.7762 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 10:26:28,848 - root - INFO - lr: 9.4931e-06 gnorm: 0.35 [1 day, 16:51:01<1 day, 8:21:29] +[titan] 2025-09-09 10:27:00,585 - root - INFO - step: 22325 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.09 mfu: 49.76% global_avg_ntp_loss: 0.7834 global_avg_top_loss: 1.9685 +[titan] 2025-09-09 10:27:00,586 - root - INFO - lr: 9.4896e-06 gnorm: 0.35 [1 day, 16:51:32<1 day, 8:20:55] +[titan] 2025-09-09 10:27:32,400 - root - INFO - step: 22330 loss: 2.7717 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.64% global_avg_ntp_loss: 0.7958 
global_avg_top_loss: 1.9759 +[titan] 2025-09-09 10:27:32,400 - root - INFO - lr: 9.4861e-06 gnorm: 0.35 [1 day, 16:52:04<1 day, 8:20:21] +[titan] 2025-09-09 10:28:04,253 - root - INFO - step: 22335 loss: 2.6690 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.57% global_avg_ntp_loss: 0.7426 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 10:28:04,253 - root - INFO - lr: 9.4826e-06 gnorm: 1.22 [1 day, 16:52:36<1 day, 8:19:47] +[titan] 2025-09-09 10:28:36,221 - root - INFO - step: 22340 loss: 2.7836 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.8001 global_avg_top_loss: 1.9835 +[titan] 2025-09-09 10:28:36,222 - root - INFO - lr: 9.4791e-06 gnorm: 0.33 [1 day, 16:53:08<1 day, 8:19:14] +[titan] 2025-09-09 10:29:08,159 - root - INFO - step: 22345 loss: 2.9081 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.8610 global_avg_top_loss: 2.0471 +[titan] 2025-09-09 10:29:08,159 - root - INFO - lr: 9.4755e-06 gnorm: 0.34 [1 day, 16:53:40<1 day, 8:18:40] +[titan] 2025-09-09 10:29:33,594 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:29:40,005 - root - INFO - step: 22350 loss: 2.8288 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.8207 global_avg_top_loss: 2.0080 +[titan] 2025-09-09 10:29:40,006 - root - INFO - lr: 9.4720e-06 gnorm: 0.34 [1 day, 16:54:12<1 day, 8:18:06] +[titan] 2025-09-09 10:30:11,980 - root - INFO - step: 22355 loss: 2.7864 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7995 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 10:30:11,980 - root - INFO - lr: 9.4685e-06 gnorm: 0.33 [1 day, 16:54:44<1 day, 8:17:32] +[titan] 2025-09-09 10:30:44,054 - root - INFO - step: 22360 loss: 2.8197 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.8168 global_avg_top_loss: 2.0029 +[titan] 2025-09-09 10:30:44,055 - root - INFO - lr: 9.4650e-06 gnorm: 0.34 [1 day, 16:55:16<1 day, 8:16:59] +[titan] 2025-09-09 10:31:15,806 - root - INFO - step: 22365 loss: 3.1971 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 1.0403 global_avg_top_loss: 2.1568 +[titan] 2025-09-09 10:31:15,806 - root - INFO - lr: 9.4615e-06 gnorm: 0.36 [1 day, 16:55:48<1 day, 8:16:25] +[titan] 2025-09-09 10:31:47,589 - root - INFO - step: 22370 loss: 2.7473 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.37 mfu: 49.68% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 10:31:47,590 - root - INFO - lr: 9.4580e-06 gnorm: 0.39 [1 day, 16:56:19<1 day, 8:15:51] +[titan] 2025-09-09 10:32:19,470 - root - INFO - step: 22375 loss: 3.7100 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 1.3271 global_avg_top_loss: 2.3829 +[titan] 2025-09-09 10:32:19,471 - root - INFO - lr: 9.4544e-06 gnorm: 0.36 [1 day, 16:56:51<1 day, 8:15:17] +[titan] 2025-09-09 10:32:51,533 - root - INFO - step: 22380 loss: 2.7663 memory: 122.03GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.7923 
global_avg_top_loss: 1.9740 +[titan] 2025-09-09 10:32:51,534 - root - INFO - lr: 9.4509e-06 gnorm: 0.35 [1 day, 16:57:23<1 day, 8:14:44] +[titan] 2025-09-09 10:33:23,490 - root - INFO - step: 22385 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7823 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 10:33:23,491 - root - INFO - lr: 9.4474e-06 gnorm: 0.33 [1 day, 16:57:55<1 day, 8:14:10] +[titan] 2025-09-09 10:33:55,253 - root - INFO - step: 22390 loss: 2.8091 memory: 122.03GiB(87.57%) tps: 10,317 tflops: 491.69 mfu: 49.72% global_avg_ntp_loss: 0.8087 global_avg_top_loss: 2.0004 +[titan] 2025-09-09 10:33:55,254 - root - INFO - lr: 9.4439e-06 gnorm: 0.33 [1 day, 16:58:27<1 day, 8:13:36] +[titan] 2025-09-09 10:34:27,319 - root - INFO - step: 22395 loss: 2.7495 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 10:34:27,319 - root - INFO - lr: 9.4404e-06 gnorm: 0.34 [1 day, 16:58:59<1 day, 8:13:02] +[titan] 2025-09-09 10:34:52,743 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:34:59,120 - root - INFO - step: 22400 loss: 2.7576 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.7869 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 10:34:59,121 - root - INFO - lr: 9.4369e-06 gnorm: 0.38 [1 day, 16:59:31<1 day, 8:12:28] +[titan] 2025-09-09 10:35:31,054 - root - INFO - step: 22405 loss: 2.6767 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9222 +[titan] 2025-09-09 10:35:31,054 - root - INFO - lr: 9.4333e-06 gnorm: 0.35 [1 day, 17:00:03<1 day, 8:11:55] +[titan] 2025-09-09 10:36:02,956 - root - INFO - step: 22410 loss: 2.8015 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.8100 global_avg_top_loss: 1.9915 +[titan] 2025-09-09 10:36:02,957 - root - INFO - lr: 9.4298e-06 gnorm: 0.34 [1 day, 17:00:35<1 day, 8:11:21] +[titan] 2025-09-09 10:36:35,048 - root - INFO - step: 22415 loss: 2.6917 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9329 +[titan] 2025-09-09 10:36:35,048 - root - INFO - lr: 9.4263e-06 gnorm: 0.45 [1 day, 17:01:07<1 day, 8:10:47] +[titan] 2025-09-09 10:37:06,842 - root - INFO - step: 22420 loss: 2.6305 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.19 mfu: 49.67% global_avg_ntp_loss: 0.7283 global_avg_top_loss: 1.9022 +[titan] 2025-09-09 10:37:06,843 - root - INFO - lr: 9.4228e-06 gnorm: 0.38 [1 day, 17:01:39<1 day, 8:10:14] +[titan] 2025-09-09 10:37:38,716 - root - INFO - step: 22425 loss: 2.7768 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7934 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 10:37:38,716 - root - INFO - lr: 9.4193e-06 gnorm: 0.35 [1 day, 17:02:11<1 day, 8:09:40] +[titan] 2025-09-09 10:38:10,781 - root - INFO - step: 22430 loss: 2.7943 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.8103 
global_avg_top_loss: 1.9840 +[titan] 2025-09-09 10:38:10,782 - root - INFO - lr: 9.4158e-06 gnorm: 0.36 [1 day, 17:02:43<1 day, 8:09:06] +[titan] 2025-09-09 10:38:42,775 - root - INFO - step: 22435 loss: 2.7842 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7941 global_avg_top_loss: 1.9901 +[titan] 2025-09-09 10:38:42,776 - root - INFO - lr: 9.4123e-06 gnorm: 0.41 [1 day, 17:03:15<1 day, 8:08:32] +[titan] 2025-09-09 10:39:14,484 - root - INFO - step: 22440 loss: 2.7109 memory: 122.03GiB(87.57%) tps: 10,335 tflops: 492.54 mfu: 49.80% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9470 +[titan] 2025-09-09 10:39:14,484 - root - INFO - lr: 9.4087e-06 gnorm: 0.39 [1 day, 17:03:46<1 day, 8:07:59] +[titan] 2025-09-09 10:39:46,414 - root - INFO - step: 22445 loss: 3.2198 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 1.0513 global_avg_top_loss: 2.1685 +[titan] 2025-09-09 10:39:46,414 - root - INFO - lr: 9.4052e-06 gnorm: 0.35 [1 day, 17:04:18<1 day, 8:07:25] +[titan] 2025-09-09 10:40:11,863 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:40:18,265 - root - INFO - step: 22450 loss: 2.7975 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.33 mfu: 49.58% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 10:40:18,265 - root - INFO - lr: 9.4017e-06 gnorm: 0.33 [1 day, 17:04:50<1 day, 8:06:51] +[titan] 2025-09-09 10:40:50,369 - root - INFO - step: 22455 loss: 3.1519 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 1.0199 global_avg_top_loss: 2.1320 +[titan] 2025-09-09 10:40:50,369 - root - INFO - lr: 9.3982e-06 gnorm: 0.39 [1 day, 17:05:22<1 day, 8:06:17] +[titan] 2025-09-09 10:41:22,176 - root - INFO - step: 22460 loss: 2.7428 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 10:41:22,176 - root - INFO - lr: 9.3947e-06 gnorm: 0.34 [1 day, 17:05:54<1 day, 8:05:44] +[titan] 2025-09-09 10:41:54,086 - root - INFO - step: 22465 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9951 +[titan] 2025-09-09 10:41:54,086 - root - INFO - lr: 9.3912e-06 gnorm: 0.36 [1 day, 17:06:26<1 day, 8:05:10] +[titan] 2025-09-09 10:42:25,953 - root - INFO - step: 22470 loss: 3.2035 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 1.0400 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 10:42:25,954 - root - INFO - lr: 9.3877e-06 gnorm: 0.41 [1 day, 17:06:58<1 day, 8:04:36] +[titan] 2025-09-09 10:42:57,858 - root - INFO - step: 22475 loss: 2.7818 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 10:42:57,858 - root - INFO - lr: 9.3841e-06 gnorm: 0.32 [1 day, 17:07:30<1 day, 8:04:02] +[titan] 2025-09-09 10:43:29,724 - root - INFO - step: 22480 loss: 2.7182 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.56% global_avg_ntp_loss: 0.7633 
global_avg_top_loss: 1.9549 +[titan] 2025-09-09 10:43:29,724 - root - INFO - lr: 9.3806e-06 gnorm: 0.87 [1 day, 17:08:02<1 day, 8:03:29] +[titan] 2025-09-09 10:44:01,625 - root - INFO - step: 22485 loss: 2.6214 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.55 mfu: 49.50% global_avg_ntp_loss: 0.7243 global_avg_top_loss: 1.8971 +[titan] 2025-09-09 10:44:01,626 - root - INFO - lr: 9.3771e-06 gnorm: 0.38 [1 day, 17:08:33<1 day, 8:02:55] +[titan] 2025-09-09 10:44:33,371 - root - INFO - step: 22490 loss: 2.7821 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.96 mfu: 49.74% global_avg_ntp_loss: 0.7987 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 10:44:33,372 - root - INFO - lr: 9.3736e-06 gnorm: 0.34 [1 day, 17:09:05<1 day, 8:02:21] +[titan] 2025-09-09 10:45:05,613 - root - INFO - step: 22495 loss: 2.7225 memory: 122.03GiB(87.57%) tps: 10,164 tflops: 484.39 mfu: 48.98% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9540 +[titan] 2025-09-09 10:45:05,613 - root - INFO - lr: 9.3701e-06 gnorm: 0.33 [1 day, 17:09:37<1 day, 8:01:48] +[titan] 2025-09-09 10:45:31,000 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 10:45:37,393 - root - INFO - step: 22500 loss: 2.7479 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7795 global_avg_top_loss: 1.9683 +[titan] 2025-09-09 10:45:37,393 - root - INFO - lr: 9.3666e-06 gnorm: 0.35 [1 day, 17:10:09<1 day, 8:01:14] +[titan] 2025-09-09 10:46:09,297 - root - INFO - step: 22505 loss: 2.8345 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.8235 global_avg_top_loss: 2.0110 +[titan] 2025-09-09 10:46:09,297 - root - INFO - lr: 9.3631e-06 gnorm: 0.34 [1 day, 17:10:41<1 day, 8:00:40] +[titan] 2025-09-09 10:46:41,250 - root - INFO - step: 22510 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 10:46:41,250 - root - INFO - lr: 9.3596e-06 gnorm: 0.45 [1 day, 17:11:13<1 day, 8:00:06] +[titan] 2025-09-09 10:47:13,222 - root - INFO - step: 22515 loss: 2.8173 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.8121 global_avg_top_loss: 2.0052 +[titan] 2025-09-09 10:47:13,222 - root - INFO - lr: 9.3561e-06 gnorm: 0.37 [1 day, 17:11:45<1 day, 7:59:33] +[titan] 2025-09-09 10:47:45,148 - root - INFO - step: 22520 loss: 2.6888 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7589 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 10:47:45,148 - root - INFO - lr: 9.3526e-06 gnorm: 0.33 [1 day, 17:12:17<1 day, 7:58:59] +[titan] 2025-09-09 10:48:17,091 - root - INFO - step: 22525 loss: 3.2359 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 1.0577 global_avg_top_loss: 2.1781 +[titan] 2025-09-09 10:48:17,091 - root - INFO - lr: 9.3490e-06 gnorm: 0.33 [1 day, 17:12:49<1 day, 7:58:25] +[titan] 2025-09-09 10:48:36,555 - root - INFO - Dumping profiler traces at step 22528 +[titan] 2025-09-09 10:48:36,623 - root - INFO - Finished dumping profiler traces in 
0.07 seconds +[titan] 2025-09-09 10:48:49,411 - root - INFO - step: 22530 loss: 2.8530 memory: 122.03GiB(87.57%) tps: 10,139 tflops: 483.21 mfu: 48.86% global_avg_ntp_loss: 0.8325 global_avg_top_loss: 2.0205 +[titan] 2025-09-09 10:48:49,411 - root - INFO - lr: 9.3455e-06 gnorm: 0.33 [1 day, 17:13:21<1 day, 7:57:52] +[titan] 2025-09-09 10:49:21,402 - root - INFO - step: 22535 loss: 3.2439 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 1.0600 global_avg_top_loss: 2.1838 +[titan] 2025-09-09 10:49:21,403 - root - INFO - lr: 9.3420e-06 gnorm: 0.33 [1 day, 17:13:53<1 day, 7:57:18] +[titan] 2025-09-09 10:49:53,344 - root - INFO - step: 22540 loss: 2.6626 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.94 mfu: 49.44% global_avg_ntp_loss: 0.7424 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 10:49:53,344 - root - INFO - lr: 9.3385e-06 gnorm: 0.49 [1 day, 17:14:25<1 day, 7:56:44] +[titan] 2025-09-09 10:50:25,321 - root - INFO - step: 22545 loss: 2.7366 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 10:50:25,321 - root - INFO - lr: 9.3350e-06 gnorm: 0.38 [1 day, 17:14:57<1 day, 7:56:11] +[titan] 2025-09-09 10:50:50,664 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 10:50:57,066 - root - INFO - step: 22550 loss: 2.7777 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.96 mfu: 49.74% global_avg_ntp_loss: 0.7964 global_avg_top_loss: 1.9813 +[titan] 2025-09-09 10:50:57,066 - root - INFO - lr: 9.3315e-06 gnorm: 0.34 [1 day, 17:15:29<1 day, 7:55:37] +[titan] 2025-09-09 10:51:28,927 - root - INFO - step: 22555 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.8085 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 10:51:28,927 - root - INFO - lr: 9.3280e-06 gnorm: 0.34 [1 day, 17:16:01<1 day, 7:55:03] +[titan] 2025-09-09 10:52:00,706 - root - INFO - step: 22560 loss: 2.8238 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.44 mfu: 49.69% global_avg_ntp_loss: 0.8159 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 10:52:00,706 - root - INFO - lr: 9.3245e-06 gnorm: 0.35 [1 day, 17:16:32<1 day, 7:54:29] +[titan] 2025-09-09 10:52:32,649 - root - INFO - step: 22565 loss: 2.7995 memory: 122.03GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.8084 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 10:52:32,650 - root - INFO - lr: 9.3210e-06 gnorm: 0.33 [1 day, 17:17:04<1 day, 7:53:56] +[titan] 2025-09-09 10:53:04,351 - root - INFO - step: 22570 loss: 2.7344 memory: 122.03GiB(87.57%) tps: 10,337 tflops: 492.64 mfu: 49.81% global_avg_ntp_loss: 0.7736 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 10:53:04,351 - root - INFO - lr: 9.3175e-06 gnorm: 0.33 [1 day, 17:17:36<1 day, 7:53:22] +[titan] 2025-09-09 10:53:36,468 - root - INFO - step: 22575 loss: 2.7945 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.8036 global_avg_top_loss: 1.9910 +[titan] 2025-09-09 10:53:36,469 - root - INFO - lr: 9.3140e-06 gnorm: 0.34 [1 day, 17:18:08<1 day, 7:52:48] +[titan] 2025-09-09 10:54:08,276 - root - INFO - step: 22580 loss: 2.8650 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.99 mfu: 49.65% global_avg_ntp_loss: 0.8340 
global_avg_top_loss: 2.0310 +[titan] 2025-09-09 10:54:08,277 - root - INFO - lr: 9.3105e-06 gnorm: 0.37 [1 day, 17:18:40<1 day, 7:52:14] +[titan] 2025-09-09 10:54:40,256 - root - INFO - step: 22585 loss: 2.8324 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.8186 global_avg_top_loss: 2.0138 +[titan] 2025-09-09 10:54:40,257 - root - INFO - lr: 9.3069e-06 gnorm: 0.34 [1 day, 17:19:12<1 day, 7:51:41] +[titan] 2025-09-09 10:55:12,134 - root - INFO - step: 22590 loss: 2.8197 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.8118 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 10:55:12,134 - root - INFO - lr: 9.3034e-06 gnorm: 0.36 [1 day, 17:19:44<1 day, 7:51:07] +[titan] 2025-09-09 10:55:44,077 - root - INFO - step: 22595 loss: 2.8490 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.8302 global_avg_top_loss: 2.0188 +[titan] 2025-09-09 10:55:44,077 - root - INFO - lr: 9.2999e-06 gnorm: 0.33 [1 day, 17:20:16<1 day, 7:50:33] +[titan] 2025-09-09 10:56:09,586 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 10:56:15,911 - root - INFO - step: 22600 loss: 2.7900 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.7988 global_avg_top_loss: 1.9912 +[titan] 2025-09-09 10:56:15,912 - root - INFO - lr: 9.2964e-06 gnorm: 0.33 [1 day, 17:20:48<1 day, 7:49:59] +[titan] 2025-09-09 10:56:47,763 - root - INFO - step: 22605 loss: 3.3223 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 1.1002 global_avg_top_loss: 2.2221 +[titan] 2025-09-09 10:56:47,764 - root - INFO - lr: 9.2929e-06 gnorm: 0.32 [1 day, 17:21:20<1 day, 7:49:26] +[titan] 2025-09-09 10:57:19,784 - root - INFO - step: 22610 loss: 2.6639 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7431 global_avg_top_loss: 1.9208 +[titan] 2025-09-09 10:57:19,785 - root - INFO - lr: 9.2894e-06 gnorm: 0.33 [1 day, 17:21:52<1 day, 7:48:52] +[titan] 2025-09-09 10:57:51,810 - root - INFO - step: 22615 loss: 3.2190 memory: 122.03GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 1.0484 global_avg_top_loss: 2.1705 +[titan] 2025-09-09 10:57:51,811 - root - INFO - lr: 9.2859e-06 gnorm: 0.34 [1 day, 17:22:24<1 day, 7:48:18] +[titan] 2025-09-09 10:58:23,853 - root - INFO - step: 22620 loss: 2.6631 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7442 global_avg_top_loss: 1.9189 +[titan] 2025-09-09 10:58:23,853 - root - INFO - lr: 9.2824e-06 gnorm: 0.48 [1 day, 17:22:56<1 day, 7:47:45] +[titan] 2025-09-09 10:58:55,700 - root - INFO - step: 22625 loss: 2.6856 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9318 +[titan] 2025-09-09 10:58:55,701 - root - INFO - lr: 9.2789e-06 gnorm: 0.36 [1 day, 17:23:27<1 day, 7:47:11] +[titan] 2025-09-09 10:59:27,447 - root - INFO - step: 22630 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.94 mfu: 49.74% global_avg_ntp_loss: 0.7984 
global_avg_top_loss: 1.9769 +[titan] 2025-09-09 10:59:27,448 - root - INFO - lr: 9.2754e-06 gnorm: 0.34 [1 day, 17:23:59<1 day, 7:46:37] +[titan] 2025-09-09 10:59:59,425 - root - INFO - step: 22635 loss: 2.8106 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.8130 global_avg_top_loss: 1.9975 +[titan] 2025-09-09 10:59:59,426 - root - INFO - lr: 9.2719e-06 gnorm: 0.34 [1 day, 17:24:31<1 day, 7:46:04] +[titan] 2025-09-09 11:00:31,577 - root - INFO - step: 22640 loss: 2.7696 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.11% global_avg_ntp_loss: 0.7899 global_avg_top_loss: 1.9797 +[titan] 2025-09-09 11:00:31,577 - root - INFO - lr: 9.2684e-06 gnorm: 0.33 [1 day, 17:25:03<1 day, 7:45:30] +[titan] 2025-09-09 11:01:03,377 - root - INFO - step: 22645 loss: 2.8397 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 0.8273 global_avg_top_loss: 2.0124 +[titan] 2025-09-09 11:01:03,378 - root - INFO - lr: 9.2649e-06 gnorm: 0.35 [1 day, 17:25:35<1 day, 7:44:56] +[titan] 2025-09-09 11:01:28,758 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:01:35,230 - root - INFO - step: 22650 loss: 2.8608 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.57% global_avg_ntp_loss: 0.8327 global_avg_top_loss: 2.0280 +[titan] 2025-09-09 11:01:35,231 - root - INFO - lr: 9.2614e-06 gnorm: 0.35 [1 day, 17:26:07<1 day, 7:44:22] +[titan] 2025-09-09 11:02:07,070 - root - INFO - step: 22655 loss: 2.7537 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.50 mfu: 49.60% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9696 +[titan] 2025-09-09 11:02:07,070 - root - INFO - lr: 9.2579e-06 gnorm: 0.35 [1 day, 17:26:39<1 day, 7:43:49] +[titan] 2025-09-09 11:02:38,890 - root - INFO - step: 22660 loss: 2.7692 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.80 mfu: 49.63% global_avg_ntp_loss: 0.7933 global_avg_top_loss: 1.9759 +[titan] 2025-09-09 11:02:38,891 - root - INFO - lr: 9.2544e-06 gnorm: 0.33 [1 day, 17:27:11<1 day, 7:43:15] +[titan] 2025-09-09 11:03:10,840 - root - INFO - step: 22665 loss: 2.8392 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.8259 global_avg_top_loss: 2.0133 +[titan] 2025-09-09 11:03:10,840 - root - INFO - lr: 9.2509e-06 gnorm: 0.36 [1 day, 17:27:43<1 day, 7:42:41] +[titan] 2025-09-09 11:03:42,657 - root - INFO - step: 22670 loss: 2.7571 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9686 +[titan] 2025-09-09 11:03:42,657 - root - INFO - lr: 9.2474e-06 gnorm: 0.35 [1 day, 17:28:14<1 day, 7:42:07] +[titan] 2025-09-09 11:04:14,528 - root - INFO - step: 22675 loss: 2.7855 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 11:04:14,528 - root - INFO - lr: 9.2439e-06 gnorm: 0.34 [1 day, 17:28:46<1 day, 7:41:34] +[titan] 2025-09-09 11:04:46,139 - root - INFO - step: 22680 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,366 tflops: 494.04 mfu: 49.95% global_avg_ntp_loss: 0.7822 
global_avg_top_loss: 1.9685 +[titan] 2025-09-09 11:04:46,140 - root - INFO - lr: 9.2404e-06 gnorm: 0.34 [1 day, 17:29:18<1 day, 7:41:00] +[titan] 2025-09-09 11:05:18,104 - root - INFO - step: 22685 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 11:05:18,104 - root - INFO - lr: 9.2369e-06 gnorm: 0.34 [1 day, 17:29:50<1 day, 7:40:26] +[titan] 2025-09-09 11:05:49,931 - root - INFO - step: 22690 loss: 2.7801 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.70 mfu: 49.62% global_avg_ntp_loss: 0.8051 global_avg_top_loss: 1.9750 +[titan] 2025-09-09 11:05:49,931 - root - INFO - lr: 9.2334e-06 gnorm: 0.33 [1 day, 17:30:22<1 day, 7:39:52] +[titan] 2025-09-09 11:06:21,895 - root - INFO - step: 22695 loss: 2.7162 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9493 +[titan] 2025-09-09 11:06:21,895 - root - INFO - lr: 9.2299e-06 gnorm: 0.33 [1 day, 17:30:54<1 day, 7:39:19] +[titan] 2025-09-09 11:06:47,431 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:06:53,760 - root - INFO - step: 22700 loss: 2.8097 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.8060 global_avg_top_loss: 2.0037 +[titan] 2025-09-09 11:06:53,761 - root - INFO - lr: 9.2264e-06 gnorm: 0.34 [1 day, 17:31:26<1 day, 7:38:45] +[titan] 2025-09-09 11:07:25,749 - root - INFO - step: 22705 loss: 2.7664 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 11:07:25,750 - root - INFO - lr: 9.2229e-06 gnorm: 0.33 [1 day, 17:31:58<1 day, 7:38:11] +[titan] 2025-09-09 11:07:57,582 - root - INFO - step: 22710 loss: 2.7659 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.7872 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 11:07:57,583 - root - INFO - lr: 9.2194e-06 gnorm: 0.34 [1 day, 17:32:29<1 day, 7:37:38] +[titan] 2025-09-09 11:08:29,518 - root - INFO - step: 22715 loss: 2.6822 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7539 global_avg_top_loss: 1.9284 +[titan] 2025-09-09 11:08:29,518 - root - INFO - lr: 9.2159e-06 gnorm: 0.38 [1 day, 17:33:01<1 day, 7:37:04] +[titan] 2025-09-09 11:09:01,396 - root - INFO - step: 22720 loss: 2.7197 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7646 global_avg_top_loss: 1.9552 +[titan] 2025-09-09 11:09:01,396 - root - INFO - lr: 9.2124e-06 gnorm: 0.37 [1 day, 17:33:33<1 day, 7:36:30] +[titan] 2025-09-09 11:09:33,488 - root - INFO - step: 22725 loss: 2.7337 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9597 +[titan] 2025-09-09 11:09:33,488 - root - INFO - lr: 9.2089e-06 gnorm: 0.37 [1 day, 17:34:05<1 day, 7:35:57] +[titan] 2025-09-09 11:10:05,420 - root - INFO - step: 22730 loss: 2.7944 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.8004 
global_avg_top_loss: 1.9940 +[titan] 2025-09-09 11:10:05,421 - root - INFO - lr: 9.2054e-06 gnorm: 0.34 [1 day, 17:34:37<1 day, 7:35:23] +[titan] 2025-09-09 11:10:37,327 - root - INFO - step: 22735 loss: 2.8260 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.8181 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 11:10:37,327 - root - INFO - lr: 9.2019e-06 gnorm: 0.35 [1 day, 17:35:09<1 day, 7:34:49] +[titan] 2025-09-09 11:11:09,288 - root - INFO - step: 22740 loss: 2.7214 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 11:11:09,288 - root - INFO - lr: 9.1984e-06 gnorm: 0.34 [1 day, 17:35:41<1 day, 7:34:16] +[titan] 2025-09-09 11:11:41,418 - root - INFO - step: 22745 loss: 2.7742 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.7975 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 11:11:41,418 - root - INFO - lr: 9.1949e-06 gnorm: 0.34 [1 day, 17:36:13<1 day, 7:33:42] +[titan] 2025-09-09 11:12:06,924 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:12:13,283 - root - INFO - step: 22750 loss: 2.8156 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.10 mfu: 49.56% global_avg_ntp_loss: 0.8110 global_avg_top_loss: 2.0046 +[titan] 2025-09-09 11:12:13,284 - root - INFO - lr: 9.1914e-06 gnorm: 0.34 [1 day, 17:36:45<1 day, 7:33:08] +[titan] 2025-09-09 11:12:45,201 - root - INFO - step: 22755 loss: 2.6956 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7568 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 11:12:45,202 - root - INFO - lr: 9.1879e-06 gnorm: 0.38 [1 day, 17:37:17<1 day, 7:32:35] +[titan] 2025-09-09 11:13:16,846 - root - INFO - step: 22760 loss: 2.9061 memory: 122.03GiB(87.57%) tps: 10,355 tflops: 493.52 mfu: 49.90% global_avg_ntp_loss: 0.8713 global_avg_top_loss: 2.0348 +[titan] 2025-09-09 11:13:16,847 - root - INFO - lr: 9.1844e-06 gnorm: 0.33 [1 day, 17:37:49<1 day, 7:32:01] +[titan] 2025-09-09 11:13:48,720 - root - INFO - step: 22765 loss: 2.6836 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9277 +[titan] 2025-09-09 11:13:48,720 - root - INFO - lr: 9.1809e-06 gnorm: 0.34 [1 day, 17:38:20<1 day, 7:31:27] +[titan] 2025-09-09 11:14:20,695 - root - INFO - step: 22770 loss: 2.9051 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.39% global_avg_ntp_loss: 0.8531 global_avg_top_loss: 2.0520 +[titan] 2025-09-09 11:14:20,695 - root - INFO - lr: 9.1774e-06 gnorm: 0.38 [1 day, 17:38:52<1 day, 7:30:53] +[titan] 2025-09-09 11:14:52,570 - root - INFO - step: 22775 loss: 2.7745 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.96 mfu: 49.54% global_avg_ntp_loss: 0.7923 global_avg_top_loss: 1.9822 +[titan] 2025-09-09 11:14:52,571 - root - INFO - lr: 9.1739e-06 gnorm: 0.35 [1 day, 17:39:24<1 day, 7:30:20] +[titan] 2025-09-09 11:15:24,495 - root - INFO - step: 22780 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7901 
global_avg_top_loss: 1.9619 +[titan] 2025-09-09 11:15:24,496 - root - INFO - lr: 9.1704e-06 gnorm: 0.33 [1 day, 17:39:56<1 day, 7:29:46] +[titan] 2025-09-09 11:15:56,459 - root - INFO - step: 22785 loss: 2.7848 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9870 +[titan] 2025-09-09 11:15:56,459 - root - INFO - lr: 9.1669e-06 gnorm: 0.35 [1 day, 17:40:28<1 day, 7:29:12] +[titan] 2025-09-09 11:16:28,305 - root - INFO - step: 22790 loss: 2.8074 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.8169 global_avg_top_loss: 1.9905 +[titan] 2025-09-09 11:16:28,305 - root - INFO - lr: 9.1634e-06 gnorm: 0.36 [1 day, 17:41:00<1 day, 7:28:39] +[titan] 2025-09-09 11:17:00,082 - root - INFO - step: 22795 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.46 mfu: 49.69% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 11:17:00,083 - root - INFO - lr: 9.1599e-06 gnorm: 0.36 [1 day, 17:41:32<1 day, 7:28:05] +[titan] 2025-09-09 11:17:25,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:17:31,876 - root - INFO - step: 22800 loss: 2.7609 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.21 mfu: 49.67% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 11:17:31,877 - root - INFO - lr: 9.1564e-06 gnorm: 0.34 [1 day, 17:42:04<1 day, 7:27:31] +[titan] 2025-09-09 11:18:03,734 - root - INFO - step: 22805 loss: 2.7826 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7975 global_avg_top_loss: 1.9850 +[titan] 2025-09-09 11:18:03,735 - root - INFO - lr: 9.1529e-06 gnorm: 0.34 [1 day, 17:42:35<1 day, 7:26:57] +[titan] 2025-09-09 11:18:35,894 - root - INFO - step: 22810 loss: 2.8022 memory: 122.03GiB(87.57%) tps: 10,190 tflops: 485.62 mfu: 49.10% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9957 +[titan] 2025-09-09 11:18:35,894 - root - INFO - lr: 9.1494e-06 gnorm: 0.36 [1 day, 17:43:08<1 day, 7:26:24] +[titan] 2025-09-09 11:19:07,699 - root - INFO - step: 22815 loss: 2.7565 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 11:19:07,700 - root - INFO - lr: 9.1460e-06 gnorm: 0.34 [1 day, 17:43:39<1 day, 7:25:50] +[titan] 2025-09-09 11:19:39,497 - root - INFO - step: 22820 loss: 2.7649 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.15 mfu: 49.66% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9781 +[titan] 2025-09-09 11:19:39,497 - root - INFO - lr: 9.1425e-06 gnorm: 0.34 [1 day, 17:44:11<1 day, 7:25:16] +[titan] 2025-09-09 11:20:11,555 - root - INFO - step: 22825 loss: 2.8329 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.17 mfu: 49.26% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0104 +[titan] 2025-09-09 11:20:11,555 - root - INFO - lr: 9.1390e-06 gnorm: 0.33 [1 day, 17:44:43<1 day, 7:24:43] +[titan] 2025-09-09 11:20:43,313 - root - INFO - step: 22830 loss: 2.7845 memory: 122.03GiB(87.57%) tps: 10,318 tflops: 491.75 mfu: 49.72% global_avg_ntp_loss: 0.7975 
global_avg_top_loss: 1.9870 +[titan] 2025-09-09 11:20:43,314 - root - INFO - lr: 9.1355e-06 gnorm: 0.34 [1 day, 17:45:15<1 day, 7:24:09] +[titan] 2025-09-09 11:21:15,287 - root - INFO - step: 22835 loss: 2.7507 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7929 global_avg_top_loss: 1.9578 +[titan] 2025-09-09 11:21:15,288 - root - INFO - lr: 9.1320e-06 gnorm: 0.33 [1 day, 17:45:47<1 day, 7:23:35] +[titan] 2025-09-09 11:21:47,194 - root - INFO - step: 22840 loss: 2.7794 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 11:21:47,195 - root - INFO - lr: 9.1285e-06 gnorm: 0.36 [1 day, 17:46:19<1 day, 7:23:02] +[titan] 2025-09-09 11:22:19,156 - root - INFO - step: 22845 loss: 2.7638 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7921 global_avg_top_loss: 1.9717 +[titan] 2025-09-09 11:22:19,156 - root - INFO - lr: 9.1250e-06 gnorm: 0.37 [1 day, 17:46:51<1 day, 7:22:28] +[titan] 2025-09-09 11:22:44,617 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:22:51,039 - root - INFO - step: 22850 loss: 2.8019 memory: 122.03GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.8064 global_avg_top_loss: 1.9954 +[titan] 2025-09-09 11:22:51,040 - root - INFO - lr: 9.1215e-06 gnorm: 0.35 [1 day, 17:47:23<1 day, 7:21:54] +[titan] 2025-09-09 11:23:22,972 - root - INFO - step: 22855 loss: 3.1268 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 1.0072 global_avg_top_loss: 2.1197 +[titan] 2025-09-09 11:23:22,972 - root - INFO - lr: 9.1180e-06 gnorm: 0.35 [1 day, 17:47:55<1 day, 7:21:21] +[titan] 2025-09-09 11:23:54,718 - root - INFO - step: 22860 loss: 2.7160 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.95 mfu: 49.74% global_avg_ntp_loss: 0.7664 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 11:23:54,718 - root - INFO - lr: 9.1145e-06 gnorm: 0.35 [1 day, 17:48:26<1 day, 7:20:47] +[titan] 2025-09-09 11:24:26,551 - root - INFO - step: 22865 loss: 2.7752 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7947 global_avg_top_loss: 1.9805 +[titan] 2025-09-09 11:24:26,552 - root - INFO - lr: 9.1110e-06 gnorm: 0.34 [1 day, 17:48:58<1 day, 7:20:13] +[titan] 2025-09-09 11:24:58,450 - root - INFO - step: 22870 loss: 2.7930 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.8056 global_avg_top_loss: 1.9874 +[titan] 2025-09-09 11:24:58,451 - root - INFO - lr: 9.1075e-06 gnorm: 0.35 [1 day, 17:49:30<1 day, 7:19:39] +[titan] 2025-09-09 11:25:30,304 - root - INFO - step: 22875 loss: 2.5558 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.6957 global_avg_top_loss: 1.8601 +[titan] 2025-09-09 11:25:30,304 - root - INFO - lr: 9.1041e-06 gnorm: 0.39 [1 day, 17:50:02<1 day, 7:19:06] +[titan] 2025-09-09 11:26:02,104 - root - INFO - step: 22880 loss: 2.6741 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.7454 
global_avg_top_loss: 1.9287 +[titan] 2025-09-09 11:26:02,105 - root - INFO - lr: 9.1006e-06 gnorm: 0.51 [1 day, 17:50:34<1 day, 7:18:32] +[titan] 2025-09-09 11:26:33,809 - root - INFO - step: 22885 loss: 2.7962 memory: 122.03GiB(87.57%) tps: 10,336 tflops: 492.59 mfu: 49.81% global_avg_ntp_loss: 0.8071 global_avg_top_loss: 1.9891 +[titan] 2025-09-09 11:26:33,809 - root - INFO - lr: 9.0971e-06 gnorm: 0.35 [1 day, 17:51:06<1 day, 7:17:58] +[titan] 2025-09-09 11:27:05,669 - root - INFO - step: 22890 loss: 2.8700 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.8373 global_avg_top_loss: 2.0327 +[titan] 2025-09-09 11:27:05,669 - root - INFO - lr: 9.0936e-06 gnorm: 0.37 [1 day, 17:51:37<1 day, 7:17:24] +[titan] 2025-09-09 11:27:37,617 - root - INFO - step: 22895 loss: 2.8032 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.8068 global_avg_top_loss: 1.9964 +[titan] 2025-09-09 11:27:37,618 - root - INFO - lr: 9.0901e-06 gnorm: 0.35 [1 day, 17:52:09<1 day, 7:16:51] +[titan] 2025-09-09 11:28:03,041 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:28:09,363 - root - INFO - step: 22900 loss: 2.8294 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.95 mfu: 49.74% global_avg_ntp_loss: 0.8210 global_avg_top_loss: 2.0084 +[titan] 2025-09-09 11:28:09,364 - root - INFO - lr: 9.0866e-06 gnorm: 0.35 [1 day, 17:52:41<1 day, 7:16:17] +[titan] 2025-09-09 11:28:41,201 - root - INFO - step: 22905 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9921 +[titan] 2025-09-09 11:28:41,202 - root - INFO - lr: 9.0831e-06 gnorm: 0.33 [1 day, 17:53:13<1 day, 7:15:43] +[titan] 2025-09-09 11:29:12,985 - root - INFO - step: 22910 loss: 2.7151 memory: 122.03GiB(87.57%) tps: 10,310 tflops: 491.37 mfu: 49.68% global_avg_ntp_loss: 0.7683 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 11:29:12,985 - root - INFO - lr: 9.0796e-06 gnorm: 0.35 [1 day, 17:53:45<1 day, 7:15:09] +[titan] 2025-09-09 11:29:44,871 - root - INFO - step: 22915 loss: 2.8219 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.8126 global_avg_top_loss: 2.0093 +[titan] 2025-09-09 11:29:44,871 - root - INFO - lr: 9.0761e-06 gnorm: 0.36 [1 day, 17:54:17<1 day, 7:14:36] +[titan] 2025-09-09 11:30:16,700 - root - INFO - step: 22920 loss: 2.7748 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9806 +[titan] 2025-09-09 11:30:16,700 - root - INFO - lr: 9.0727e-06 gnorm: 0.37 [1 day, 17:54:48<1 day, 7:14:02] +[titan] 2025-09-09 11:30:48,436 - root - INFO - step: 22925 loss: 2.7282 memory: 122.03GiB(87.57%) tps: 10,325 tflops: 492.10 mfu: 49.76% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9562 +[titan] 2025-09-09 11:30:48,437 - root - INFO - lr: 9.0692e-06 gnorm: 0.35 [1 day, 17:55:20<1 day, 7:13:28] +[titan] 2025-09-09 11:31:20,306 - root - INFO - step: 22930 loss: 2.8364 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.05 mfu: 49.55% global_avg_ntp_loss: 0.8199 
global_avg_top_loss: 2.0165 +[titan] 2025-09-09 11:31:20,306 - root - INFO - lr: 9.0657e-06 gnorm: 0.39 [1 day, 17:55:52<1 day, 7:12:55] +[titan] 2025-09-09 11:31:52,015 - root - INFO - step: 22935 loss: 2.6347 memory: 122.03GiB(87.57%) tps: 10,334 tflops: 492.52 mfu: 49.80% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9035 +[titan] 2025-09-09 11:31:52,015 - root - INFO - lr: 9.0622e-06 gnorm: 0.33 [1 day, 17:56:24<1 day, 7:12:21] +[titan] 2025-09-09 11:32:23,867 - root - INFO - step: 22940 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.31 mfu: 49.58% global_avg_ntp_loss: 0.7749 global_avg_top_loss: 1.9613 +[titan] 2025-09-09 11:32:23,868 - root - INFO - lr: 9.0587e-06 gnorm: 0.34 [1 day, 17:56:56<1 day, 7:11:47] +[titan] 2025-09-09 11:32:55,767 - root - INFO - step: 22945 loss: 2.7867 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 11:32:55,768 - root - INFO - lr: 9.0552e-06 gnorm: 0.34 [1 day, 17:57:27<1 day, 7:11:13] +[titan] 2025-09-09 11:33:21,346 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:33:27,657 - root - INFO - step: 22950 loss: 2.8209 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.8159 global_avg_top_loss: 2.0050 +[titan] 2025-09-09 11:33:27,657 - root - INFO - lr: 9.0517e-06 gnorm: 0.34 [1 day, 17:57:59<1 day, 7:10:40] +[titan] 2025-09-09 11:33:59,558 - root - INFO - step: 22955 loss: 2.7284 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.7689 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 11:33:59,559 - root - INFO - lr: 9.0483e-06 gnorm: 0.33 [1 day, 17:58:31<1 day, 7:10:06] +[titan] 2025-09-09 11:34:31,523 - root - INFO - step: 22960 loss: 2.6068 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.8901 +[titan] 2025-09-09 11:34:31,523 - root - INFO - lr: 9.0448e-06 gnorm: 0.41 [1 day, 17:59:03<1 day, 7:09:32] +[titan] 2025-09-09 11:35:03,263 - root - INFO - step: 22965 loss: 2.7925 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.04 mfu: 49.75% global_avg_ntp_loss: 0.8003 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 11:35:03,264 - root - INFO - lr: 9.0413e-06 gnorm: 0.34 [1 day, 17:59:35<1 day, 7:08:59] +[titan] 2025-09-09 11:35:35,175 - root - INFO - step: 22970 loss: 2.6798 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.39 mfu: 49.48% global_avg_ntp_loss: 0.7490 global_avg_top_loss: 1.9308 +[titan] 2025-09-09 11:35:35,176 - root - INFO - lr: 9.0378e-06 gnorm: 0.34 [1 day, 18:00:07<1 day, 7:08:25] +[titan] 2025-09-09 11:36:07,033 - root - INFO - step: 22975 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7944 global_avg_top_loss: 1.9809 +[titan] 2025-09-09 11:36:07,034 - root - INFO - lr: 9.0343e-06 gnorm: 0.34 [1 day, 18:00:39<1 day, 7:07:51] +[titan] 2025-09-09 11:36:38,804 - root - INFO - step: 22980 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.56 mfu: 49.70% global_avg_ntp_loss: 0.7825 
global_avg_top_loss: 1.9700 +[titan] 2025-09-09 11:36:38,805 - root - INFO - lr: 9.0308e-06 gnorm: 0.34 [1 day, 18:01:11<1 day, 7:07:17] +[titan] 2025-09-09 11:37:10,727 - root - INFO - step: 22985 loss: 2.7750 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7942 global_avg_top_loss: 1.9808 +[titan] 2025-09-09 11:37:10,728 - root - INFO - lr: 9.0274e-06 gnorm: 0.33 [1 day, 18:01:42<1 day, 7:06:44] +[titan] 2025-09-09 11:37:42,724 - root - INFO - step: 22990 loss: 2.7206 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7689 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 11:37:42,725 - root - INFO - lr: 9.0239e-06 gnorm: 0.34 [1 day, 18:02:14<1 day, 7:06:10] +[titan] 2025-09-09 11:38:14,616 - root - INFO - step: 22995 loss: 2.7913 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8000 global_avg_top_loss: 1.9913 +[titan] 2025-09-09 11:38:14,616 - root - INFO - lr: 9.0204e-06 gnorm: 0.34 [1 day, 18:02:46<1 day, 7:05:37] +[titan] 2025-09-09 11:38:40,130 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:38:46,600 - root - INFO - step: 23000 loss: 2.7613 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.29 mfu: 49.37% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 11:38:46,600 - root - INFO - lr: 9.0169e-06 gnorm: 0.34 [1 day, 18:03:18<1 day, 7:05:03] +[titan] 2025-09-09 11:39:18,600 - root - INFO - step: 23005 loss: 2.8584 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.8325 global_avg_top_loss: 2.0259 +[titan] 2025-09-09 11:39:18,600 - root - INFO - lr: 9.0134e-06 gnorm: 0.36 [1 day, 18:03:50<1 day, 7:04:29] +[titan] 2025-09-09 11:39:50,502 - root - INFO - step: 23010 loss: 3.0171 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.9313 global_avg_top_loss: 2.0859 +[titan] 2025-09-09 11:39:50,503 - root - INFO - lr: 9.0099e-06 gnorm: 0.35 [1 day, 18:04:22<1 day, 7:03:56] +[titan] 2025-09-09 11:40:22,576 - root - INFO - step: 23015 loss: 2.7704 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 11:40:22,576 - root - INFO - lr: 9.0065e-06 gnorm: 0.48 [1 day, 18:04:54<1 day, 7:03:22] +[titan] 2025-09-09 11:40:54,414 - root - INFO - step: 23020 loss: 2.8252 memory: 122.03GiB(87.57%) tps: 10,292 tflops: 490.53 mfu: 49.60% global_avg_ntp_loss: 0.8169 global_avg_top_loss: 2.0083 +[titan] 2025-09-09 11:40:54,414 - root - INFO - lr: 9.0030e-06 gnorm: 0.40 [1 day, 18:05:26<1 day, 7:02:48] +[titan] 2025-09-09 11:41:26,341 - root - INFO - step: 23025 loss: 2.7760 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.7956 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 11:41:26,342 - root - INFO - lr: 8.9995e-06 gnorm: 0.38 [1 day, 18:05:58<1 day, 7:02:15] +[titan] 2025-09-09 11:41:58,349 - root - INFO - step: 23030 loss: 3.0877 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.9368 
global_avg_top_loss: 2.1509 +[titan] 2025-09-09 11:41:58,349 - root - INFO - lr: 8.9960e-06 gnorm: 0.45 [1 day, 18:06:30<1 day, 7:01:41] +[titan] 2025-09-09 11:42:30,275 - root - INFO - step: 23035 loss: 2.6858 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7464 global_avg_top_loss: 1.9394 +[titan] 2025-09-09 11:42:30,275 - root - INFO - lr: 8.9925e-06 gnorm: 0.47 [1 day, 18:07:02<1 day, 7:01:08] +[titan] 2025-09-09 11:43:02,324 - root - INFO - step: 23040 loss: 2.7471 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9689 +[titan] 2025-09-09 11:43:02,325 - root - INFO - lr: 8.9891e-06 gnorm: 0.34 [1 day, 18:07:34<1 day, 7:00:34] +[titan] 2025-09-09 11:43:02,609 - root - INFO - Dumping profiler traces at step 23040 +[titan] 2025-09-09 11:43:02,679 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 11:43:34,417 - root - INFO - step: 23045 loss: 2.7461 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7832 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 11:43:34,418 - root - INFO - lr: 8.9856e-06 gnorm: 0.37 [1 day, 18:08:06<1 day, 7:00:01] +[titan] 2025-09-09 11:43:59,967 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:44:06,339 - root - INFO - step: 23050 loss: 2.7869 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.8034 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 11:44:06,340 - root - INFO - lr: 8.9821e-06 gnorm: 0.36 [1 day, 18:08:38<1 day, 6:59:27] +[titan] 2025-09-09 11:44:38,083 - root - INFO - step: 23055 loss: 2.6814 memory: 122.03GiB(87.57%) tps: 10,323 tflops: 491.98 mfu: 49.75% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 11:44:38,084 - root - INFO - lr: 8.9786e-06 gnorm: 0.35 [1 day, 18:09:10<1 day, 6:58:53] +[titan] 2025-09-09 11:45:10,114 - root - INFO - step: 23060 loss: 2.8875 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.58 mfu: 49.30% global_avg_ntp_loss: 0.8427 global_avg_top_loss: 2.0448 +[titan] 2025-09-09 11:45:10,114 - root - INFO - lr: 8.9751e-06 gnorm: 0.35 [1 day, 18:09:42<1 day, 6:58:20] +[titan] 2025-09-09 11:45:41,908 - root - INFO - step: 23065 loss: 2.7740 memory: 122.03GiB(87.57%) tps: 10,307 tflops: 491.21 mfu: 49.67% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 11:45:41,908 - root - INFO - lr: 8.9717e-06 gnorm: 0.38 [1 day, 18:10:14<1 day, 6:57:46] +[titan] 2025-09-09 11:46:13,769 - root - INFO - step: 23070 loss: 2.8134 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.8113 global_avg_top_loss: 2.0021 +[titan] 2025-09-09 11:46:13,769 - root - INFO - lr: 8.9682e-06 gnorm: 0.34 [1 day, 18:10:45<1 day, 6:57:12] +[titan] 2025-09-09 11:46:45,656 - root - INFO - step: 23075 loss: 2.8088 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.8091 global_avg_top_loss: 1.9996 +[titan] 2025-09-09 11:46:45,657 - root - INFO - lr: 8.9647e-06 gnorm: 0.37 [1 day, 18:11:17<1 day, 6:56:39] +[titan] 2025-09-09 11:47:17,644 - root - INFO - step: 23080 loss: 2.7354 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7726 
global_avg_top_loss: 1.9629 +[titan] 2025-09-09 11:47:17,644 - root - INFO - lr: 8.9612e-06 gnorm: 0.35 [1 day, 18:11:49<1 day, 6:56:05] +[titan] 2025-09-09 11:47:49,422 - root - INFO - step: 23085 loss: 2.8107 memory: 122.03GiB(87.57%) tps: 10,312 tflops: 491.46 mfu: 49.69% global_avg_ntp_loss: 0.8102 global_avg_top_loss: 2.0005 +[titan] 2025-09-09 11:47:49,422 - root - INFO - lr: 8.9578e-06 gnorm: 0.35 [1 day, 18:12:21<1 day, 6:55:31] +[titan] 2025-09-09 11:48:21,076 - root - INFO - step: 23090 loss: 2.7969 memory: 122.03GiB(87.57%) tps: 10,352 tflops: 493.37 mfu: 49.89% global_avg_ntp_loss: 0.8008 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 11:48:21,077 - root - INFO - lr: 8.9543e-06 gnorm: 0.35 [1 day, 18:12:53<1 day, 6:54:57] +[titan] 2025-09-09 11:48:53,000 - root - INFO - step: 23095 loss: 2.7270 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7717 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 11:48:53,000 - root - INFO - lr: 8.9508e-06 gnorm: 0.35 [1 day, 18:13:25<1 day, 6:54:24] +[titan] 2025-09-09 11:49:18,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:49:24,954 - root - INFO - step: 23100 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.7865 global_avg_top_loss: 1.9731 +[titan] 2025-09-09 11:49:24,954 - root - INFO - lr: 8.9473e-06 gnorm: 0.36 [1 day, 18:13:57<1 day, 6:53:50] +[titan] 2025-09-09 11:49:56,770 - root - INFO - step: 23105 loss: 2.6881 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.86 mfu: 49.63% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9278 +[titan] 2025-09-09 11:49:56,771 - root - INFO - lr: 8.9439e-06 gnorm: 0.43 [1 day, 18:14:28<1 day, 6:53:16] +[titan] 2025-09-09 11:50:28,605 - root - INFO - step: 23110 loss: 2.7222 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.7715 global_avg_top_loss: 1.9506 +[titan] 2025-09-09 11:50:28,605 - root - INFO - lr: 8.9404e-06 gnorm: 0.38 [1 day, 18:15:00<1 day, 6:52:43] +[titan] 2025-09-09 11:51:00,852 - root - INFO - step: 23115 loss: 3.1842 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.30 mfu: 48.97% global_avg_ntp_loss: 1.0304 global_avg_top_loss: 2.1538 +[titan] 2025-09-09 11:51:00,852 - root - INFO - lr: 8.9369e-06 gnorm: 0.34 [1 day, 18:15:33<1 day, 6:52:09] +[titan] 2025-09-09 11:51:32,818 - root - INFO - step: 23120 loss: 2.7098 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.7635 global_avg_top_loss: 1.9463 +[titan] 2025-09-09 11:51:32,818 - root - INFO - lr: 8.9334e-06 gnorm: 0.34 [1 day, 18:16:04<1 day, 6:51:36] +[titan] 2025-09-09 11:52:04,703 - root - INFO - step: 23125 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.81 mfu: 49.53% global_avg_ntp_loss: 0.7820 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 11:52:04,703 - root - INFO - lr: 8.9300e-06 gnorm: 0.39 [1 day, 18:16:36<1 day, 6:51:02] +[titan] 2025-09-09 11:52:36,806 - root - INFO - step: 23130 loss: 2.7554 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7868 
global_avg_top_loss: 1.9686 +[titan] 2025-09-09 11:52:36,807 - root - INFO - lr: 8.9265e-06 gnorm: 0.37 [1 day, 18:17:08<1 day, 6:50:29] +[titan] 2025-09-09 11:53:08,936 - root - INFO - step: 23135 loss: 2.8058 memory: 122.03GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.8041 global_avg_top_loss: 2.0017 +[titan] 2025-09-09 11:53:08,937 - root - INFO - lr: 8.9230e-06 gnorm: 0.35 [1 day, 18:17:41<1 day, 6:49:55] +[titan] 2025-09-09 11:53:40,833 - root - INFO - step: 23140 loss: 2.7882 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9897 +[titan] 2025-09-09 11:53:40,833 - root - INFO - lr: 8.9195e-06 gnorm: 0.38 [1 day, 18:18:12<1 day, 6:49:21] +[titan] 2025-09-09 11:54:12,844 - root - INFO - step: 23145 loss: 2.6345 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7333 global_avg_top_loss: 1.9012 +[titan] 2025-09-09 11:54:12,845 - root - INFO - lr: 8.9161e-06 gnorm: 0.35 [1 day, 18:18:45<1 day, 6:48:48] +[titan] 2025-09-09 11:54:38,348 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 11:54:44,752 - root - INFO - step: 23150 loss: 2.7469 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9675 +[titan] 2025-09-09 11:54:44,753 - root - INFO - lr: 8.9126e-06 gnorm: 0.34 [1 day, 18:19:16<1 day, 6:48:14] +[titan] 2025-09-09 11:55:16,689 - root - INFO - step: 23155 loss: 2.7659 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.7974 global_avg_top_loss: 1.9685 +[titan] 2025-09-09 11:55:16,689 - root - INFO - lr: 8.9091e-06 gnorm: 0.36 [1 day, 18:19:48<1 day, 6:47:41] +[titan] 2025-09-09 11:55:48,590 - root - INFO - step: 23160 loss: 2.7921 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.8010 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 11:55:48,590 - root - INFO - lr: 8.9056e-06 gnorm: 0.35 [1 day, 18:20:20<1 day, 6:47:07] +[titan] 2025-09-09 11:56:20,500 - root - INFO - step: 23165 loss: 2.8496 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.8365 global_avg_top_loss: 2.0131 +[titan] 2025-09-09 11:56:20,501 - root - INFO - lr: 8.9022e-06 gnorm: 0.34 [1 day, 18:20:52<1 day, 6:46:33] +[titan] 2025-09-09 11:56:52,479 - root - INFO - step: 23170 loss: 2.7570 memory: 122.03GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 11:56:52,480 - root - INFO - lr: 8.8987e-06 gnorm: 0.36 [1 day, 18:21:24<1 day, 6:46:00] +[titan] 2025-09-09 11:57:24,376 - root - INFO - step: 23175 loss: 2.7902 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9879 +[titan] 2025-09-09 11:57:24,376 - root - INFO - lr: 8.8952e-06 gnorm: 0.37 [1 day, 18:21:56<1 day, 6:45:26] +[titan] 2025-09-09 11:57:56,644 - root - INFO - step: 23180 loss: 2.6998 memory: 122.03GiB(87.57%) tps: 10,155 tflops: 483.99 mfu: 48.94% global_avg_ntp_loss: 0.7586 
global_avg_top_loss: 1.9412 +[titan] 2025-09-09 11:57:56,644 - root - INFO - lr: 8.8918e-06 gnorm: 0.36 [1 day, 18:22:28<1 day, 6:44:53] +[titan] 2025-09-09 11:58:28,425 - root - INFO - step: 23185 loss: 2.7027 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.40 mfu: 49.69% global_avg_ntp_loss: 0.7590 global_avg_top_loss: 1.9437 +[titan] 2025-09-09 11:58:28,426 - root - INFO - lr: 8.8883e-06 gnorm: 0.34 [1 day, 18:23:00<1 day, 6:44:19] +[titan] 2025-09-09 11:59:00,240 - root - INFO - step: 23190 loss: 2.7335 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.63% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9642 +[titan] 2025-09-09 11:59:00,241 - root - INFO - lr: 8.8848e-06 gnorm: 0.35 [1 day, 18:23:32<1 day, 6:43:45] +[titan] 2025-09-09 11:59:32,237 - root - INFO - step: 23195 loss: 3.1848 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.10 mfu: 49.35% global_avg_ntp_loss: 1.0295 global_avg_top_loss: 2.1552 +[titan] 2025-09-09 11:59:32,237 - root - INFO - lr: 8.8813e-06 gnorm: 0.34 [1 day, 18:24:04<1 day, 6:43:12] +[titan] 2025-09-09 11:59:57,814 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:00:04,191 - root - INFO - step: 23200 loss: 2.8072 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.8083 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 12:00:04,192 - root - INFO - lr: 8.8779e-06 gnorm: 0.37 [1 day, 18:24:36<1 day, 6:42:38] +[titan] 2025-09-09 12:00:36,129 - root - INFO - step: 23205 loss: 2.8785 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 0.8585 global_avg_top_loss: 2.0200 +[titan] 2025-09-09 12:00:36,130 - root - INFO - lr: 8.8744e-06 gnorm: 1.67 [1 day, 18:25:08<1 day, 6:42:05] +[titan] 2025-09-09 12:01:08,076 - root - INFO - step: 23210 loss: 2.8774 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.8501 global_avg_top_loss: 2.0273 +[titan] 2025-09-09 12:01:08,077 - root - INFO - lr: 8.8709e-06 gnorm: 0.56 [1 day, 18:25:40<1 day, 6:41:31] +[titan] 2025-09-09 12:01:39,999 - root - INFO - step: 23215 loss: 2.7257 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9541 +[titan] 2025-09-09 12:01:40,000 - root - INFO - lr: 8.8675e-06 gnorm: 0.33 [1 day, 18:26:12<1 day, 6:40:57] +[titan] 2025-09-09 12:02:11,927 - root - INFO - step: 23220 loss: 2.7426 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9631 +[titan] 2025-09-09 12:02:11,928 - root - INFO - lr: 8.8640e-06 gnorm: 0.39 [1 day, 18:26:44<1 day, 6:40:24] +[titan] 2025-09-09 12:02:43,813 - root - INFO - step: 23225 loss: 2.7552 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.79 mfu: 49.52% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9696 +[titan] 2025-09-09 12:02:43,814 - root - INFO - lr: 8.8605e-06 gnorm: 0.38 [1 day, 18:27:15<1 day, 6:39:50] +[titan] 2025-09-09 12:03:15,806 - root - INFO - step: 23230 loss: 2.8304 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.8263 
global_avg_top_loss: 2.0041 +[titan] 2025-09-09 12:03:15,806 - root - INFO - lr: 8.8571e-06 gnorm: 0.42 [1 day, 18:27:47<1 day, 6:39:17] +[titan] 2025-09-09 12:03:47,624 - root - INFO - step: 23235 loss: 2.8164 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.8141 global_avg_top_loss: 2.0023 +[titan] 2025-09-09 12:03:47,624 - root - INFO - lr: 8.8536e-06 gnorm: 0.35 [1 day, 18:28:19<1 day, 6:38:43] +[titan] 2025-09-09 12:04:19,567 - root - INFO - step: 23240 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.7795 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 12:04:19,567 - root - INFO - lr: 8.8501e-06 gnorm: 0.38 [1 day, 18:28:51<1 day, 6:38:09] +[titan] 2025-09-09 12:04:51,346 - root - INFO - step: 23245 loss: 2.6992 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.43 mfu: 49.69% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 12:04:51,347 - root - INFO - lr: 8.8467e-06 gnorm: 0.42 [1 day, 18:29:23<1 day, 6:37:36] +[titan] 2025-09-09 12:05:17,083 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:05:23,483 - root - INFO - step: 23250 loss: 2.7266 memory: 122.03GiB(87.57%) tps: 10,197 tflops: 485.98 mfu: 49.14% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9554 +[titan] 2025-09-09 12:05:23,483 - root - INFO - lr: 8.8432e-06 gnorm: 0.36 [1 day, 18:29:55<1 day, 6:37:02] +[titan] 2025-09-09 12:05:55,347 - root - INFO - step: 23255 loss: 2.7597 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.11 mfu: 49.56% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 12:05:55,348 - root - INFO - lr: 8.8397e-06 gnorm: 0.37 [1 day, 18:30:27<1 day, 6:36:29] +[titan] 2025-09-09 12:06:27,342 - root - INFO - step: 23260 loss: 2.7848 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.13 mfu: 49.36% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 1.9756 +[titan] 2025-09-09 12:06:27,342 - root - INFO - lr: 8.8363e-06 gnorm: 0.34 [1 day, 18:30:59<1 day, 6:35:55] +[titan] 2025-09-09 12:06:59,311 - root - INFO - step: 23265 loss: 2.6614 memory: 122.03GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.39% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9199 +[titan] 2025-09-09 12:06:59,311 - root - INFO - lr: 8.8328e-06 gnorm: 0.36 [1 day, 18:31:31<1 day, 6:35:21] +[titan] 2025-09-09 12:07:31,117 - root - INFO - step: 23270 loss: 2.7784 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.7939 global_avg_top_loss: 1.9845 +[titan] 2025-09-09 12:07:31,118 - root - INFO - lr: 8.8293e-06 gnorm: 0.37 [1 day, 18:32:03<1 day, 6:34:48] +[titan] 2025-09-09 12:08:03,210 - root - INFO - step: 23275 loss: 3.1576 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 1.0188 global_avg_top_loss: 2.1388 +[titan] 2025-09-09 12:08:03,211 - root - INFO - lr: 8.8259e-06 gnorm: 0.33 [1 day, 18:32:35<1 day, 6:34:14] +[titan] 2025-09-09 12:08:35,038 - root - INFO - step: 23280 loss: 2.6968 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.68 mfu: 49.61% global_avg_ntp_loss: 0.7566 
global_avg_top_loss: 1.9402 +[titan] 2025-09-09 12:08:35,039 - root - INFO - lr: 8.8224e-06 gnorm: 0.32 [1 day, 18:33:07<1 day, 6:33:41] +[titan] 2025-09-09 12:09:06,992 - root - INFO - step: 23285 loss: 2.7542 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.76 mfu: 49.42% global_avg_ntp_loss: 0.7828 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 12:09:06,992 - root - INFO - lr: 8.8189e-06 gnorm: 0.34 [1 day, 18:33:39<1 day, 6:33:07] +[titan] 2025-09-09 12:09:39,069 - root - INFO - step: 23290 loss: 2.8147 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.8125 global_avg_top_loss: 2.0022 +[titan] 2025-09-09 12:09:39,069 - root - INFO - lr: 8.8155e-06 gnorm: 0.35 [1 day, 18:34:11<1 day, 6:32:33] +[titan] 2025-09-09 12:10:11,005 - root - INFO - step: 23295 loss: 2.7415 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7804 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 12:10:11,006 - root - INFO - lr: 8.8120e-06 gnorm: 0.35 [1 day, 18:34:43<1 day, 6:32:00] +[titan] 2025-09-09 12:10:36,637 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:10:43,076 - root - INFO - step: 23300 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9650 +[titan] 2025-09-09 12:10:43,077 - root - INFO - lr: 8.8086e-06 gnorm: 0.35 [1 day, 18:35:15<1 day, 6:31:26] +[titan] 2025-09-09 12:11:15,031 - root - INFO - step: 23305 loss: 2.7628 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7868 global_avg_top_loss: 1.9760 +[titan] 2025-09-09 12:11:15,032 - root - INFO - lr: 8.8051e-06 gnorm: 0.35 [1 day, 18:35:47<1 day, 6:30:53] +[titan] 2025-09-09 12:11:47,044 - root - INFO - step: 23310 loss: 2.7991 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9968 +[titan] 2025-09-09 12:11:47,044 - root - INFO - lr: 8.8016e-06 gnorm: 0.53 [1 day, 18:36:19<1 day, 6:30:19] +[titan] 2025-09-09 12:12:19,090 - root - INFO - step: 23315 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.8075 global_avg_top_loss: 1.9987 +[titan] 2025-09-09 12:12:19,090 - root - INFO - lr: 8.7982e-06 gnorm: 0.38 [1 day, 18:36:51<1 day, 6:29:46] +[titan] 2025-09-09 12:12:50,890 - root - INFO - step: 23320 loss: 2.7351 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.7736 global_avg_top_loss: 1.9614 +[titan] 2025-09-09 12:12:50,890 - root - INFO - lr: 8.7947e-06 gnorm: 0.34 [1 day, 18:37:23<1 day, 6:29:12] +[titan] 2025-09-09 12:13:22,908 - root - INFO - step: 23325 loss: 2.6636 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.7399 global_avg_top_loss: 1.9236 +[titan] 2025-09-09 12:13:22,908 - root - INFO - lr: 8.7912e-06 gnorm: 0.36 [1 day, 18:37:55<1 day, 6:28:39] +[titan] 2025-09-09 12:13:54,931 - root - INFO - step: 23330 loss: 2.6635 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7441 
global_avg_top_loss: 1.9194 +[titan] 2025-09-09 12:13:54,932 - root - INFO - lr: 8.7878e-06 gnorm: 0.33 [1 day, 18:38:27<1 day, 6:28:05] +[titan] 2025-09-09 12:14:26,892 - root - INFO - step: 23335 loss: 2.5600 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.6974 global_avg_top_loss: 1.8626 +[titan] 2025-09-09 12:14:26,892 - root - INFO - lr: 8.7843e-06 gnorm: 0.41 [1 day, 18:38:59<1 day, 6:27:31] +[titan] 2025-09-09 12:14:58,920 - root - INFO - step: 23340 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.8424 global_avg_top_loss: 1.9986 +[titan] 2025-09-09 12:14:58,920 - root - INFO - lr: 8.7809e-06 gnorm: 0.34 [1 day, 18:39:31<1 day, 6:26:58] +[titan] 2025-09-09 12:15:30,777 - root - INFO - step: 23345 loss: 2.7061 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9427 +[titan] 2025-09-09 12:15:30,778 - root - INFO - lr: 8.7774e-06 gnorm: 0.35 [1 day, 18:40:02<1 day, 6:26:24] +[titan] 2025-09-09 12:15:56,333 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:16:02,742 - root - INFO - step: 23350 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7796 global_avg_top_loss: 1.9662 +[titan] 2025-09-09 12:16:02,742 - root - INFO - lr: 8.7740e-06 gnorm: 0.38 [1 day, 18:40:34<1 day, 6:25:51] +[titan] 2025-09-09 12:16:34,601 - root - INFO - step: 23355 loss: 3.2101 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.20 mfu: 49.56% global_avg_ntp_loss: 1.0401 global_avg_top_loss: 2.1699 +[titan] 2025-09-09 12:16:34,602 - root - INFO - lr: 8.7705e-06 gnorm: 0.60 [1 day, 18:41:06<1 day, 6:25:17] +[titan] 2025-09-09 12:17:06,548 - root - INFO - step: 23360 loss: 2.7253 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9574 +[titan] 2025-09-09 12:17:06,549 - root - INFO - lr: 8.7670e-06 gnorm: 0.35 [1 day, 18:41:38<1 day, 6:24:43] +[titan] 2025-09-09 12:17:38,393 - root - INFO - step: 23365 loss: 2.7058 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9453 +[titan] 2025-09-09 12:17:38,393 - root - INFO - lr: 8.7636e-06 gnorm: 0.37 [1 day, 18:42:10<1 day, 6:24:10] +[titan] 2025-09-09 12:18:10,211 - root - INFO - step: 23370 loss: 2.7983 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.8055 global_avg_top_loss: 1.9927 +[titan] 2025-09-09 12:18:10,212 - root - INFO - lr: 8.7601e-06 gnorm: 0.42 [1 day, 18:42:42<1 day, 6:23:36] +[titan] 2025-09-09 12:18:42,398 - root - INFO - step: 23375 loss: 2.7652 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.21 mfu: 49.06% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 12:18:42,399 - root - INFO - lr: 8.7567e-06 gnorm: 0.35 [1 day, 18:43:14<1 day, 6:23:03] +[titan] 2025-09-09 12:19:14,226 - root - INFO - step: 23380 loss: 2.7757 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.68 mfu: 49.61% global_avg_ntp_loss: 0.7929 
global_avg_top_loss: 1.9828 +[titan] 2025-09-09 12:19:14,227 - root - INFO - lr: 8.7532e-06 gnorm: 0.37 [1 day, 18:43:46<1 day, 6:22:29] +[titan] 2025-09-09 12:19:46,052 - root - INFO - step: 23385 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.72 mfu: 49.62% global_avg_ntp_loss: 0.7895 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 12:19:46,053 - root - INFO - lr: 8.7497e-06 gnorm: 0.40 [1 day, 18:44:18<1 day, 6:21:55] +[titan] 2025-09-09 12:20:17,962 - root - INFO - step: 23390 loss: 3.0177 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.9169 global_avg_top_loss: 2.1008 +[titan] 2025-09-09 12:20:17,963 - root - INFO - lr: 8.7463e-06 gnorm: 0.46 [1 day, 18:44:50<1 day, 6:21:22] +[titan] 2025-09-09 12:20:50,114 - root - INFO - step: 23395 loss: 2.8390 memory: 122.03GiB(87.57%) tps: 10,192 tflops: 485.74 mfu: 49.11% global_avg_ntp_loss: 0.8225 global_avg_top_loss: 2.0165 +[titan] 2025-09-09 12:20:50,115 - root - INFO - lr: 8.7428e-06 gnorm: 0.35 [1 day, 18:45:22<1 day, 6:20:48] +[titan] 2025-09-09 12:21:15,917 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:21:22,196 - root - INFO - step: 23400 loss: 2.7674 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7905 global_avg_top_loss: 1.9769 +[titan] 2025-09-09 12:21:22,196 - root - INFO - lr: 8.7394e-06 gnorm: 0.34 [1 day, 18:45:54<1 day, 6:20:15] +[titan] 2025-09-09 12:21:54,058 - root - INFO - step: 23405 loss: 2.7830 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.16 mfu: 49.56% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9882 +[titan] 2025-09-09 12:21:54,058 - root - INFO - lr: 8.7359e-06 gnorm: 0.40 [1 day, 18:46:26<1 day, 6:19:41] +[titan] 2025-09-09 12:22:25,950 - root - INFO - step: 23410 loss: 2.8381 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.52% global_avg_ntp_loss: 0.8232 global_avg_top_loss: 2.0149 +[titan] 2025-09-09 12:22:25,950 - root - INFO - lr: 8.7325e-06 gnorm: 0.38 [1 day, 18:46:58<1 day, 6:19:08] +[titan] 2025-09-09 12:22:58,010 - root - INFO - step: 23415 loss: 2.6745 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7517 global_avg_top_loss: 1.9228 +[titan] 2025-09-09 12:22:58,011 - root - INFO - lr: 8.7290e-06 gnorm: 0.35 [1 day, 18:47:30<1 day, 6:18:34] +[titan] 2025-09-09 12:23:29,890 - root - INFO - step: 23420 loss: 2.7963 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.8049 global_avg_top_loss: 1.9914 +[titan] 2025-09-09 12:23:29,891 - root - INFO - lr: 8.7256e-06 gnorm: 0.39 [1 day, 18:48:01<1 day, 6:18:01] +[titan] 2025-09-09 12:24:01,956 - root - INFO - step: 23425 loss: 2.6885 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9272 +[titan] 2025-09-09 12:24:01,956 - root - INFO - lr: 8.7221e-06 gnorm: 0.39 [1 day, 18:48:34<1 day, 6:17:27] +[titan] 2025-09-09 12:24:33,893 - root - INFO - step: 23430 loss: 2.7514 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.7817 
global_avg_top_loss: 1.9698 +[titan] 2025-09-09 12:24:33,893 - root - INFO - lr: 8.7187e-06 gnorm: 0.36 [1 day, 18:49:05<1 day, 6:16:54] +[titan] 2025-09-09 12:25:05,823 - root - INFO - step: 23435 loss: 3.1636 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.45% global_avg_ntp_loss: 1.0187 global_avg_top_loss: 2.1450 +[titan] 2025-09-09 12:25:05,824 - root - INFO - lr: 8.7152e-06 gnorm: 0.43 [1 day, 18:49:37<1 day, 6:16:20] +[titan] 2025-09-09 12:25:37,661 - root - INFO - step: 23440 loss: 2.7340 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.54 mfu: 49.60% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9614 +[titan] 2025-09-09 12:25:37,661 - root - INFO - lr: 8.7117e-06 gnorm: 0.36 [1 day, 18:50:09<1 day, 6:15:46] +[titan] 2025-09-09 12:26:09,802 - root - INFO - step: 23445 loss: 2.7629 memory: 122.03GiB(87.57%) tps: 10,195 tflops: 485.90 mfu: 49.13% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9759 +[titan] 2025-09-09 12:26:09,803 - root - INFO - lr: 8.7083e-06 gnorm: 0.37 [1 day, 18:50:41<1 day, 6:15:13] +[titan] 2025-09-09 12:26:35,446 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:26:41,842 - root - INFO - step: 23450 loss: 2.7548 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 12:26:41,842 - root - INFO - lr: 8.7048e-06 gnorm: 0.41 [1 day, 18:51:13<1 day, 6:14:39] +[titan] 2025-09-09 12:27:13,893 - root - INFO - step: 23455 loss: 2.7984 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.8033 global_avg_top_loss: 1.9950 +[titan] 2025-09-09 12:27:13,893 - root - INFO - lr: 8.7014e-06 gnorm: 0.35 [1 day, 18:51:45<1 day, 6:14:06] +[titan] 2025-09-09 12:27:45,931 - root - INFO - step: 23460 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.46 mfu: 49.29% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9520 +[titan] 2025-09-09 12:27:45,932 - root - INFO - lr: 8.6979e-06 gnorm: 0.34 [1 day, 18:52:18<1 day, 6:13:32] +[titan] 2025-09-09 12:28:17,892 - root - INFO - step: 23465 loss: 2.8195 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.8143 global_avg_top_loss: 2.0052 +[titan] 2025-09-09 12:28:17,892 - root - INFO - lr: 8.6945e-06 gnorm: 0.37 [1 day, 18:52:49<1 day, 6:12:59] +[titan] 2025-09-09 12:28:49,979 - root - INFO - step: 23470 loss: 3.1037 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.9470 global_avg_top_loss: 2.1566 +[titan] 2025-09-09 12:28:49,979 - root - INFO - lr: 8.6910e-06 gnorm: 0.39 [1 day, 18:53:22<1 day, 6:12:25] +[titan] 2025-09-09 12:29:21,796 - root - INFO - step: 23475 loss: 2.6998 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.86 mfu: 49.63% global_avg_ntp_loss: 0.7643 global_avg_top_loss: 1.9354 +[titan] 2025-09-09 12:29:21,796 - root - INFO - lr: 8.6876e-06 gnorm: 0.35 [1 day, 18:53:53<1 day, 6:11:52] +[titan] 2025-09-09 12:29:54,060 - root - INFO - step: 23480 loss: 2.7357 memory: 122.03GiB(87.57%) tps: 10,156 tflops: 484.05 mfu: 48.94% global_avg_ntp_loss: 0.7758 
global_avg_top_loss: 1.9598 +[titan] 2025-09-09 12:29:54,060 - root - INFO - lr: 8.6841e-06 gnorm: 0.34 [1 day, 18:54:26<1 day, 6:11:18] +[titan] 2025-09-09 12:30:26,233 - root - INFO - step: 23485 loss: 2.8005 memory: 122.03GiB(87.57%) tps: 10,185 tflops: 485.42 mfu: 49.08% global_avg_ntp_loss: 0.8039 global_avg_top_loss: 1.9966 +[titan] 2025-09-09 12:30:26,234 - root - INFO - lr: 8.6807e-06 gnorm: 0.34 [1 day, 18:54:58<1 day, 6:10:45] +[titan] 2025-09-09 12:30:58,284 - root - INFO - step: 23490 loss: 2.7755 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 12:30:58,285 - root - INFO - lr: 8.6772e-06 gnorm: 0.34 [1 day, 18:55:30<1 day, 6:10:12] +[titan] 2025-09-09 12:31:30,359 - root - INFO - step: 23495 loss: 2.6809 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7512 global_avg_top_loss: 1.9297 +[titan] 2025-09-09 12:31:30,359 - root - INFO - lr: 8.6738e-06 gnorm: 0.35 [1 day, 18:56:02<1 day, 6:09:38] +[titan] 2025-09-09 12:31:56,191 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:32:02,547 - root - INFO - step: 23500 loss: 2.7743 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.19 mfu: 49.06% global_avg_ntp_loss: 0.7926 global_avg_top_loss: 1.9817 +[titan] 2025-09-09 12:32:02,547 - root - INFO - lr: 8.6703e-06 gnorm: 0.37 [1 day, 18:56:34<1 day, 6:09:05] +[titan] 2025-09-09 12:32:34,616 - root - INFO - step: 23505 loss: 2.6703 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9149 +[titan] 2025-09-09 12:32:34,616 - root - INFO - lr: 8.6669e-06 gnorm: 0.35 [1 day, 18:57:06<1 day, 6:08:31] +[titan] 2025-09-09 12:33:06,980 - root - INFO - step: 23510 loss: 2.7271 memory: 122.03GiB(87.57%) tps: 10,125 tflops: 482.55 mfu: 48.79% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9545 +[titan] 2025-09-09 12:33:06,980 - root - INFO - lr: 8.6634e-06 gnorm: 0.34 [1 day, 18:57:39<1 day, 6:07:58] +[titan] 2025-09-09 12:33:39,118 - root - INFO - step: 23515 loss: 3.1399 memory: 122.03GiB(87.57%) tps: 10,196 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.9570 global_avg_top_loss: 2.1829 +[titan] 2025-09-09 12:33:39,118 - root - INFO - lr: 8.6600e-06 gnorm: 0.39 [1 day, 18:58:11<1 day, 6:07:25] +[titan] 2025-09-09 12:34:11,249 - root - INFO - step: 23520 loss: 2.7517 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9640 +[titan] 2025-09-09 12:34:11,250 - root - INFO - lr: 8.6566e-06 gnorm: 0.36 [1 day, 18:58:43<1 day, 6:06:51] +[titan] 2025-09-09 12:34:43,178 - root - INFO - step: 23525 loss: 2.6464 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 12:34:43,178 - root - INFO - lr: 8.6531e-06 gnorm: 0.36 [1 day, 18:59:15<1 day, 6:06:18] +[titan] 2025-09-09 12:35:15,250 - root - INFO - step: 23530 loss: 2.8099 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.8116 
global_avg_top_loss: 1.9983 +[titan] 2025-09-09 12:35:15,250 - root - INFO - lr: 8.6497e-06 gnorm: 0.35 [1 day, 18:59:47<1 day, 6:05:44] +[titan] 2025-09-09 12:35:47,336 - root - INFO - step: 23535 loss: 2.6512 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.73 mfu: 49.21% global_avg_ntp_loss: 0.7391 global_avg_top_loss: 1.9121 +[titan] 2025-09-09 12:35:47,337 - root - INFO - lr: 8.6462e-06 gnorm: 0.36 [1 day, 19:00:19<1 day, 6:05:11] +[titan] 2025-09-09 12:36:19,208 - root - INFO - step: 23540 loss: 2.7836 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9876 +[titan] 2025-09-09 12:36:19,208 - root - INFO - lr: 8.6428e-06 gnorm: 0.35 [1 day, 19:00:51<1 day, 6:04:37] +[titan] 2025-09-09 12:36:51,143 - root - INFO - step: 23545 loss: 2.8085 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 2.0018 +[titan] 2025-09-09 12:36:51,144 - root - INFO - lr: 8.6393e-06 gnorm: 0.34 [1 day, 19:01:23<1 day, 6:04:03] +[titan] 2025-09-09 12:37:16,853 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:37:23,333 - root - INFO - step: 23550 loss: 2.7229 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 12:37:23,333 - root - INFO - lr: 8.6359e-06 gnorm: 0.37 [1 day, 19:01:55<1 day, 6:03:30] +[titan] 2025-09-09 12:37:36,306 - root - INFO - Dumping profiler traces at step 23552 +[titan] 2025-09-09 12:37:36,375 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 12:37:55,411 - root - INFO - step: 23555 loss: 2.7637 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.85 mfu: 49.23% global_avg_ntp_loss: 0.7858 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 12:37:55,412 - root - INFO - lr: 8.6324e-06 gnorm: 0.40 [1 day, 19:02:27<1 day, 6:02:57] +[titan] 2025-09-09 12:38:27,564 - root - INFO - step: 23560 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,191 tflops: 485.72 mfu: 49.11% global_avg_ntp_loss: 0.7723 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 12:38:27,565 - root - INFO - lr: 8.6290e-06 gnorm: 0.34 [1 day, 19:02:59<1 day, 6:02:23] +[titan] 2025-09-09 12:38:59,577 - root - INFO - step: 23565 loss: 2.7718 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7922 global_avg_top_loss: 1.9796 +[titan] 2025-09-09 12:38:59,577 - root - INFO - lr: 8.6255e-06 gnorm: 0.36 [1 day, 19:03:31<1 day, 6:01:50] +[titan] 2025-09-09 12:39:31,578 - root - INFO - step: 23570 loss: 3.0739 memory: 122.03GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.9525 global_avg_top_loss: 2.1214 +[titan] 2025-09-09 12:39:31,578 - root - INFO - lr: 8.6221e-06 gnorm: 0.35 [1 day, 19:04:03<1 day, 6:01:16] +[titan] 2025-09-09 12:40:03,797 - root - INFO - step: 23575 loss: 2.6237 memory: 122.03GiB(87.57%) tps: 10,171 tflops: 484.73 mfu: 49.01% global_avg_ntp_loss: 0.7232 global_avg_top_loss: 1.9004 +[titan] 2025-09-09 12:40:03,798 - root - INFO - lr: 8.6187e-06 gnorm: 0.34 [1 day, 19:04:35<1 day, 
6:00:43] +[titan] 2025-09-09 12:40:35,545 - root - INFO - step: 23580 loss: 2.7272 memory: 122.03GiB(87.57%) tps: 10,322 tflops: 491.93 mfu: 49.74% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9579 +[titan] 2025-09-09 12:40:35,545 - root - INFO - lr: 8.6152e-06 gnorm: 0.34 [1 day, 19:05:07<1 day, 6:00:09] +[titan] 2025-09-09 12:41:07,454 - root - INFO - step: 23585 loss: 2.6902 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7624 global_avg_top_loss: 1.9278 +[titan] 2025-09-09 12:41:07,454 - root - INFO - lr: 8.6118e-06 gnorm: 0.35 [1 day, 19:05:39<1 day, 5:59:36] +[titan] 2025-09-09 12:41:39,552 - root - INFO - step: 23590 loss: 2.7357 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7731 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 12:41:39,552 - root - INFO - lr: 8.6083e-06 gnorm: 0.34 [1 day, 19:06:11<1 day, 5:59:02] +[titan] 2025-09-09 12:42:11,437 - root - INFO - step: 23595 loss: 2.6897 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 12:42:11,438 - root - INFO - lr: 8.6049e-06 gnorm: 0.37 [1 day, 19:06:43<1 day, 5:58:29] +[titan] 2025-09-09 12:42:36,967 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:42:43,327 - root - INFO - step: 23600 loss: 2.7191 memory: 122.03GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 12:42:43,328 - root - INFO - lr: 8.6015e-06 gnorm: 0.35 [1 day, 19:07:15<1 day, 5:57:55] +[titan] 2025-09-09 12:43:15,415 - root - INFO - step: 23605 loss: 2.9160 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.8723 global_avg_top_loss: 2.0437 +[titan] 2025-09-09 12:43:15,415 - root - INFO - lr: 8.5980e-06 gnorm: 0.37 [1 day, 19:07:47<1 day, 5:57:21] +[titan] 2025-09-09 12:43:47,503 - root - INFO - step: 23610 loss: 2.6779 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9303 +[titan] 2025-09-09 12:43:47,503 - root - INFO - lr: 8.5946e-06 gnorm: 0.34 [1 day, 19:08:19<1 day, 5:56:48] +[titan] 2025-09-09 12:44:19,582 - root - INFO - step: 23615 loss: 2.7448 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.84 mfu: 49.23% global_avg_ntp_loss: 0.7784 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 12:44:19,583 - root - INFO - lr: 8.5911e-06 gnorm: 0.35 [1 day, 19:08:51<1 day, 5:56:15] +[titan] 2025-09-09 12:44:51,379 - root - INFO - step: 23620 loss: 2.7088 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.17 mfu: 49.66% global_avg_ntp_loss: 0.7653 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 12:44:51,379 - root - INFO - lr: 8.5877e-06 gnorm: 0.37 [1 day, 19:09:23<1 day, 5:55:41] +[titan] 2025-09-09 12:45:23,511 - root - INFO - step: 23625 loss: 2.8010 memory: 122.03GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.8065 global_avg_top_loss: 1.9945 +[titan] 2025-09-09 12:45:23,512 - root - INFO - lr: 8.5843e-06 gnorm: 0.35 [1 day, 19:09:55<1 day, 5:55:08] +[titan] 2025-09-09 12:45:55,505 - root - INFO - step: 23630 loss: 2.9796 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.8844 
global_avg_top_loss: 2.0952 +[titan] 2025-09-09 12:45:55,505 - root - INFO - lr: 8.5808e-06 gnorm: 0.36 [1 day, 19:10:27<1 day, 5:54:34] +[titan] 2025-09-09 12:46:27,671 - root - INFO - step: 23635 loss: 2.7300 memory: 122.03GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.7748 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 12:46:27,672 - root - INFO - lr: 8.5774e-06 gnorm: 0.36 [1 day, 19:10:59<1 day, 5:54:01] +[titan] 2025-09-09 12:46:59,665 - root - INFO - step: 23640 loss: 2.7168 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7695 global_avg_top_loss: 1.9473 +[titan] 2025-09-09 12:46:59,665 - root - INFO - lr: 8.5739e-06 gnorm: 0.36 [1 day, 19:11:31<1 day, 5:53:27] +[titan] 2025-09-09 12:47:31,752 - root - INFO - step: 23645 loss: 2.7886 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9900 +[titan] 2025-09-09 12:47:31,752 - root - INFO - lr: 8.5705e-06 gnorm: 0.35 [1 day, 19:12:03<1 day, 5:52:54] +[titan] 2025-09-09 12:47:57,384 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:48:03,772 - root - INFO - step: 23650 loss: 2.8259 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.8204 global_avg_top_loss: 2.0055 +[titan] 2025-09-09 12:48:03,772 - root - INFO - lr: 8.5671e-06 gnorm: 0.41 [1 day, 19:12:35<1 day, 5:52:20] +[titan] 2025-09-09 12:48:36,003 - root - INFO - step: 23655 loss: 2.7630 memory: 122.03GiB(87.57%) tps: 10,167 tflops: 484.54 mfu: 48.99% global_avg_ntp_loss: 0.7897 global_avg_top_loss: 1.9733 +[titan] 2025-09-09 12:48:36,003 - root - INFO - lr: 8.5636e-06 gnorm: 0.41 [1 day, 19:13:08<1 day, 5:51:47] +[titan] 2025-09-09 12:49:07,821 - root - INFO - step: 23660 loss: 2.7133 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.84 mfu: 49.63% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 12:49:07,821 - root - INFO - lr: 8.5602e-06 gnorm: 0.38 [1 day, 19:13:39<1 day, 5:51:13] +[titan] 2025-09-09 12:49:39,932 - root - INFO - step: 23665 loss: 2.7182 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 12:49:39,932 - root - INFO - lr: 8.5568e-06 gnorm: 0.34 [1 day, 19:14:11<1 day, 5:50:40] +[titan] 2025-09-09 12:50:11,942 - root - INFO - step: 23670 loss: 2.7891 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7985 global_avg_top_loss: 1.9905 +[titan] 2025-09-09 12:50:11,942 - root - INFO - lr: 8.5533e-06 gnorm: 0.36 [1 day, 19:14:43<1 day, 5:50:06] +[titan] 2025-09-09 12:50:43,859 - root - INFO - step: 23675 loss: 2.7823 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.8097 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 12:50:43,860 - root - INFO - lr: 8.5499e-06 gnorm: 0.34 [1 day, 19:15:15<1 day, 5:49:33] +[titan] 2025-09-09 12:51:15,789 - root - INFO - step: 23680 loss: 2.8048 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.8042 
global_avg_top_loss: 2.0006 +[titan] 2025-09-09 12:51:15,790 - root - INFO - lr: 8.5464e-06 gnorm: 0.36 [1 day, 19:15:47<1 day, 5:48:59] +[titan] 2025-09-09 12:51:47,681 - root - INFO - step: 23685 loss: 3.0752 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.9326 global_avg_top_loss: 2.1426 +[titan] 2025-09-09 12:51:47,681 - root - INFO - lr: 8.5430e-06 gnorm: 0.52 [1 day, 19:16:19<1 day, 5:48:26] +[titan] 2025-09-09 12:52:19,455 - root - INFO - step: 23690 loss: 2.7414 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.51 mfu: 49.70% global_avg_ntp_loss: 0.7755 global_avg_top_loss: 1.9659 +[titan] 2025-09-09 12:52:19,456 - root - INFO - lr: 8.5396e-06 gnorm: 0.35 [1 day, 19:16:51<1 day, 5:47:52] +[titan] 2025-09-09 12:52:51,258 - root - INFO - step: 23695 loss: 2.8033 memory: 122.03GiB(87.57%) tps: 10,304 tflops: 491.07 mfu: 49.65% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9955 +[titan] 2025-09-09 12:52:51,259 - root - INFO - lr: 8.5361e-06 gnorm: 0.35 [1 day, 19:17:23<1 day, 5:47:18] +[titan] 2025-09-09 12:53:17,026 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:53:23,372 - root - INFO - step: 23700 loss: 2.7465 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 0.7785 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 12:53:23,373 - root - INFO - lr: 8.5327e-06 gnorm: 0.36 [1 day, 19:17:55<1 day, 5:46:45] +[titan] 2025-09-09 12:53:55,414 - root - INFO - step: 23705 loss: 2.7954 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.8048 global_avg_top_loss: 1.9906 +[titan] 2025-09-09 12:53:55,415 - root - INFO - lr: 8.5293e-06 gnorm: 0.35 [1 day, 19:18:27<1 day, 5:46:11] +[titan] 2025-09-09 12:54:27,389 - root - INFO - step: 23710 loss: 2.8390 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.8229 global_avg_top_loss: 2.0161 +[titan] 2025-09-09 12:54:27,390 - root - INFO - lr: 8.5258e-06 gnorm: 0.38 [1 day, 19:18:59<1 day, 5:45:38] +[titan] 2025-09-09 12:54:59,612 - root - INFO - step: 23715 loss: 2.7635 memory: 122.03GiB(87.57%) tps: 10,169 tflops: 484.67 mfu: 49.01% global_avg_ntp_loss: 0.7885 global_avg_top_loss: 1.9750 +[titan] 2025-09-09 12:54:59,613 - root - INFO - lr: 8.5224e-06 gnorm: 0.35 [1 day, 19:19:31<1 day, 5:45:05] +[titan] 2025-09-09 12:55:31,684 - root - INFO - step: 23720 loss: 2.7533 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 12:55:31,685 - root - INFO - lr: 8.5190e-06 gnorm: 0.37 [1 day, 19:20:03<1 day, 5:44:31] +[titan] 2025-09-09 12:56:03,627 - root - INFO - step: 23725 loss: 2.7679 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.7896 global_avg_top_loss: 1.9782 +[titan] 2025-09-09 12:56:03,627 - root - INFO - lr: 8.5156e-06 gnorm: 0.35 [1 day, 19:20:35<1 day, 5:43:58] +[titan] 2025-09-09 12:56:35,610 - root - INFO - step: 23730 loss: 2.7012 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7586 
global_avg_top_loss: 1.9426 +[titan] 2025-09-09 12:56:35,611 - root - INFO - lr: 8.5121e-06 gnorm: 0.33 [1 day, 19:21:07<1 day, 5:43:24] +[titan] 2025-09-09 12:57:07,595 - root - INFO - step: 23735 loss: 2.6789 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.27 mfu: 49.37% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 12:57:07,596 - root - INFO - lr: 8.5087e-06 gnorm: 0.34 [1 day, 19:21:39<1 day, 5:42:51] +[titan] 2025-09-09 12:57:39,489 - root - INFO - step: 23740 loss: 2.7761 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.7952 global_avg_top_loss: 1.9809 +[titan] 2025-09-09 12:57:39,489 - root - INFO - lr: 8.5053e-06 gnorm: 0.34 [1 day, 19:22:11<1 day, 5:42:17] +[titan] 2025-09-09 12:58:11,609 - root - INFO - step: 23745 loss: 2.7724 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.22 mfu: 49.16% global_avg_ntp_loss: 0.7937 global_avg_top_loss: 1.9787 +[titan] 2025-09-09 12:58:11,609 - root - INFO - lr: 8.5018e-06 gnorm: 0.35 [1 day, 19:22:43<1 day, 5:41:44] +[titan] 2025-09-09 12:58:37,354 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 12:58:43,798 - root - INFO - step: 23750 loss: 2.7206 memory: 122.03GiB(87.57%) tps: 10,180 tflops: 485.18 mfu: 49.06% global_avg_ntp_loss: 0.7687 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 12:58:43,798 - root - INFO - lr: 8.4984e-06 gnorm: 0.35 [1 day, 19:23:15<1 day, 5:41:10] +[titan] 2025-09-09 12:59:15,640 - root - INFO - step: 23755 loss: 2.7171 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 12:59:15,640 - root - INFO - lr: 8.4950e-06 gnorm: 0.34 [1 day, 19:23:47<1 day, 5:40:37] +[titan] 2025-09-09 12:59:47,612 - root - INFO - step: 23760 loss: 2.7101 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.47 mfu: 49.39% global_avg_ntp_loss: 0.7638 global_avg_top_loss: 1.9463 +[titan] 2025-09-09 12:59:47,612 - root - INFO - lr: 8.4915e-06 gnorm: 0.37 [1 day, 19:24:19<1 day, 5:40:03] +[titan] 2025-09-09 13:00:20,051 - root - INFO - step: 23765 loss: 2.7622 memory: 122.03GiB(87.57%) tps: 10,102 tflops: 481.44 mfu: 48.68% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 13:00:20,051 - root - INFO - lr: 8.4881e-06 gnorm: 0.35 [1 day, 19:24:52<1 day, 5:39:30] +[titan] 2025-09-09 13:00:51,953 - root - INFO - step: 23770 loss: 2.6923 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9362 +[titan] 2025-09-09 13:00:51,953 - root - INFO - lr: 8.4847e-06 gnorm: 0.39 [1 day, 19:25:23<1 day, 5:38:56] +[titan] 2025-09-09 13:01:24,150 - root - INFO - step: 23775 loss: 2.7301 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.06 mfu: 49.05% global_avg_ntp_loss: 0.7714 global_avg_top_loss: 1.9587 +[titan] 2025-09-09 13:01:24,150 - root - INFO - lr: 8.4813e-06 gnorm: 0.44 [1 day, 19:25:56<1 day, 5:38:23] +[titan] 2025-09-09 13:01:56,234 - root - INFO - step: 23780 loss: 2.7156 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7650 
global_avg_top_loss: 1.9506 +[titan] 2025-09-09 13:01:56,234 - root - INFO - lr: 8.4778e-06 gnorm: 0.34 [1 day, 19:26:28<1 day, 5:37:50] +[titan] 2025-09-09 13:02:28,321 - root - INFO - step: 23785 loss: 2.7678 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9791 +[titan] 2025-09-09 13:02:28,321 - root - INFO - lr: 8.4744e-06 gnorm: 0.36 [1 day, 19:27:00<1 day, 5:37:16] +[titan] 2025-09-09 13:03:00,254 - root - INFO - step: 23790 loss: 2.7838 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.8106 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 13:03:00,254 - root - INFO - lr: 8.4710e-06 gnorm: 0.35 [1 day, 19:27:32<1 day, 5:36:43] +[titan] 2025-09-09 13:03:32,362 - root - INFO - step: 23795 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.7861 global_avg_top_loss: 1.9717 +[titan] 2025-09-09 13:03:32,363 - root - INFO - lr: 8.4676e-06 gnorm: 0.41 [1 day, 19:28:04<1 day, 5:36:09] +[titan] 2025-09-09 13:03:58,011 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:04:04,398 - root - INFO - step: 23800 loss: 2.6799 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7520 global_avg_top_loss: 1.9279 +[titan] 2025-09-09 13:04:04,399 - root - INFO - lr: 8.4641e-06 gnorm: 0.37 [1 day, 19:28:36<1 day, 5:35:36] +[titan] 2025-09-09 13:04:36,295 - root - INFO - step: 23805 loss: 2.7871 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.7972 global_avg_top_loss: 1.9899 +[titan] 2025-09-09 13:04:36,295 - root - INFO - lr: 8.4607e-06 gnorm: 0.35 [1 day, 19:29:08<1 day, 5:35:02] +[titan] 2025-09-09 13:05:08,114 - root - INFO - step: 23810 loss: 2.7643 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.82 mfu: 49.63% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 13:05:08,115 - root - INFO - lr: 8.4573e-06 gnorm: 0.37 [1 day, 19:29:40<1 day, 5:34:29] +[titan] 2025-09-09 13:05:40,072 - root - INFO - step: 23815 loss: 2.8376 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.8248 global_avg_top_loss: 2.0128 +[titan] 2025-09-09 13:05:40,072 - root - INFO - lr: 8.4539e-06 gnorm: 0.39 [1 day, 19:30:12<1 day, 5:33:55] +[titan] 2025-09-09 13:06:12,175 - root - INFO - step: 23820 loss: 2.7753 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7953 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 13:06:12,175 - root - INFO - lr: 8.4504e-06 gnorm: 0.38 [1 day, 19:30:44<1 day, 5:33:22] +[titan] 2025-09-09 13:06:44,115 - root - INFO - step: 23825 loss: 2.7898 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.8021 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 13:06:44,115 - root - INFO - lr: 8.4470e-06 gnorm: 0.36 [1 day, 19:31:16<1 day, 5:32:48] +[titan] 2025-09-09 13:07:15,890 - root - INFO - step: 23830 loss: 3.1567 memory: 122.03GiB(87.57%) tps: 10,313 tflops: 491.51 mfu: 49.70% global_avg_ntp_loss: 1.0199 
global_avg_top_loss: 2.1368 +[titan] 2025-09-09 13:07:15,890 - root - INFO - lr: 8.4436e-06 gnorm: 0.44 [1 day, 19:31:47<1 day, 5:32:15] +[titan] 2025-09-09 13:07:47,767 - root - INFO - step: 23835 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9939 +[titan] 2025-09-09 13:07:47,768 - root - INFO - lr: 8.4402e-06 gnorm: 0.35 [1 day, 19:32:19<1 day, 5:31:41] +[titan] 2025-09-09 13:08:19,663 - root - INFO - step: 23840 loss: 2.6894 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7556 global_avg_top_loss: 1.9338 +[titan] 2025-09-09 13:08:19,663 - root - INFO - lr: 8.4367e-06 gnorm: 0.36 [1 day, 19:32:51<1 day, 5:31:08] +[titan] 2025-09-09 13:08:51,714 - root - INFO - step: 23845 loss: 2.7534 memory: 122.03GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9676 +[titan] 2025-09-09 13:08:51,714 - root - INFO - lr: 8.4333e-06 gnorm: 0.35 [1 day, 19:33:23<1 day, 5:30:34] +[titan] 2025-09-09 13:09:17,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:09:23,828 - root - INFO - step: 23850 loss: 2.7687 memory: 122.03GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9710 +[titan] 2025-09-09 13:09:23,828 - root - INFO - lr: 8.4299e-06 gnorm: 0.38 [1 day, 19:33:55<1 day, 5:30:01] +[titan] 2025-09-09 13:09:55,720 - root - INFO - step: 23855 loss: 2.7126 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.7691 global_avg_top_loss: 1.9434 +[titan] 2025-09-09 13:09:55,720 - root - INFO - lr: 8.4265e-06 gnorm: 0.34 [1 day, 19:34:27<1 day, 5:29:27] +[titan] 2025-09-09 13:10:27,633 - root - INFO - step: 23860 loss: 2.7057 memory: 122.03GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9454 +[titan] 2025-09-09 13:10:27,633 - root - INFO - lr: 8.4231e-06 gnorm: 0.78 [1 day, 19:34:59<1 day, 5:28:54] +[titan] 2025-09-09 13:10:59,580 - root - INFO - step: 23865 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.86 mfu: 49.43% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9539 +[titan] 2025-09-09 13:10:59,580 - root - INFO - lr: 8.4196e-06 gnorm: 0.37 [1 day, 19:35:31<1 day, 5:28:20] +[titan] 2025-09-09 13:11:31,518 - root - INFO - step: 23870 loss: 2.7358 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 13:11:31,518 - root - INFO - lr: 8.4162e-06 gnorm: 0.35 [1 day, 19:36:03<1 day, 5:27:47] +[titan] 2025-09-09 13:12:03,634 - root - INFO - step: 23875 loss: 2.7294 memory: 122.03GiB(87.57%) tps: 10,203 tflops: 486.27 mfu: 49.17% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 13:12:03,635 - root - INFO - lr: 8.4128e-06 gnorm: 0.35 [1 day, 19:36:35<1 day, 5:27:13] +[titan] 2025-09-09 13:12:35,543 - root - INFO - step: 23880 loss: 2.6747 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7496 
global_avg_top_loss: 1.9251 +[titan] 2025-09-09 13:12:35,543 - root - INFO - lr: 8.4094e-06 gnorm: 0.36 [1 day, 19:37:07<1 day, 5:26:40] +[titan] 2025-09-09 13:13:07,652 - root - INFO - step: 23885 loss: 2.7978 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.39 mfu: 49.18% global_avg_ntp_loss: 0.8052 global_avg_top_loss: 1.9926 +[titan] 2025-09-09 13:13:07,652 - root - INFO - lr: 8.4060e-06 gnorm: 0.35 [1 day, 19:37:39<1 day, 5:26:06] +[titan] 2025-09-09 13:13:39,698 - root - INFO - step: 23890 loss: 2.6990 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 13:13:39,699 - root - INFO - lr: 8.4025e-06 gnorm: 0.36 [1 day, 19:38:11<1 day, 5:25:33] +[titan] 2025-09-09 13:14:11,783 - root - INFO - step: 23895 loss: 2.7727 memory: 122.03GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 2.0007 +[titan] 2025-09-09 13:14:11,784 - root - INFO - lr: 8.3991e-06 gnorm: 1.27 [1 day, 19:38:43<1 day, 5:24:59] +[titan] 2025-09-09 13:14:37,525 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:14:43,968 - root - INFO - step: 23900 loss: 2.7869 memory: 122.03GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.7982 global_avg_top_loss: 1.9887 +[titan] 2025-09-09 13:14:43,968 - root - INFO - lr: 8.3957e-06 gnorm: 0.36 [1 day, 19:39:15<1 day, 5:24:26] +[titan] 2025-09-09 13:15:15,840 - root - INFO - step: 23905 loss: 2.7571 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 13:15:15,840 - root - INFO - lr: 8.3923e-06 gnorm: 0.79 [1 day, 19:39:47<1 day, 5:23:53] +[titan] 2025-09-09 13:15:47,681 - root - INFO - step: 23910 loss: 3.0514 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.48 mfu: 49.59% global_avg_ntp_loss: 0.9716 global_avg_top_loss: 2.0798 +[titan] 2025-09-09 13:15:47,682 - root - INFO - lr: 8.3889e-06 gnorm: 0.35 [1 day, 19:40:19<1 day, 5:23:19] +[titan] 2025-09-09 13:16:19,629 - root - INFO - step: 23915 loss: 2.7363 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7733 global_avg_top_loss: 1.9630 +[titan] 2025-09-09 13:16:19,630 - root - INFO - lr: 8.3855e-06 gnorm: 0.35 [1 day, 19:40:51<1 day, 5:22:45] +[titan] 2025-09-09 13:16:51,675 - root - INFO - step: 23920 loss: 2.6696 memory: 122.03GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7477 global_avg_top_loss: 1.9219 +[titan] 2025-09-09 13:16:51,675 - root - INFO - lr: 8.3820e-06 gnorm: 0.35 [1 day, 19:41:23<1 day, 5:22:12] +[titan] 2025-09-09 13:17:23,605 - root - INFO - step: 23925 loss: 2.6664 memory: 122.03GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9242 +[titan] 2025-09-09 13:17:23,605 - root - INFO - lr: 8.3786e-06 gnorm: 0.36 [1 day, 19:41:55<1 day, 5:21:38] +[titan] 2025-09-09 13:17:55,554 - root - INFO - step: 23930 loss: 2.7392 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7781 
global_avg_top_loss: 1.9611 +[titan] 2025-09-09 13:17:55,554 - root - INFO - lr: 8.3752e-06 gnorm: 0.36 [1 day, 19:42:27<1 day, 5:21:05] +[titan] 2025-09-09 13:18:27,600 - root - INFO - step: 23935 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7804 global_avg_top_loss: 1.9654 +[titan] 2025-09-09 13:18:27,601 - root - INFO - lr: 8.3718e-06 gnorm: 0.44 [1 day, 19:42:59<1 day, 5:20:32] +[titan] 2025-09-09 13:18:59,407 - root - INFO - step: 23940 loss: 2.6329 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.01 mfu: 49.65% global_avg_ntp_loss: 0.7268 global_avg_top_loss: 1.9061 +[titan] 2025-09-09 13:18:59,408 - root - INFO - lr: 8.3684e-06 gnorm: 0.60 [1 day, 19:43:31<1 day, 5:19:58] +[titan] 2025-09-09 13:19:31,349 - root - INFO - step: 23945 loss: 2.8411 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.8344 global_avg_top_loss: 2.0067 +[titan] 2025-09-09 13:19:31,350 - root - INFO - lr: 8.3650e-06 gnorm: 0.36 [1 day, 19:44:03<1 day, 5:19:24] +[titan] 2025-09-09 13:19:56,911 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:20:03,314 - root - INFO - step: 23950 loss: 2.7655 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 13:20:03,314 - root - INFO - lr: 8.3616e-06 gnorm: 0.35 [1 day, 19:44:35<1 day, 5:18:51] +[titan] 2025-09-09 13:20:35,440 - root - INFO - step: 23955 loss: 2.7691 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.12 mfu: 49.15% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9782 +[titan] 2025-09-09 13:20:35,441 - root - INFO - lr: 8.3581e-06 gnorm: 0.35 [1 day, 19:45:07<1 day, 5:18:18] +[titan] 2025-09-09 13:21:07,624 - root - INFO - step: 23960 loss: 2.7289 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 13:21:07,624 - root - INFO - lr: 8.3547e-06 gnorm: 0.36 [1 day, 19:45:39<1 day, 5:17:44] +[titan] 2025-09-09 13:21:39,703 - root - INFO - step: 23965 loss: 2.7192 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.84 mfu: 49.23% global_avg_ntp_loss: 0.7663 global_avg_top_loss: 1.9529 +[titan] 2025-09-09 13:21:39,703 - root - INFO - lr: 8.3513e-06 gnorm: 0.36 [1 day, 19:46:11<1 day, 5:17:11] +[titan] 2025-09-09 13:22:11,722 - root - INFO - step: 23970 loss: 2.7345 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 13:22:11,722 - root - INFO - lr: 8.3479e-06 gnorm: 0.37 [1 day, 19:46:43<1 day, 5:16:37] +[titan] 2025-09-09 13:22:43,657 - root - INFO - step: 23975 loss: 3.5589 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 1.2390 global_avg_top_loss: 2.3199 +[titan] 2025-09-09 13:22:43,658 - root - INFO - lr: 8.3445e-06 gnorm: 0.36 [1 day, 19:47:15<1 day, 5:16:04] +[titan] 2025-09-09 13:23:15,707 - root - INFO - step: 23980 loss: 2.8295 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.29 mfu: 49.27% global_avg_ntp_loss: 0.8171 
global_avg_top_loss: 2.0124 +[titan] 2025-09-09 13:23:15,707 - root - INFO - lr: 8.3411e-06 gnorm: 0.37 [1 day, 19:47:47<1 day, 5:15:30] +[titan] 2025-09-09 13:23:47,768 - root - INFO - step: 23985 loss: 2.6171 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7251 global_avg_top_loss: 1.8921 +[titan] 2025-09-09 13:23:47,769 - root - INFO - lr: 8.3377e-06 gnorm: 0.44 [1 day, 19:48:19<1 day, 5:14:57] +[titan] 2025-09-09 13:24:19,781 - root - INFO - step: 23990 loss: 3.2770 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 1.0733 global_avg_top_loss: 2.2038 +[titan] 2025-09-09 13:24:19,782 - root - INFO - lr: 8.3343e-06 gnorm: 0.38 [1 day, 19:48:51<1 day, 5:14:24] +[titan] 2025-09-09 13:24:52,008 - root - INFO - step: 23995 loss: 2.7645 memory: 122.03GiB(87.57%) tps: 10,168 tflops: 484.62 mfu: 49.00% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9764 +[titan] 2025-09-09 13:24:52,008 - root - INFO - lr: 8.3309e-06 gnorm: 0.35 [1 day, 19:49:23<1 day, 5:13:50] +[titan] 2025-09-09 13:25:17,570 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:25:23,948 - root - INFO - step: 24000 loss: 2.6465 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 13:25:23,949 - root - INFO - lr: 8.3274e-06 gnorm: 0.35 [1 day, 19:49:55<1 day, 5:13:17] +[titan] 2025-09-09 13:25:56,153 - root - INFO - step: 24005 loss: 2.7422 memory: 122.03GiB(87.57%) tps: 10,175 tflops: 484.94 mfu: 49.03% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9672 +[titan] 2025-09-09 13:25:56,153 - root - INFO - lr: 8.3240e-06 gnorm: 0.48 [1 day, 19:50:28<1 day, 5:12:43] +[titan] 2025-09-09 13:26:27,923 - root - INFO - step: 24010 loss: 2.7806 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.57 mfu: 49.70% global_avg_ntp_loss: 0.7938 global_avg_top_loss: 1.9868 +[titan] 2025-09-09 13:26:27,924 - root - INFO - lr: 8.3206e-06 gnorm: 0.39 [1 day, 19:50:59<1 day, 5:12:10] +[titan] 2025-09-09 13:26:59,996 - root - INFO - step: 24015 loss: 2.9127 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.8678 global_avg_top_loss: 2.0449 +[titan] 2025-09-09 13:26:59,996 - root - INFO - lr: 8.3172e-06 gnorm: 0.40 [1 day, 19:51:31<1 day, 5:11:36] +[titan] 2025-09-09 13:27:31,819 - root - INFO - step: 24020 loss: 2.6760 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.76 mfu: 49.62% global_avg_ntp_loss: 0.7498 global_avg_top_loss: 1.9262 +[titan] 2025-09-09 13:27:31,819 - root - INFO - lr: 8.3138e-06 gnorm: 0.54 [1 day, 19:52:03<1 day, 5:11:03] +[titan] 2025-09-09 13:28:03,919 - root - INFO - step: 24025 loss: 2.6722 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.52 mfu: 49.19% global_avg_ntp_loss: 0.7470 global_avg_top_loss: 1.9252 +[titan] 2025-09-09 13:28:03,919 - root - INFO - lr: 8.3104e-06 gnorm: 0.34 [1 day, 19:52:35<1 day, 5:10:29] +[titan] 2025-09-09 13:28:35,728 - root - INFO - step: 24030 loss: 2.8901 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.8443 
global_avg_top_loss: 2.0458 +[titan] 2025-09-09 13:28:35,729 - root - INFO - lr: 8.3070e-06 gnorm: 0.40 [1 day, 19:53:07<1 day, 5:09:56] +[titan] 2025-09-09 13:29:07,702 - root - INFO - step: 24035 loss: 2.7610 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 13:29:07,702 - root - INFO - lr: 8.3036e-06 gnorm: 0.38 [1 day, 19:53:39<1 day, 5:09:22] +[titan] 2025-09-09 13:29:39,793 - root - INFO - step: 24040 loss: 2.6985 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7585 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 13:29:39,793 - root - INFO - lr: 8.3002e-06 gnorm: 0.40 [1 day, 19:54:11<1 day, 5:08:49] +[titan] 2025-09-09 13:30:11,646 - root - INFO - step: 24045 loss: 3.0818 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.9355 global_avg_top_loss: 2.1463 +[titan] 2025-09-09 13:30:11,646 - root - INFO - lr: 8.2968e-06 gnorm: 0.43 [1 day, 19:54:43<1 day, 5:08:15] +[titan] 2025-09-09 13:30:37,177 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:30:43,693 - root - INFO - step: 24050 loss: 2.7219 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7698 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 13:30:43,693 - root - INFO - lr: 8.2934e-06 gnorm: 0.35 [1 day, 19:55:15<1 day, 5:07:42] +[titan] 2025-09-09 13:31:15,798 - root - INFO - step: 24055 loss: 3.1788 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 1.0313 global_avg_top_loss: 2.1476 +[titan] 2025-09-09 13:31:15,798 - root - INFO - lr: 8.2900e-06 gnorm: 0.37 [1 day, 19:55:47<1 day, 5:07:09] +[titan] 2025-09-09 13:31:47,562 - root - INFO - step: 24060 loss: 2.6929 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9374 +[titan] 2025-09-09 13:31:47,562 - root - INFO - lr: 8.2866e-06 gnorm: 0.38 [1 day, 19:56:19<1 day, 5:06:35] +[titan] 2025-09-09 13:32:13,694 - root - INFO - Dumping profiler traces at step 24064 +[titan] 2025-09-09 13:32:13,766 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 13:32:19,995 - root - INFO - step: 24065 loss: 2.6309 memory: 122.03GiB(87.57%) tps: 10,103 tflops: 481.52 mfu: 48.69% global_avg_ntp_loss: 0.7302 global_avg_top_loss: 1.9008 +[titan] 2025-09-09 13:32:19,995 - root - INFO - lr: 8.2832e-06 gnorm: 0.33 [1 day, 19:56:51<1 day, 5:06:02] +[titan] 2025-09-09 13:32:51,952 - root - INFO - step: 24070 loss: 2.6834 memory: 122.03GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7503 global_avg_top_loss: 1.9331 +[titan] 2025-09-09 13:32:51,952 - root - INFO - lr: 8.2798e-06 gnorm: 0.35 [1 day, 19:57:23<1 day, 5:05:28] +[titan] 2025-09-09 13:33:23,868 - root - INFO - step: 24075 loss: 2.7221 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9567 +[titan] 2025-09-09 13:33:23,868 - root - INFO - lr: 8.2764e-06 gnorm: 0.35 [1 day, 19:57:55<1 day, 
5:04:55] +[titan] 2025-09-09 13:33:55,860 - root - INFO - step: 24080 loss: 2.7347 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9455 +[titan] 2025-09-09 13:33:55,861 - root - INFO - lr: 8.2730e-06 gnorm: 0.52 [1 day, 19:58:27<1 day, 5:04:21] +[titan] 2025-09-09 13:34:27,919 - root - INFO - step: 24085 loss: 2.7412 memory: 122.03GiB(87.57%) tps: 10,221 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.7809 global_avg_top_loss: 1.9603 +[titan] 2025-09-09 13:34:27,920 - root - INFO - lr: 8.2696e-06 gnorm: 0.36 [1 day, 19:58:59<1 day, 5:03:48] +[titan] 2025-09-09 13:35:00,212 - root - INFO - step: 24090 loss: 2.7386 memory: 122.03GiB(87.57%) tps: 10,147 tflops: 483.62 mfu: 48.90% global_avg_ntp_loss: 0.7762 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 13:35:00,212 - root - INFO - lr: 8.2662e-06 gnorm: 0.35 [1 day, 19:59:32<1 day, 5:03:15] +[titan] 2025-09-09 13:35:32,409 - root - INFO - step: 24095 loss: 2.8068 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.06 mfu: 49.05% global_avg_ntp_loss: 0.8017 global_avg_top_loss: 2.0051 +[titan] 2025-09-09 13:35:32,410 - root - INFO - lr: 8.2627e-06 gnorm: 0.68 [1 day, 20:00:04<1 day, 5:02:42] +[titan] 2025-09-09 13:35:58,030 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:36:04,421 - root - INFO - step: 24100 loss: 2.7521 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7877 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 13:36:04,422 - root - INFO - lr: 8.2593e-06 gnorm: 0.78 [1 day, 20:00:36<1 day, 5:02:08] +[titan] 2025-09-09 13:36:36,524 - root - INFO - step: 24105 loss: 2.7440 memory: 122.03GiB(87.57%) tps: 10,208 tflops: 486.49 mfu: 49.19% global_avg_ntp_loss: 0.7771 global_avg_top_loss: 1.9669 +[titan] 2025-09-09 13:36:36,524 - root - INFO - lr: 8.2559e-06 gnorm: 0.35 [1 day, 20:01:08<1 day, 5:01:35] +[titan] 2025-09-09 13:37:08,694 - root - INFO - step: 24110 loss: 2.7829 memory: 122.03GiB(87.57%) tps: 10,186 tflops: 485.46 mfu: 49.09% global_avg_ntp_loss: 0.7966 global_avg_top_loss: 1.9863 +[titan] 2025-09-09 13:37:08,695 - root - INFO - lr: 8.2525e-06 gnorm: 0.35 [1 day, 20:01:40<1 day, 5:01:01] +[titan] 2025-09-09 13:37:40,675 - root - INFO - step: 24115 loss: 2.7578 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9722 +[titan] 2025-09-09 13:37:40,676 - root - INFO - lr: 8.2491e-06 gnorm: 0.35 [1 day, 20:02:12<1 day, 5:00:28] +[titan] 2025-09-09 13:38:12,695 - root - INFO - step: 24120 loss: 2.7801 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9882 +[titan] 2025-09-09 13:38:12,696 - root - INFO - lr: 8.2458e-06 gnorm: 0.36 [1 day, 20:02:44<1 day, 4:59:54] +[titan] 2025-09-09 13:38:44,627 - root - INFO - step: 24125 loss: 2.7460 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.08 mfu: 49.45% global_avg_ntp_loss: 0.7799 global_avg_top_loss: 1.9661 +[titan] 2025-09-09 13:38:44,628 - root - INFO - lr: 8.2424e-06 gnorm: 0.35 [1 day, 20:03:16<1 day, 4:59:21] +[titan] 2025-09-09 13:39:16,379 - root - INFO - step: 24130 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.86 mfu: 49.73% global_avg_ntp_loss: 0.7611 
global_avg_top_loss: 1.9450 +[titan] 2025-09-09 13:39:16,380 - root - INFO - lr: 8.2390e-06 gnorm: 0.36 [1 day, 20:03:48<1 day, 4:58:47] +[titan] 2025-09-09 13:39:48,502 - root - INFO - step: 24135 loss: 3.1977 memory: 122.03GiB(87.57%) tps: 10,201 tflops: 486.18 mfu: 49.16% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1590 +[titan] 2025-09-09 13:39:48,502 - root - INFO - lr: 8.2356e-06 gnorm: 0.37 [1 day, 20:04:20<1 day, 4:58:14] +[titan] 2025-09-09 13:40:20,686 - root - INFO - step: 24140 loss: 2.8257 memory: 122.03GiB(87.57%) tps: 10,182 tflops: 485.26 mfu: 49.07% global_avg_ntp_loss: 0.8179 global_avg_top_loss: 2.0078 +[titan] 2025-09-09 13:40:20,686 - root - INFO - lr: 8.2322e-06 gnorm: 0.37 [1 day, 20:04:52<1 day, 4:57:41] +[titan] 2025-09-09 13:40:52,635 - root - INFO - step: 24145 loss: 2.7141 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.83 mfu: 49.43% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9499 +[titan] 2025-09-09 13:40:52,635 - root - INFO - lr: 8.2288e-06 gnorm: 0.37 [1 day, 20:05:24<1 day, 4:57:07] +[titan] 2025-09-09 13:41:18,162 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:41:24,562 - root - INFO - step: 24150 loss: 3.0353 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 0.9486 global_avg_top_loss: 2.0866 +[titan] 2025-09-09 13:41:24,563 - root - INFO - lr: 8.2254e-06 gnorm: 0.39 [1 day, 20:05:56<1 day, 4:56:34] +[titan] 2025-09-09 13:41:56,895 - root - INFO - step: 24155 loss: 3.0151 memory: 122.03GiB(87.57%) tps: 10,135 tflops: 483.02 mfu: 48.84% global_avg_ntp_loss: 0.9217 global_avg_top_loss: 2.0934 +[titan] 2025-09-09 13:41:56,896 - root - INFO - lr: 8.2220e-06 gnorm: 0.35 [1 day, 20:06:28<1 day, 4:56:01] +[titan] 2025-09-09 13:42:28,889 - root - INFO - step: 24160 loss: 2.7939 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9904 +[titan] 2025-09-09 13:42:28,889 - root - INFO - lr: 8.2186e-06 gnorm: 0.37 [1 day, 20:07:00<1 day, 4:55:27] +[titan] 2025-09-09 13:43:00,976 - root - INFO - step: 24165 loss: 2.6934 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7547 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 13:43:00,976 - root - INFO - lr: 8.2152e-06 gnorm: 0.35 [1 day, 20:07:32<1 day, 4:54:54] +[titan] 2025-09-09 13:43:32,860 - root - INFO - step: 24170 loss: 2.6097 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7178 global_avg_top_loss: 1.8919 +[titan] 2025-09-09 13:43:32,861 - root - INFO - lr: 8.2118e-06 gnorm: 0.35 [1 day, 20:08:04<1 day, 4:54:20] +[titan] 2025-09-09 13:44:04,719 - root - INFO - step: 24175 loss: 2.7924 memory: 122.03GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.8012 global_avg_top_loss: 1.9913 +[titan] 2025-09-09 13:44:04,719 - root - INFO - lr: 8.2084e-06 gnorm: 0.53 [1 day, 20:08:36<1 day, 4:53:47] +[titan] 2025-09-09 13:44:36,913 - root - INFO - step: 24180 loss: 2.7876 memory: 122.03GiB(87.57%) tps: 10,178 tflops: 485.10 mfu: 49.05% global_avg_ntp_loss: 0.7980 
global_avg_top_loss: 1.9896 +[titan] 2025-09-09 13:44:36,914 - root - INFO - lr: 8.2050e-06 gnorm: 0.37 [1 day, 20:09:08<1 day, 4:53:13] +[titan] 2025-09-09 13:45:08,972 - root - INFO - step: 24185 loss: 2.7621 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.15 mfu: 49.26% global_avg_ntp_loss: 0.7901 global_avg_top_loss: 1.9720 +[titan] 2025-09-09 13:45:08,973 - root - INFO - lr: 8.2016e-06 gnorm: 0.35 [1 day, 20:09:40<1 day, 4:52:40] +[titan] 2025-09-09 13:45:40,840 - root - INFO - step: 24190 loss: 2.8281 memory: 122.03GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.8131 global_avg_top_loss: 2.0150 +[titan] 2025-09-09 13:45:40,840 - root - INFO - lr: 8.1982e-06 gnorm: 0.34 [1 day, 20:10:12<1 day, 4:52:06] +[titan] 2025-09-09 13:46:12,871 - root - INFO - step: 24195 loss: 2.7647 memory: 122.03GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 13:46:12,871 - root - INFO - lr: 8.1948e-06 gnorm: 0.36 [1 day, 20:10:44<1 day, 4:51:33] +[titan] 2025-09-09 13:46:38,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:46:44,925 - root - INFO - step: 24200 loss: 2.7885 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.22 mfu: 49.26% global_avg_ntp_loss: 0.8007 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 13:46:44,925 - root - INFO - lr: 8.1914e-06 gnorm: 0.42 [1 day, 20:11:16<1 day, 4:51:00] +[titan] 2025-09-09 13:47:16,947 - root - INFO - step: 24205 loss: 2.7038 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.71 mfu: 49.31% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9458 +[titan] 2025-09-09 13:47:16,947 - root - INFO - lr: 8.1880e-06 gnorm: 0.46 [1 day, 20:11:48<1 day, 4:50:26] +[titan] 2025-09-09 13:47:49,054 - root - INFO - step: 24210 loss: 2.8731 memory: 122.03GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.8388 global_avg_top_loss: 2.0343 +[titan] 2025-09-09 13:47:49,054 - root - INFO - lr: 8.1846e-06 gnorm: 0.55 [1 day, 20:12:20<1 day, 4:49:53] +[titan] 2025-09-09 13:48:21,089 - root - INFO - step: 24215 loss: 3.4447 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 1.1791 global_avg_top_loss: 2.2656 +[titan] 2025-09-09 13:48:21,089 - root - INFO - lr: 8.1813e-06 gnorm: 0.41 [1 day, 20:12:53<1 day, 4:49:19] +[titan] 2025-09-09 13:48:52,860 - root - INFO - step: 24220 loss: 2.7942 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.55 mfu: 49.70% global_avg_ntp_loss: 0.8011 global_avg_top_loss: 1.9931 +[titan] 2025-09-09 13:48:52,861 - root - INFO - lr: 8.1779e-06 gnorm: 0.38 [1 day, 20:13:24<1 day, 4:48:46] +[titan] 2025-09-09 13:49:24,901 - root - INFO - step: 24225 loss: 2.7667 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7918 global_avg_top_loss: 1.9749 +[titan] 2025-09-09 13:49:24,902 - root - INFO - lr: 8.1745e-06 gnorm: 0.35 [1 day, 20:13:56<1 day, 4:48:12] +[titan] 2025-09-09 13:49:57,046 - root - INFO - step: 24230 loss: 2.7424 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.7777 
global_avg_top_loss: 1.9647 +[titan] 2025-09-09 13:49:57,047 - root - INFO - lr: 8.1711e-06 gnorm: 0.40 [1 day, 20:14:28<1 day, 4:47:39] +[titan] 2025-09-09 13:50:29,068 - root - INFO - step: 24235 loss: 2.7678 memory: 122.03GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7885 global_avg_top_loss: 1.9793 +[titan] 2025-09-09 13:50:29,068 - root - INFO - lr: 8.1677e-06 gnorm: 0.35 [1 day, 20:15:01<1 day, 4:47:06] +[titan] 2025-09-09 13:51:00,968 - root - INFO - step: 24240 loss: 2.8187 memory: 122.03GiB(87.57%) tps: 10,272 tflops: 489.58 mfu: 49.50% global_avg_ntp_loss: 0.8246 global_avg_top_loss: 1.9941 +[titan] 2025-09-09 13:51:00,968 - root - INFO - lr: 8.1643e-06 gnorm: 0.41 [1 day, 20:15:32<1 day, 4:46:32] +[titan] 2025-09-09 13:51:33,059 - root - INFO - step: 24245 loss: 2.6846 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7499 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 13:51:33,060 - root - INFO - lr: 8.1609e-06 gnorm: 0.38 [1 day, 20:16:04<1 day, 4:45:59] +[titan] 2025-09-09 13:51:58,559 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:52:04,983 - root - INFO - step: 24250 loss: 2.7423 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.47% global_avg_ntp_loss: 0.7762 global_avg_top_loss: 1.9661 +[titan] 2025-09-09 13:52:04,983 - root - INFO - lr: 8.1575e-06 gnorm: 0.40 [1 day, 20:16:36<1 day, 4:45:25] +[titan] 2025-09-09 13:52:36,875 - root - INFO - step: 24255 loss: 2.7311 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9550 +[titan] 2025-09-09 13:52:36,875 - root - INFO - lr: 8.1541e-06 gnorm: 0.53 [1 day, 20:17:08<1 day, 4:44:52] +[titan] 2025-09-09 13:53:09,121 - root - INFO - step: 24260 loss: 2.7513 memory: 122.03GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.7812 global_avg_top_loss: 1.9701 +[titan] 2025-09-09 13:53:09,122 - root - INFO - lr: 8.1508e-06 gnorm: 0.35 [1 day, 20:17:41<1 day, 4:44:19] +[titan] 2025-09-09 13:53:41,070 - root - INFO - step: 24265 loss: 2.5740 memory: 122.03GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7042 global_avg_top_loss: 1.8698 +[titan] 2025-09-09 13:53:41,070 - root - INFO - lr: 8.1474e-06 gnorm: 0.36 [1 day, 20:18:13<1 day, 4:43:45] +[titan] 2025-09-09 13:54:13,046 - root - INFO - step: 24270 loss: 2.7462 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7769 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 13:54:13,046 - root - INFO - lr: 8.1440e-06 gnorm: 0.36 [1 day, 20:18:44<1 day, 4:43:12] +[titan] 2025-09-09 13:54:44,868 - root - INFO - step: 24275 loss: 2.7525 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.77 mfu: 49.62% global_avg_ntp_loss: 0.7841 global_avg_top_loss: 1.9684 +[titan] 2025-09-09 13:54:44,869 - root - INFO - lr: 8.1406e-06 gnorm: 0.37 [1 day, 20:19:16<1 day, 4:42:38] +[titan] 2025-09-09 13:55:16,883 - root - INFO - step: 24280 loss: 2.7304 memory: 122.03GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7685 
global_avg_top_loss: 1.9619 +[titan] 2025-09-09 13:55:16,883 - root - INFO - lr: 8.1372e-06 gnorm: 0.35 [1 day, 20:19:48<1 day, 4:42:05] +[titan] 2025-09-09 13:55:48,743 - root - INFO - step: 24285 loss: 2.7590 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.19 mfu: 49.56% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9713 +[titan] 2025-09-09 13:55:48,743 - root - INFO - lr: 8.1338e-06 gnorm: 0.35 [1 day, 20:20:20<1 day, 4:41:31] +[titan] 2025-09-09 13:56:20,549 - root - INFO - step: 24290 loss: 2.7280 memory: 122.03GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 13:56:20,549 - root - INFO - lr: 8.1305e-06 gnorm: 0.46 [1 day, 20:20:52<1 day, 4:40:58] +[titan] 2025-09-09 13:56:52,585 - root - INFO - step: 24295 loss: 3.7614 memory: 122.03GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 1.3523 global_avg_top_loss: 2.4091 +[titan] 2025-09-09 13:56:52,585 - root - INFO - lr: 8.1271e-06 gnorm: 0.44 [1 day, 20:21:24<1 day, 4:40:24] +[titan] 2025-09-09 13:57:18,113 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 13:57:24,604 - root - INFO - step: 24300 loss: 2.7631 memory: 122.03GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 13:57:24,604 - root - INFO - lr: 8.1237e-06 gnorm: 0.35 [1 day, 20:21:56<1 day, 4:39:51] +[titan] 2025-09-09 13:57:56,423 - root - INFO - step: 24305 loss: 2.7928 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.82 mfu: 49.63% global_avg_ntp_loss: 0.7999 global_avg_top_loss: 1.9929 +[titan] 2025-09-09 13:57:56,423 - root - INFO - lr: 8.1203e-06 gnorm: 0.35 [1 day, 20:22:28<1 day, 4:39:17] +[titan] 2025-09-09 13:58:28,385 - root - INFO - step: 24310 loss: 3.2507 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 1.0666 global_avg_top_loss: 2.1841 +[titan] 2025-09-09 13:58:28,385 - root - INFO - lr: 8.1169e-06 gnorm: 0.42 [1 day, 20:23:00<1 day, 4:38:44] +[titan] 2025-09-09 13:59:00,442 - root - INFO - step: 24315 loss: 2.8062 memory: 122.03GiB(87.57%) tps: 10,222 tflops: 487.18 mfu: 49.26% global_avg_ntp_loss: 0.8062 global_avg_top_loss: 2.0000 +[titan] 2025-09-09 13:59:00,442 - root - INFO - lr: 8.1135e-06 gnorm: 0.35 [1 day, 20:23:32<1 day, 4:38:10] +[titan] 2025-09-09 13:59:32,433 - root - INFO - step: 24320 loss: 2.6860 memory: 122.03GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9306 +[titan] 2025-09-09 13:59:32,433 - root - INFO - lr: 8.1102e-06 gnorm: 0.35 [1 day, 20:24:04<1 day, 4:37:37] +[titan] 2025-09-09 14:00:04,473 - root - INFO - step: 24325 loss: 2.7479 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.28% global_avg_ntp_loss: 0.7839 global_avg_top_loss: 1.9641 +[titan] 2025-09-09 14:00:04,474 - root - INFO - lr: 8.1068e-06 gnorm: 0.35 [1 day, 20:24:36<1 day, 4:37:04] +[titan] 2025-09-09 14:00:36,320 - root - INFO - step: 24330 loss: 2.7769 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.39 mfu: 49.58% global_avg_ntp_loss: 0.7931 
global_avg_top_loss: 1.9839 +[titan] 2025-09-09 14:00:36,321 - root - INFO - lr: 8.1034e-06 gnorm: 0.59 [1 day, 20:25:08<1 day, 4:36:30] +[titan] 2025-09-09 14:01:08,166 - root - INFO - step: 24335 loss: 2.6969 memory: 122.03GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.7575 global_avg_top_loss: 1.9394 +[titan] 2025-09-09 14:01:08,167 - root - INFO - lr: 8.1000e-06 gnorm: 0.46 [1 day, 20:25:40<1 day, 4:35:57] +[titan] 2025-09-09 14:01:40,149 - root - INFO - step: 24340 loss: 2.6555 memory: 122.03GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 0.7437 global_avg_top_loss: 1.9118 +[titan] 2025-09-09 14:01:40,149 - root - INFO - lr: 8.0966e-06 gnorm: 0.35 [1 day, 20:26:12<1 day, 4:35:23] +[titan] 2025-09-09 14:02:12,035 - root - INFO - step: 24345 loss: 3.1250 memory: 122.03GiB(87.57%) tps: 10,277 tflops: 489.78 mfu: 49.52% global_avg_ntp_loss: 1.0039 global_avg_top_loss: 2.1211 +[titan] 2025-09-09 14:02:12,035 - root - INFO - lr: 8.0933e-06 gnorm: 0.41 [1 day, 20:26:43<1 day, 4:34:50] +[titan] 2025-09-09 14:02:37,510 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:02:43,884 - root - INFO - step: 24350 loss: 2.7159 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.36 mfu: 49.58% global_avg_ntp_loss: 0.7667 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 14:02:43,885 - root - INFO - lr: 8.0899e-06 gnorm: 0.35 [1 day, 20:27:15<1 day, 4:34:16] +[titan] 2025-09-09 14:03:15,714 - root - INFO - step: 24355 loss: 2.6675 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.66 mfu: 49.61% global_avg_ntp_loss: 0.7449 global_avg_top_loss: 1.9226 +[titan] 2025-09-09 14:03:15,714 - root - INFO - lr: 8.0865e-06 gnorm: 0.35 [1 day, 20:27:47<1 day, 4:33:43] +[titan] 2025-09-09 14:03:47,788 - root - INFO - step: 24360 loss: 2.7716 memory: 122.03GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9810 +[titan] 2025-09-09 14:03:47,788 - root - INFO - lr: 8.0831e-06 gnorm: 0.37 [1 day, 20:28:19<1 day, 4:33:09] +[titan] 2025-09-09 14:04:19,866 - root - INFO - step: 24365 loss: 2.8802 memory: 122.03GiB(87.57%) tps: 10,215 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.8467 global_avg_top_loss: 2.0336 +[titan] 2025-09-09 14:04:19,866 - root - INFO - lr: 8.0797e-06 gnorm: 0.36 [1 day, 20:28:51<1 day, 4:32:36] +[titan] 2025-09-09 14:04:51,742 - root - INFO - step: 24370 loss: 2.7575 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.7834 global_avg_top_loss: 1.9741 +[titan] 2025-09-09 14:04:51,742 - root - INFO - lr: 8.0764e-06 gnorm: 0.37 [1 day, 20:29:23<1 day, 4:32:02] +[titan] 2025-09-09 14:05:23,958 - root - INFO - step: 24375 loss: 3.2172 memory: 122.03GiB(87.57%) tps: 10,172 tflops: 484.78 mfu: 49.02% global_avg_ntp_loss: 1.0461 global_avg_top_loss: 2.1712 +[titan] 2025-09-09 14:05:23,958 - root - INFO - lr: 8.0730e-06 gnorm: 0.37 [1 day, 20:29:55<1 day, 4:31:29] +[titan] 2025-09-09 14:05:55,934 - root - INFO - step: 24380 loss: 2.8040 memory: 122.03GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.8065 
global_avg_top_loss: 1.9975 +[titan] 2025-09-09 14:05:55,934 - root - INFO - lr: 8.0696e-06 gnorm: 0.34 [1 day, 20:30:27<1 day, 4:30:56] +[titan] 2025-09-09 14:06:27,730 - root - INFO - step: 24385 loss: 2.7604 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7856 global_avg_top_loss: 1.9748 +[titan] 2025-09-09 14:06:27,730 - root - INFO - lr: 8.0662e-06 gnorm: 0.36 [1 day, 20:30:59<1 day, 4:30:22] +[titan] 2025-09-09 14:06:59,607 - root - INFO - step: 24390 loss: 3.1832 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 1.0331 global_avg_top_loss: 2.1501 +[titan] 2025-09-09 14:06:59,607 - root - INFO - lr: 8.0629e-06 gnorm: 0.37 [1 day, 20:31:31<1 day, 4:29:49] +[titan] 2025-09-09 14:07:31,570 - root - INFO - step: 24395 loss: 3.1253 memory: 122.03GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.9935 global_avg_top_loss: 2.1319 +[titan] 2025-09-09 14:07:31,570 - root - INFO - lr: 8.0595e-06 gnorm: 0.34 [1 day, 20:32:03<1 day, 4:29:15] +[titan] 2025-09-09 14:07:57,159 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:08:03,581 - root - INFO - step: 24400 loss: 2.7249 memory: 122.03GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9550 +[titan] 2025-09-09 14:08:03,582 - root - INFO - lr: 8.0561e-06 gnorm: 0.35 [1 day, 20:32:35<1 day, 4:28:42] +[titan] 2025-09-09 14:08:35,429 - root - INFO - step: 24405 loss: 2.7365 memory: 122.03GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:08:35,430 - root - INFO - lr: 8.0527e-06 gnorm: 0.36 [1 day, 20:33:07<1 day, 4:28:08] +[titan] 2025-09-09 14:09:07,322 - root - INFO - step: 24410 loss: 2.5786 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8746 +[titan] 2025-09-09 14:09:07,322 - root - INFO - lr: 8.0494e-06 gnorm: 0.34 [1 day, 20:33:39<1 day, 4:27:35] +[titan] 2025-09-09 14:09:38,999 - root - INFO - step: 24415 loss: 2.7584 memory: 122.03GiB(87.57%) tps: 10,345 tflops: 493.01 mfu: 49.85% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9708 +[titan] 2025-09-09 14:09:39,000 - root - INFO - lr: 8.0460e-06 gnorm: 0.36 [1 day, 20:34:10<1 day, 4:27:01] +[titan] 2025-09-09 14:10:10,907 - root - INFO - step: 24420 loss: 2.7637 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 0.7908 global_avg_top_loss: 1.9729 +[titan] 2025-09-09 14:10:10,907 - root - INFO - lr: 8.0426e-06 gnorm: 0.35 [1 day, 20:34:42<1 day, 4:26:28] +[titan] 2025-09-09 14:10:42,813 - root - INFO - step: 24425 loss: 3.0768 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.9865 global_avg_top_loss: 2.0903 +[titan] 2025-09-09 14:10:42,813 - root - INFO - lr: 8.0393e-06 gnorm: 0.38 [1 day, 20:35:14<1 day, 4:25:54] +[titan] 2025-09-09 14:11:14,959 - root - INFO - step: 24430 loss: 2.6405 memory: 122.03GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7345 
global_avg_top_loss: 1.9060 +[titan] 2025-09-09 14:11:14,960 - root - INFO - lr: 8.0359e-06 gnorm: 0.37 [1 day, 20:35:46<1 day, 4:25:21] +[titan] 2025-09-09 14:11:46,820 - root - INFO - step: 24435 loss: 2.7144 memory: 122.03GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7668 global_avg_top_loss: 1.9476 +[titan] 2025-09-09 14:11:46,820 - root - INFO - lr: 8.0325e-06 gnorm: 0.36 [1 day, 20:36:18<1 day, 4:24:48] +[titan] 2025-09-09 14:12:18,694 - root - INFO - step: 24440 loss: 2.7474 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.96 mfu: 49.54% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9674 +[titan] 2025-09-09 14:12:18,695 - root - INFO - lr: 8.0291e-06 gnorm: 0.38 [1 day, 20:36:50<1 day, 4:24:14] +[titan] 2025-09-09 14:12:50,660 - root - INFO - step: 24445 loss: 2.7675 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7923 global_avg_top_loss: 1.9751 +[titan] 2025-09-09 14:12:50,660 - root - INFO - lr: 8.0258e-06 gnorm: 0.35 [1 day, 20:37:22<1 day, 4:23:41] +[titan] 2025-09-09 14:13:16,180 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:13:22,525 - root - INFO - step: 24450 loss: 2.6339 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.12 mfu: 49.56% global_avg_ntp_loss: 0.7252 global_avg_top_loss: 1.9088 +[titan] 2025-09-09 14:13:22,525 - root - INFO - lr: 8.0224e-06 gnorm: 0.53 [1 day, 20:37:54<1 day, 4:23:07] +[titan] 2025-09-09 14:13:54,340 - root - INFO - step: 24455 loss: 3.2545 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 1.0613 global_avg_top_loss: 2.1931 +[titan] 2025-09-09 14:13:54,340 - root - INFO - lr: 8.0190e-06 gnorm: 0.39 [1 day, 20:38:26<1 day, 4:22:34] +[titan] 2025-09-09 14:14:26,271 - root - INFO - step: 24460 loss: 2.7233 memory: 122.03GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9539 +[titan] 2025-09-09 14:14:26,272 - root - INFO - lr: 8.0157e-06 gnorm: 0.36 [1 day, 20:38:58<1 day, 4:22:00] +[titan] 2025-09-09 14:14:58,325 - root - INFO - step: 24465 loss: 2.7286 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.26% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 14:14:58,325 - root - INFO - lr: 8.0123e-06 gnorm: 0.36 [1 day, 20:39:30<1 day, 4:21:27] +[titan] 2025-09-09 14:15:30,262 - root - INFO - step: 24470 loss: 3.2392 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 1.0598 global_avg_top_loss: 2.1794 +[titan] 2025-09-09 14:15:30,262 - root - INFO - lr: 8.0089e-06 gnorm: 0.39 [1 day, 20:40:02<1 day, 4:20:53] +[titan] 2025-09-09 14:16:02,029 - root - INFO - step: 24475 loss: 2.7670 memory: 122.03GiB(87.57%) tps: 10,315 tflops: 491.62 mfu: 49.71% global_avg_ntp_loss: 0.7878 global_avg_top_loss: 1.9792 +[titan] 2025-09-09 14:16:02,029 - root - INFO - lr: 8.0056e-06 gnorm: 0.36 [1 day, 20:40:33<1 day, 4:20:20] +[titan] 2025-09-09 14:16:33,793 - root - INFO - step: 24480 loss: 2.7746 memory: 122.03GiB(87.57%) tps: 10,316 tflops: 491.67 mfu: 49.71% global_avg_ntp_loss: 0.7928 
global_avg_top_loss: 1.9818 +[titan] 2025-09-09 14:16:33,794 - root - INFO - lr: 8.0022e-06 gnorm: 0.35 [1 day, 20:41:05<1 day, 4:19:46] +[titan] 2025-09-09 14:17:05,709 - root - INFO - step: 24485 loss: 2.8442 memory: 122.03GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.8201 global_avg_top_loss: 2.0241 +[titan] 2025-09-09 14:17:05,710 - root - INFO - lr: 7.9988e-06 gnorm: 0.37 [1 day, 20:41:37<1 day, 4:19:13] +[titan] 2025-09-09 14:17:37,523 - root - INFO - step: 24490 loss: 2.7197 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.91 mfu: 49.64% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9532 +[titan] 2025-09-09 14:17:37,523 - root - INFO - lr: 7.9955e-06 gnorm: 0.35 [1 day, 20:42:09<1 day, 4:18:39] +[titan] 2025-09-09 14:18:09,650 - root - INFO - step: 24495 loss: 2.8103 memory: 122.03GiB(87.57%) tps: 10,200 tflops: 486.11 mfu: 49.15% global_avg_ntp_loss: 0.8092 global_avg_top_loss: 2.0011 +[titan] 2025-09-09 14:18:09,650 - root - INFO - lr: 7.9921e-06 gnorm: 0.34 [1 day, 20:42:41<1 day, 4:18:06] +[titan] 2025-09-09 14:18:35,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:18:41,666 - root - INFO - step: 24500 loss: 2.7095 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.7635 global_avg_top_loss: 1.9460 +[titan] 2025-09-09 14:18:41,666 - root - INFO - lr: 7.9887e-06 gnorm: 0.35 [1 day, 20:43:13<1 day, 4:17:33] +[titan] 2025-09-09 14:19:13,423 - root - INFO - step: 24505 loss: 3.1674 memory: 122.03GiB(87.57%) tps: 10,319 tflops: 491.78 mfu: 49.72% global_avg_ntp_loss: 1.0246 global_avg_top_loss: 2.1428 +[titan] 2025-09-09 14:19:13,423 - root - INFO - lr: 7.9854e-06 gnorm: 0.36 [1 day, 20:43:45<1 day, 4:16:59] +[titan] 2025-09-09 14:19:45,316 - root - INFO - step: 24510 loss: 2.7075 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.67 mfu: 49.51% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9453 +[titan] 2025-09-09 14:19:45,317 - root - INFO - lr: 7.9820e-06 gnorm: 0.34 [1 day, 20:44:17<1 day, 4:16:26] +[titan] 2025-09-09 14:20:17,117 - root - INFO - step: 24515 loss: 2.6781 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.7481 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 14:20:17,117 - root - INFO - lr: 7.9787e-06 gnorm: 0.38 [1 day, 20:44:48<1 day, 4:15:52] +[titan] 2025-09-09 14:20:49,105 - root - INFO - step: 24520 loss: 2.5948 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.37% global_avg_ntp_loss: 0.7119 global_avg_top_loss: 1.8829 +[titan] 2025-09-09 14:20:49,106 - root - INFO - lr: 7.9753e-06 gnorm: 0.41 [1 day, 20:45:20<1 day, 4:15:19] +[titan] 2025-09-09 14:21:21,055 - root - INFO - step: 24525 loss: 2.8212 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.8133 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 14:21:21,056 - root - INFO - lr: 7.9719e-06 gnorm: 0.38 [1 day, 20:45:52<1 day, 4:14:45] +[titan] 2025-09-09 14:21:53,160 - root - INFO - step: 24530 loss: 2.7070 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7627 
global_avg_top_loss: 1.9444 +[titan] 2025-09-09 14:21:53,160 - root - INFO - lr: 7.9686e-06 gnorm: 0.38 [1 day, 20:46:25<1 day, 4:14:12] +[titan] 2025-09-09 14:22:25,241 - root - INFO - step: 24535 loss: 2.7028 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.80 mfu: 49.22% global_avg_ntp_loss: 0.7609 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 14:22:25,242 - root - INFO - lr: 7.9652e-06 gnorm: 0.43 [1 day, 20:46:57<1 day, 4:13:39] +[titan] 2025-09-09 14:22:57,345 - root - INFO - step: 24540 loss: 2.7080 memory: 122.03GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7649 global_avg_top_loss: 1.9431 +[titan] 2025-09-09 14:22:57,345 - root - INFO - lr: 7.9618e-06 gnorm: 0.34 [1 day, 20:47:29<1 day, 4:13:05] +[titan] 2025-09-09 14:23:29,243 - root - INFO - step: 24545 loss: 2.7602 memory: 122.03GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.7864 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 14:23:29,244 - root - INFO - lr: 7.9585e-06 gnorm: 0.36 [1 day, 20:48:01<1 day, 4:12:32] +[titan] 2025-09-09 14:23:54,694 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:24:01,078 - root - INFO - step: 24550 loss: 2.7308 memory: 122.03GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 14:24:01,078 - root - INFO - lr: 7.9551e-06 gnorm: 0.42 [1 day, 20:48:32<1 day, 4:11:58] +[titan] 2025-09-09 14:24:33,037 - root - INFO - step: 24555 loss: 2.7520 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 14:24:33,037 - root - INFO - lr: 7.9518e-06 gnorm: 0.36 [1 day, 20:49:04<1 day, 4:11:25] +[titan] 2025-09-09 14:25:04,856 - root - INFO - step: 24560 loss: 2.7458 memory: 122.03GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 14:25:04,857 - root - INFO - lr: 7.9484e-06 gnorm: 0.38 [1 day, 20:49:36<1 day, 4:10:51] +[titan] 2025-09-09 14:25:36,808 - root - INFO - step: 24565 loss: 2.7147 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.7641 global_avg_top_loss: 1.9505 +[titan] 2025-09-09 14:25:36,809 - root - INFO - lr: 7.9451e-06 gnorm: 0.35 [1 day, 20:50:08<1 day, 4:10:18] +[titan] 2025-09-09 14:26:08,795 - root - INFO - step: 24570 loss: 2.6328 memory: 122.03GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.7274 global_avg_top_loss: 1.9054 +[titan] 2025-09-09 14:26:08,795 - root - INFO - lr: 7.9417e-06 gnorm: 0.37 [1 day, 20:50:40<1 day, 4:09:44] +[titan] 2025-09-09 14:26:40,892 - root - INFO - step: 24575 loss: 2.8019 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.8043 global_avg_top_loss: 1.9976 +[titan] 2025-09-09 14:26:40,893 - root - INFO - lr: 7.9383e-06 gnorm: 0.36 [1 day, 20:51:12<1 day, 4:09:11] +[titan] 2025-09-09 14:26:47,479 - root - INFO - Dumping profiler traces at step 24576 +[titan] 2025-09-09 14:26:47,548 - root - INFO - Finished dumping profiler traces in 
0.07 seconds +[titan] 2025-09-09 14:27:12,818 - root - INFO - step: 24580 loss: 2.7852 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9898 +[titan] 2025-09-09 14:27:12,818 - root - INFO - lr: 7.9350e-06 gnorm: 0.38 [1 day, 20:51:44<1 day, 4:08:38] +[titan] 2025-09-09 14:27:44,784 - root - INFO - step: 24585 loss: 3.1717 memory: 122.03GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 1.0256 global_avg_top_loss: 2.1461 +[titan] 2025-09-09 14:27:44,784 - root - INFO - lr: 7.9316e-06 gnorm: 0.34 [1 day, 20:52:16<1 day, 4:08:04] +[titan] 2025-09-09 14:28:16,876 - root - INFO - step: 24590 loss: 2.6412 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.20% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9068 +[titan] 2025-09-09 14:28:16,877 - root - INFO - lr: 7.9283e-06 gnorm: 0.34 [1 day, 20:52:48<1 day, 4:07:31] +[titan] 2025-09-09 14:28:48,799 - root - INFO - step: 24595 loss: 2.6406 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7305 global_avg_top_loss: 1.9101 +[titan] 2025-09-09 14:28:48,799 - root - INFO - lr: 7.9249e-06 gnorm: 0.36 [1 day, 20:53:20<1 day, 4:06:58] +[titan] 2025-09-09 14:29:14,287 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:29:20,650 - root - INFO - step: 24600 loss: 2.6756 memory: 122.03GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9271 +[titan] 2025-09-09 14:29:20,651 - root - INFO - lr: 7.9216e-06 gnorm: 0.35 [1 day, 20:53:52<1 day, 4:06:24] +[titan] 2025-09-09 14:29:52,464 - root - INFO - step: 24605 loss: 2.7393 memory: 122.03GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 14:29:52,465 - root - INFO - lr: 7.9182e-06 gnorm: 0.36 [1 day, 20:54:24<1 day, 4:05:51] +[titan] 2025-09-09 14:30:24,386 - root - INFO - step: 24610 loss: 2.7336 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 14:30:24,386 - root - INFO - lr: 7.9148e-06 gnorm: 0.36 [1 day, 20:54:56<1 day, 4:05:17] +[titan] 2025-09-09 14:30:56,469 - root - INFO - step: 24615 loss: 2.7362 memory: 122.03GiB(87.57%) tps: 10,214 tflops: 486.79 mfu: 49.22% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 14:30:56,469 - root - INFO - lr: 7.9115e-06 gnorm: 0.35 [1 day, 20:55:28<1 day, 4:04:44] +[titan] 2025-09-09 14:31:28,485 - root - INFO - step: 24620 loss: 2.7094 memory: 122.03GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.7571 global_avg_top_loss: 1.9523 +[titan] 2025-09-09 14:31:28,485 - root - INFO - lr: 7.9081e-06 gnorm: 0.54 [1 day, 20:56:00<1 day, 4:04:10] +[titan] 2025-09-09 14:32:00,388 - root - INFO - step: 24625 loss: 2.6534 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7373 global_avg_top_loss: 1.9160 +[titan] 2025-09-09 14:32:00,388 - root - INFO - lr: 7.9048e-06 gnorm: 0.37 [1 day, 20:56:32<1 day, 4:03:37] +[titan] 2025-09-09 14:32:32,442 - root - INFO - step: 24630 loss: 2.7817 memory: 122.03GiB(87.57%) tps: 10,223 tflops: 487.21 mfu: 49.26% global_avg_ntp_loss: 0.7958 
global_avg_top_loss: 1.9859 +[titan] 2025-09-09 14:32:32,443 - root - INFO - lr: 7.9014e-06 gnorm: 0.39 [1 day, 20:57:04<1 day, 4:03:04] +[titan] 2025-09-09 14:33:04,238 - root - INFO - step: 24635 loss: 2.7515 memory: 122.03GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7818 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 14:33:04,239 - root - INFO - lr: 7.8981e-06 gnorm: 0.34 [1 day, 20:57:36<1 day, 4:02:30] +[titan] 2025-09-09 14:33:36,037 - root - INFO - step: 24640 loss: 2.7131 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.13 mfu: 49.66% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9488 +[titan] 2025-09-09 14:33:36,038 - root - INFO - lr: 7.8947e-06 gnorm: 0.35 [1 day, 20:58:07<1 day, 4:01:57] +[titan] 2025-09-09 14:34:07,907 - root - INFO - step: 24645 loss: 2.7299 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.05 mfu: 49.55% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9570 +[titan] 2025-09-09 14:34:07,907 - root - INFO - lr: 7.8914e-06 gnorm: 0.34 [1 day, 20:58:39<1 day, 4:01:23] +[titan] 2025-09-09 14:34:33,349 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:34:39,733 - root - INFO - step: 24650 loss: 2.7870 memory: 122.03GiB(87.57%) tps: 10,296 tflops: 490.71 mfu: 49.62% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9893 +[titan] 2025-09-09 14:34:39,733 - root - INFO - lr: 7.8880e-06 gnorm: 0.36 [1 day, 20:59:11<1 day, 4:00:50] +[titan] 2025-09-09 14:35:11,660 - root - INFO - step: 24655 loss: 2.7815 memory: 122.03GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.7969 global_avg_top_loss: 1.9845 +[titan] 2025-09-09 14:35:11,661 - root - INFO - lr: 7.8847e-06 gnorm: 0.34 [1 day, 20:59:43<1 day, 4:00:16] +[titan] 2025-09-09 14:35:43,537 - root - INFO - step: 24660 loss: 2.7108 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9439 +[titan] 2025-09-09 14:35:43,538 - root - INFO - lr: 7.8813e-06 gnorm: 0.34 [1 day, 21:00:15<1 day, 3:59:43] +[titan] 2025-09-09 14:36:15,608 - root - INFO - step: 24665 loss: 2.7022 memory: 122.03GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9389 +[titan] 2025-09-09 14:36:15,609 - root - INFO - lr: 7.8780e-06 gnorm: 0.36 [1 day, 21:00:47<1 day, 3:59:09] +[titan] 2025-09-09 14:36:47,445 - root - INFO - step: 24670 loss: 2.6612 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9175 +[titan] 2025-09-09 14:36:47,446 - root - INFO - lr: 7.8746e-06 gnorm: 0.33 [1 day, 21:01:19<1 day, 3:58:36] +[titan] 2025-09-09 14:37:19,288 - root - INFO - step: 24675 loss: 2.7244 memory: 122.03GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9526 +[titan] 2025-09-09 14:37:19,288 - root - INFO - lr: 7.8713e-06 gnorm: 0.37 [1 day, 21:01:51<1 day, 3:58:03] +[titan] 2025-09-09 14:37:51,363 - root - INFO - step: 24680 loss: 2.7588 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7815 
global_avg_top_loss: 1.9773 +[titan] 2025-09-09 14:37:51,363 - root - INFO - lr: 7.8679e-06 gnorm: 0.35 [1 day, 21:02:23<1 day, 3:57:29] +[titan] 2025-09-09 14:38:23,298 - root - INFO - step: 24685 loss: 2.7681 memory: 122.03GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9790 +[titan] 2025-09-09 14:38:23,299 - root - INFO - lr: 7.8646e-06 gnorm: 0.39 [1 day, 21:02:55<1 day, 3:56:56] +[titan] 2025-09-09 14:38:55,253 - root - INFO - step: 24690 loss: 2.7401 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7722 global_avg_top_loss: 1.9679 +[titan] 2025-09-09 14:38:55,253 - root - INFO - lr: 7.8613e-06 gnorm: 0.37 [1 day, 21:03:27<1 day, 3:56:22] +[titan] 2025-09-09 14:39:26,993 - root - INFO - step: 24695 loss: 2.8241 memory: 122.03GiB(87.57%) tps: 10,324 tflops: 492.05 mfu: 49.75% global_avg_ntp_loss: 0.8151 global_avg_top_loss: 2.0090 +[titan] 2025-09-09 14:39:26,993 - root - INFO - lr: 7.8579e-06 gnorm: 0.35 [1 day, 21:03:58<1 day, 3:55:49] +[titan] 2025-09-09 14:39:52,703 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:39:59,087 - root - INFO - step: 24700 loss: 2.8121 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.61 mfu: 49.20% global_avg_ntp_loss: 0.7910 global_avg_top_loss: 2.0212 +[titan] 2025-09-09 14:39:59,088 - root - INFO - lr: 7.8546e-06 gnorm: 3.70 [1 day, 21:04:30<1 day, 3:55:16] +[titan] 2025-09-09 14:40:30,950 - root - INFO - step: 24705 loss: 2.7303 memory: 122.03GiB(87.57%) tps: 10,284 tflops: 490.15 mfu: 49.56% global_avg_ntp_loss: 0.7716 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 14:40:30,950 - root - INFO - lr: 7.8512e-06 gnorm: 0.35 [1 day, 21:05:02<1 day, 3:54:42] +[titan] 2025-09-09 14:41:03,059 - root - INFO - step: 24710 loss: 2.6409 memory: 122.03GiB(87.57%) tps: 10,205 tflops: 486.38 mfu: 49.18% global_avg_ntp_loss: 0.7367 global_avg_top_loss: 1.9042 +[titan] 2025-09-09 14:41:03,060 - root - INFO - lr: 7.8479e-06 gnorm: 0.35 [1 day, 21:05:34<1 day, 3:54:09] +[titan] 2025-09-09 14:41:34,968 - root - INFO - step: 24715 loss: 2.7260 memory: 122.03GiB(87.57%) tps: 10,270 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 14:41:34,968 - root - INFO - lr: 7.8445e-06 gnorm: 0.35 [1 day, 21:06:06<1 day, 3:53:35] +[titan] 2025-09-09 14:42:06,966 - root - INFO - step: 24720 loss: 2.7557 memory: 122.03GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7831 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 14:42:06,967 - root - INFO - lr: 7.8412e-06 gnorm: 0.36 [1 day, 21:06:38<1 day, 3:53:02] +[titan] 2025-09-09 14:42:38,908 - root - INFO - step: 24725 loss: 2.6834 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9313 +[titan] 2025-09-09 14:42:38,909 - root - INFO - lr: 7.8378e-06 gnorm: 0.34 [1 day, 21:07:10<1 day, 3:52:29] +[titan] 2025-09-09 14:43:10,814 - root - INFO - step: 24730 loss: 2.7838 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7999 
global_avg_top_loss: 1.9839 +[titan] 2025-09-09 14:43:10,815 - root - INFO - lr: 7.8345e-06 gnorm: 0.35 [1 day, 21:07:42<1 day, 3:51:55] +[titan] 2025-09-09 14:43:42,705 - root - INFO - step: 24735 loss: 2.7348 memory: 122.03GiB(87.57%) tps: 10,275 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9608 +[titan] 2025-09-09 14:43:42,705 - root - INFO - lr: 7.8312e-06 gnorm: 0.35 [1 day, 21:08:14<1 day, 3:51:22] +[titan] 2025-09-09 14:44:14,645 - root - INFO - step: 24740 loss: 2.7169 memory: 122.03GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9499 +[titan] 2025-09-09 14:44:14,645 - root - INFO - lr: 7.8278e-06 gnorm: 0.34 [1 day, 21:08:46<1 day, 3:50:48] +[titan] 2025-09-09 14:44:46,722 - root - INFO - step: 24745 loss: 2.6963 memory: 122.03GiB(87.57%) tps: 10,216 tflops: 486.87 mfu: 49.23% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 14:44:46,723 - root - INFO - lr: 7.8245e-06 gnorm: 0.36 [1 day, 21:09:18<1 day, 3:50:15] +[titan] 2025-09-09 14:45:12,224 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:45:18,632 - root - INFO - step: 24750 loss: 2.7410 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.42 mfu: 49.49% global_avg_ntp_loss: 0.7766 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 14:45:18,632 - root - INFO - lr: 7.8211e-06 gnorm: 0.36 [1 day, 21:09:50<1 day, 3:49:42] +[titan] 2025-09-09 14:45:50,412 - root - INFO - step: 24755 loss: 2.6954 memory: 122.03GiB(87.57%) tps: 10,311 tflops: 491.42 mfu: 49.69% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 14:45:50,413 - root - INFO - lr: 7.8178e-06 gnorm: 0.35 [1 day, 21:10:22<1 day, 3:49:08] +[titan] 2025-09-09 14:46:22,365 - root - INFO - step: 24760 loss: 3.7430 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 1.3471 global_avg_top_loss: 2.3960 +[titan] 2025-09-09 14:46:22,365 - root - INFO - lr: 7.8145e-06 gnorm: 0.39 [1 day, 21:10:54<1 day, 3:48:35] +[titan] 2025-09-09 14:46:54,412 - root - INFO - step: 24765 loss: 2.7549 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.7850 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 14:46:54,412 - root - INFO - lr: 7.8111e-06 gnorm: 0.34 [1 day, 21:11:26<1 day, 3:48:01] +[titan] 2025-09-09 14:47:26,371 - root - INFO - step: 24770 loss: 2.8128 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.8189 global_avg_top_loss: 1.9939 +[titan] 2025-09-09 14:47:26,371 - root - INFO - lr: 7.8078e-06 gnorm: 0.34 [1 day, 21:11:58<1 day, 3:47:28] +[titan] 2025-09-09 14:47:58,294 - root - INFO - step: 24775 loss: 2.7395 memory: 122.03GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:47:58,295 - root - INFO - lr: 7.8044e-06 gnorm: 0.37 [1 day, 21:12:30<1 day, 3:46:54] +[titan] 2025-09-09 14:48:30,389 - root - INFO - step: 24780 loss: 2.7780 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7968 
global_avg_top_loss: 1.9812 +[titan] 2025-09-09 14:48:30,390 - root - INFO - lr: 7.8011e-06 gnorm: 0.35 [1 day, 21:13:02<1 day, 3:46:21] +[titan] 2025-09-09 14:49:02,456 - root - INFO - step: 24785 loss: 2.7093 memory: 122.03GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7584 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 14:49:02,457 - root - INFO - lr: 7.7978e-06 gnorm: 0.41 [1 day, 21:13:34<1 day, 3:45:48] +[titan] 2025-09-09 14:49:34,287 - root - INFO - step: 24790 loss: 2.6898 memory: 122.03GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7607 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 14:49:34,287 - root - INFO - lr: 7.7944e-06 gnorm: 0.34 [1 day, 21:14:06<1 day, 3:45:14] +[titan] 2025-09-09 14:50:06,094 - root - INFO - step: 24795 loss: 3.1757 memory: 122.03GiB(87.57%) tps: 10,302 tflops: 490.99 mfu: 49.65% global_avg_ntp_loss: 1.0245 global_avg_top_loss: 2.1511 +[titan] 2025-09-09 14:50:06,095 - root - INFO - lr: 7.7911e-06 gnorm: 0.36 [1 day, 21:14:37<1 day, 3:44:41] +[titan] 2025-09-09 14:50:31,674 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:50:38,103 - root - INFO - step: 24800 loss: 2.7427 memory: 122.03GiB(87.57%) tps: 10,238 tflops: 487.92 mfu: 49.33% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 14:50:38,103 - root - INFO - lr: 7.7878e-06 gnorm: 0.36 [1 day, 21:15:09<1 day, 3:44:08] +[titan] 2025-09-09 14:51:10,096 - root - INFO - step: 24805 loss: 2.7060 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.15 mfu: 49.36% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 14:51:10,097 - root - INFO - lr: 7.7844e-06 gnorm: 0.37 [1 day, 21:15:41<1 day, 3:43:34] +[titan] 2025-09-09 14:51:42,191 - root - INFO - step: 24810 loss: 2.7922 memory: 122.03GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.7986 global_avg_top_loss: 1.9936 +[titan] 2025-09-09 14:51:42,191 - root - INFO - lr: 7.7811e-06 gnorm: 0.36 [1 day, 21:16:14<1 day, 3:43:01] +[titan] 2025-09-09 14:52:14,027 - root - INFO - step: 24815 loss: 2.7399 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.55 mfu: 49.60% global_avg_ntp_loss: 0.7776 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 14:52:14,028 - root - INFO - lr: 7.7778e-06 gnorm: 0.34 [1 day, 21:16:45<1 day, 3:42:27] +[titan] 2025-09-09 14:52:45,978 - root - INFO - step: 24820 loss: 2.8046 memory: 122.03GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.8083 global_avg_top_loss: 1.9964 +[titan] 2025-09-09 14:52:45,978 - root - INFO - lr: 7.7744e-06 gnorm: 0.37 [1 day, 21:17:17<1 day, 3:41:54] +[titan] 2025-09-09 14:53:18,076 - root - INFO - step: 24825 loss: 2.7670 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.55 mfu: 49.20% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 14:53:18,077 - root - INFO - lr: 7.7711e-06 gnorm: 0.35 [1 day, 21:17:49<1 day, 3:41:21] +[titan] 2025-09-09 14:53:50,118 - root - INFO - step: 24830 loss: 2.5914 memory: 122.03GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 0.7095 
global_avg_top_loss: 1.8820 +[titan] 2025-09-09 14:53:50,118 - root - INFO - lr: 7.7678e-06 gnorm: 0.33 [1 day, 21:18:21<1 day, 3:40:47] +[titan] 2025-09-09 14:54:22,208 - root - INFO - step: 24835 loss: 2.7687 memory: 122.03GiB(87.57%) tps: 10,211 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.7876 global_avg_top_loss: 1.9811 +[titan] 2025-09-09 14:54:22,208 - root - INFO - lr: 7.7644e-06 gnorm: 0.36 [1 day, 21:18:54<1 day, 3:40:14] +[titan] 2025-09-09 14:54:54,169 - root - INFO - step: 24840 loss: 3.1903 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 1.0353 global_avg_top_loss: 2.1550 +[titan] 2025-09-09 14:54:54,169 - root - INFO - lr: 7.7611e-06 gnorm: 0.36 [1 day, 21:19:25<1 day, 3:39:41] +[titan] 2025-09-09 14:55:26,267 - root - INFO - step: 24845 loss: 2.7710 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.20% global_avg_ntp_loss: 0.7889 global_avg_top_loss: 1.9821 +[titan] 2025-09-09 14:55:26,268 - root - INFO - lr: 7.7578e-06 gnorm: 0.45 [1 day, 21:19:58<1 day, 3:39:07] +[titan] 2025-09-09 14:55:51,734 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 14:55:58,123 - root - INFO - step: 24850 loss: 2.6850 memory: 122.03GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 14:55:58,124 - root - INFO - lr: 7.7544e-06 gnorm: 0.38 [1 day, 21:20:29<1 day, 3:38:34] +[titan] 2025-09-09 14:56:30,002 - root - INFO - step: 24855 loss: 2.6466 memory: 122.03GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7419 global_avg_top_loss: 1.9047 +[titan] 2025-09-09 14:56:30,002 - root - INFO - lr: 7.7511e-06 gnorm: 0.35 [1 day, 21:21:01<1 day, 3:38:00] +[titan] 2025-09-09 14:57:01,912 - root - INFO - step: 24860 loss: 2.8290 memory: 122.03GiB(87.57%) tps: 10,269 tflops: 489.41 mfu: 49.49% global_avg_ntp_loss: 0.8163 global_avg_top_loss: 2.0127 +[titan] 2025-09-09 14:57:01,913 - root - INFO - lr: 7.7478e-06 gnorm: 0.39 [1 day, 21:21:33<1 day, 3:37:27] +[titan] 2025-09-09 14:57:33,748 - root - INFO - step: 24865 loss: 2.7765 memory: 122.03GiB(87.57%) tps: 10,293 tflops: 490.57 mfu: 49.60% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9806 +[titan] 2025-09-09 14:57:33,748 - root - INFO - lr: 7.7445e-06 gnorm: 0.39 [1 day, 21:22:05<1 day, 3:36:54] +[titan] 2025-09-09 14:58:05,708 - root - INFO - step: 24870 loss: 2.5812 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7076 global_avg_top_loss: 1.8736 +[titan] 2025-09-09 14:58:05,708 - root - INFO - lr: 7.7411e-06 gnorm: 0.38 [1 day, 21:22:37<1 day, 3:36:20] +[titan] 2025-09-09 14:58:37,698 - root - INFO - step: 24875 loss: 2.7751 memory: 122.03GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 0.7920 global_avg_top_loss: 1.9831 +[titan] 2025-09-09 14:58:37,698 - root - INFO - lr: 7.7378e-06 gnorm: 0.35 [1 day, 21:23:09<1 day, 3:35:47] +[titan] 2025-09-09 14:59:09,602 - root - INFO - step: 24880 loss: 2.8366 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.8213 
global_avg_top_loss: 2.0153 +[titan] 2025-09-09 14:59:09,603 - root - INFO - lr: 7.7345e-06 gnorm: 0.37 [1 day, 21:23:41<1 day, 3:35:13] +[titan] 2025-09-09 14:59:41,691 - root - INFO - step: 24885 loss: 2.7641 memory: 122.03GiB(87.57%) tps: 10,212 tflops: 486.70 mfu: 49.21% global_avg_ntp_loss: 0.7857 global_avg_top_loss: 1.9783 +[titan] 2025-09-09 14:59:41,691 - root - INFO - lr: 7.7312e-06 gnorm: 0.36 [1 day, 21:24:13<1 day, 3:34:40] +[titan] 2025-09-09 15:00:13,595 - root - INFO - step: 24890 loss: 2.7429 memory: 122.03GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7790 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 15:00:13,595 - root - INFO - lr: 7.7278e-06 gnorm: 0.36 [1 day, 21:24:45<1 day, 3:34:07] +[titan] 2025-09-09 15:00:45,716 - root - INFO - step: 24895 loss: 2.7298 memory: 122.03GiB(87.57%) tps: 10,202 tflops: 486.20 mfu: 49.16% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9559 +[titan] 2025-09-09 15:00:45,717 - root - INFO - lr: 7.7245e-06 gnorm: 0.36 [1 day, 21:25:17<1 day, 3:33:33] +[titan] 2025-09-09 15:01:11,130 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:01:17,533 - root - INFO - step: 24900 loss: 3.1743 memory: 122.03GiB(87.57%) tps: 10,299 tflops: 490.86 mfu: 49.63% global_avg_ntp_loss: 1.0251 global_avg_top_loss: 2.1492 +[titan] 2025-09-09 15:01:17,533 - root - INFO - lr: 7.7212e-06 gnorm: 0.51 [1 day, 21:25:49<1 day, 3:33:00] +[titan] 2025-09-09 15:01:49,285 - root - INFO - step: 24905 loss: 2.7488 memory: 122.03GiB(87.57%) tps: 10,320 tflops: 491.84 mfu: 49.73% global_avg_ntp_loss: 0.7819 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 15:01:49,286 - root - INFO - lr: 7.7178e-06 gnorm: 0.36 [1 day, 21:26:21<1 day, 3:32:27] +[titan] 2025-09-09 15:02:21,257 - root - INFO - step: 24910 loss: 2.7033 memory: 122.03GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.7582 global_avg_top_loss: 1.9451 +[titan] 2025-09-09 15:02:21,257 - root - INFO - lr: 7.7145e-06 gnorm: 0.36 [1 day, 21:26:53<1 day, 3:31:53] +[titan] 2025-09-09 15:02:53,178 - root - INFO - step: 24915 loss: 2.7503 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.7838 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 15:02:53,179 - root - INFO - lr: 7.7112e-06 gnorm: 0.35 [1 day, 21:27:24<1 day, 3:31:20] +[titan] 2025-09-09 15:03:24,949 - root - INFO - step: 24920 loss: 3.1861 memory: 122.03GiB(87.57%) tps: 10,314 tflops: 491.57 mfu: 49.70% global_avg_ntp_loss: 1.0351 global_avg_top_loss: 2.1510 +[titan] 2025-09-09 15:03:24,949 - root - INFO - lr: 7.7079e-06 gnorm: 0.36 [1 day, 21:27:56<1 day, 3:30:46] +[titan] 2025-09-09 15:03:56,819 - root - INFO - step: 24925 loss: 2.7772 memory: 122.03GiB(87.57%) tps: 10,282 tflops: 490.03 mfu: 49.55% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9817 +[titan] 2025-09-09 15:03:56,820 - root - INFO - lr: 7.7046e-06 gnorm: 0.37 [1 day, 21:28:28<1 day, 3:30:13] +[titan] 2025-09-09 15:04:28,916 - root - INFO - step: 24930 loss: 2.7256 memory: 122.03GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.7724 
global_avg_top_loss: 1.9532 +[titan] 2025-09-09 15:04:28,917 - root - INFO - lr: 7.7012e-06 gnorm: 0.36 [1 day, 21:29:00<1 day, 3:29:40] +[titan] 2025-09-09 15:05:00,579 - root - INFO - step: 24935 loss: 2.6950 memory: 122.03GiB(87.57%) tps: 10,349 tflops: 493.24 mfu: 49.87% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 15:05:00,579 - root - INFO - lr: 7.6979e-06 gnorm: 0.36 [1 day, 21:29:32<1 day, 3:29:06] +[titan] 2025-09-09 15:05:32,534 - root - INFO - step: 24940 loss: 2.6885 memory: 122.03GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 15:05:32,534 - root - INFO - lr: 7.6946e-06 gnorm: 0.34 [1 day, 21:30:04<1 day, 3:28:33] +[titan] 2025-09-09 15:06:04,529 - root - INFO - step: 24945 loss: 2.6884 memory: 122.03GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9305 +[titan] 2025-09-09 15:06:04,530 - root - INFO - lr: 7.6913e-06 gnorm: 0.36 [1 day, 21:30:36<1 day, 3:27:59] +[titan] 2025-09-09 15:06:30,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:06:36,469 - root - INFO - step: 24950 loss: 2.7473 memory: 122.03GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7793 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 15:06:36,470 - root - INFO - lr: 7.6880e-06 gnorm: 0.34 [1 day, 21:31:08<1 day, 3:27:26] +[titan] 2025-09-09 15:07:08,507 - root - INFO - step: 24955 loss: 2.9943 memory: 122.03GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.9099 global_avg_top_loss: 2.0844 +[titan] 2025-09-09 15:07:08,507 - root - INFO - lr: 7.6846e-06 gnorm: 0.35 [1 day, 21:31:40<1 day, 3:26:53] +[titan] 2025-09-09 15:07:40,317 - root - INFO - step: 24960 loss: 2.7376 memory: 122.03GiB(87.57%) tps: 10,301 tflops: 490.95 mfu: 49.64% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 15:07:40,318 - root - INFO - lr: 7.6813e-06 gnorm: 0.35 [1 day, 21:32:12<1 day, 3:26:19] +[titan] 2025-09-09 15:08:12,213 - root - INFO - step: 24965 loss: 2.7561 memory: 122.03GiB(87.57%) tps: 10,274 tflops: 489.64 mfu: 49.51% global_avg_ntp_loss: 0.7854 global_avg_top_loss: 1.9707 +[titan] 2025-09-09 15:08:12,213 - root - INFO - lr: 7.6780e-06 gnorm: 0.35 [1 day, 21:32:44<1 day, 3:25:46] +[titan] 2025-09-09 15:08:44,173 - root - INFO - step: 24970 loss: 3.2382 memory: 122.03GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 1.0568 global_avg_top_loss: 2.1814 +[titan] 2025-09-09 15:08:44,173 - root - INFO - lr: 7.6747e-06 gnorm: 0.36 [1 day, 21:33:15<1 day, 3:25:12] +[titan] 2025-09-09 15:09:16,220 - root - INFO - step: 24975 loss: 2.7016 memory: 122.03GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7614 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 15:09:16,221 - root - INFO - lr: 7.6714e-06 gnorm: 0.35 [1 day, 21:33:48<1 day, 3:24:39] +[titan] 2025-09-09 15:09:48,045 - root - INFO - step: 24980 loss: 2.8174 memory: 122.03GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.8100 
global_avg_top_loss: 2.0074 +[titan] 2025-09-09 15:09:48,045 - root - INFO - lr: 7.6680e-06 gnorm: 0.36 [1 day, 21:34:19<1 day, 3:24:06] +[titan] 2025-09-09 15:10:19,922 - root - INFO - step: 24985 loss: 2.7007 memory: 122.03GiB(87.57%) tps: 10,280 tflops: 489.93 mfu: 49.54% global_avg_ntp_loss: 0.7611 global_avg_top_loss: 1.9396 +[titan] 2025-09-09 15:10:19,922 - root - INFO - lr: 7.6647e-06 gnorm: 0.38 [1 day, 21:34:51<1 day, 3:23:32] +[titan] 2025-09-09 15:10:51,843 - root - INFO - step: 24990 loss: 2.7268 memory: 122.03GiB(87.57%) tps: 10,266 tflops: 489.26 mfu: 49.47% global_avg_ntp_loss: 0.7742 global_avg_top_loss: 1.9525 +[titan] 2025-09-09 15:10:51,843 - root - INFO - lr: 7.6614e-06 gnorm: 0.34 [1 day, 21:35:23<1 day, 3:22:59] +[titan] 2025-09-09 15:11:23,641 - root - INFO - step: 24995 loss: 2.7406 memory: 122.03GiB(87.57%) tps: 10,305 tflops: 491.15 mfu: 49.66% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 15:11:23,641 - root - INFO - lr: 7.6581e-06 gnorm: 0.35 [1 day, 21:35:55<1 day, 3:22:25] +[titan] 2025-09-09 15:11:49,083 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:11:55,515 - root - INFO - step: 25000 loss: 3.2441 memory: 122.03GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 1.0620 global_avg_top_loss: 2.1820 +[titan] 2025-09-09 15:11:55,515 - root - INFO - lr: 7.6548e-06 gnorm: 0.34 [1 day, 21:36:27<1 day, 3:21:52] +[titan] 2025-09-09 15:11:55,515 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-09 15:12:27,709 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-09-09 15:12:27,710 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 32.19 seconds. +[titan] 2025-09-09 15:12:27,710 - root - INFO - Ensuring repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 exists... 
+[titan] 2025-09-09 15:12:28,272 - root - INFO - Repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 ensured. +[titan] 2025-09-09 15:12:28,272 - root - INFO - Uploading exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint/step-25000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757/step-25000 on Hugging Face Hub... +Processing Files (9 / 9) : 100%|██████████| 83.3GB / 83.3GB, 0.00B/s +New Data Upload : 100%|██████████| 83.3GB / 83.3GB, 0.00B/s + ...ine/checkpoint/step-25000/.metadata: 100%|██████████| 2.47MB / 2.47MB + .../checkpoint/step-25000/__1_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__4_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__7_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__2_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__0_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__5_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__3_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-25000/__6_0.distcp: 100%|██████████| 10.4GB / 10.4GB +[titan] 2025-09-09 15:30:21,940 - root - INFO - Successfully uploaded step 25000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757. 
+[titan] 2025-09-09 15:30:51,806 - root - INFO - step: 25005 loss: 2.6619 memory: 122.04GiB(87.57%) tps: 288 tflops: 13.74 mfu: 1.39% global_avg_ntp_loss: 0.7419 global_avg_top_loss: 1.9200 +[titan] 2025-09-09 15:30:51,806 - root - INFO - lr: 7.6515e-06 gnorm: 0.34 [1 day, 21:55:23<1 day, 3:32:21] +[titan] 2025-09-09 15:31:21,791 - root - INFO - step: 25010 loss: 2.6994 memory: 122.04GiB(87.57%) tps: 10,929 tflops: 520.85 mfu: 52.66% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 15:31:21,791 - root - INFO - lr: 7.6482e-06 gnorm: 0.34 [1 day, 21:55:53<1 day, 3:31:46] +[titan] 2025-09-09 15:31:51,816 - root - INFO - step: 25015 loss: 2.7680 memory: 122.04GiB(87.57%) tps: 10,914 tflops: 520.14 mfu: 52.59% global_avg_ntp_loss: 0.7980 global_avg_top_loss: 1.9700 +[titan] 2025-09-09 15:31:51,817 - root - INFO - lr: 7.6448e-06 gnorm: 0.39 [1 day, 21:56:23<1 day, 3:31:11] +[titan] 2025-09-09 15:32:22,100 - root - INFO - step: 25020 loss: 2.8093 memory: 122.04GiB(87.57%) tps: 10,821 tflops: 515.70 mfu: 52.14% global_avg_ntp_loss: 0.8147 global_avg_top_loss: 1.9946 +[titan] 2025-09-09 15:32:22,101 - root - INFO - lr: 7.6415e-06 gnorm: 0.35 [1 day, 21:56:53<1 day, 3:30:36] +[titan] 2025-09-09 15:32:52,563 - root - INFO - step: 25025 loss: 2.7062 memory: 122.04GiB(87.57%) tps: 10,757 tflops: 512.67 mfu: 51.84% global_avg_ntp_loss: 0.7619 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 15:32:52,564 - root - INFO - lr: 7.6382e-06 gnorm: 0.34 [1 day, 21:57:24<1 day, 3:30:02] +[titan] 2025-09-09 15:33:23,014 - root - INFO - step: 25030 loss: 2.7342 memory: 122.04GiB(87.57%) tps: 10,762 tflops: 512.89 mfu: 51.86% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9612 +[titan] 2025-09-09 15:33:23,014 - root - INFO - lr: 7.6349e-06 gnorm: 0.35 [1 day, 21:57:54<1 day, 3:29:27] +[titan] 2025-09-09 15:33:53,496 - root - INFO - step: 25035 loss: 2.7302 memory: 122.04GiB(87.57%) tps: 10,750 tflops: 512.34 mfu: 51.80% global_avg_ntp_loss: 0.7741 
global_avg_top_loss: 1.9561 +[titan] 2025-09-09 15:33:53,497 - root - INFO - lr: 7.6316e-06 gnorm: 0.36 [1 day, 21:58:25<1 day, 3:28:52] +[titan] 2025-09-09 15:34:24,349 - root - INFO - step: 25040 loss: 2.6863 memory: 122.04GiB(87.57%) tps: 10,621 tflops: 506.19 mfu: 51.18% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9313 +[titan] 2025-09-09 15:34:24,349 - root - INFO - lr: 7.6283e-06 gnorm: 0.35 [1 day, 21:58:56<1 day, 3:28:18] +[titan] 2025-09-09 15:34:55,417 - root - INFO - step: 25045 loss: 2.7444 memory: 122.04GiB(87.57%) tps: 10,548 tflops: 502.69 mfu: 50.83% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 15:34:55,417 - root - INFO - lr: 7.6250e-06 gnorm: 0.34 [1 day, 21:59:27<1 day, 3:27:44] +[titan] 2025-09-09 15:35:20,334 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:35:26,657 - root - INFO - step: 25050 loss: 3.2923 memory: 122.04GiB(87.57%) tps: 10,489 tflops: 499.92 mfu: 50.55% global_avg_ntp_loss: 1.0811 global_avg_top_loss: 2.2112 +[titan] 2025-09-09 15:35:26,658 - root - INFO - lr: 7.6217e-06 gnorm: 0.37 [1 day, 21:59:58<1 day, 3:27:10] +[titan] 2025-09-09 15:35:58,018 - root - INFO - step: 25055 loss: 2.7621 memory: 122.04GiB(87.57%) tps: 10,449 tflops: 497.99 mfu: 50.35% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 15:35:58,019 - root - INFO - lr: 7.6184e-06 gnorm: 0.34 [1 day, 22:00:29<1 day, 3:26:36] +[titan] 2025-09-09 15:36:29,514 - root - INFO - step: 25060 loss: 2.6972 memory: 122.04GiB(87.57%) tps: 10,404 tflops: 495.85 mfu: 50.14% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9317 +[titan] 2025-09-09 15:36:29,515 - root - INFO - lr: 7.6151e-06 gnorm: 0.36 [1 day, 22:01:01<1 day, 3:26:02] +[titan] 2025-09-09 15:37:01,088 - root - INFO - step: 25065 loss: 2.7690 memory: 122.04GiB(87.57%) tps: 10,379 tflops: 494.64 mfu: 50.01% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9790 +[titan] 2025-09-09 15:37:01,089 - root - 
INFO - lr: 7.6117e-06 gnorm: 0.36 [1 day, 22:01:32<1 day, 3:25:28] +[titan] 2025-09-09 15:37:32,800 - root - INFO - step: 25070 loss: 2.6993 memory: 122.04GiB(87.57%) tps: 10,333 tflops: 492.48 mfu: 49.80% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9415 +[titan] 2025-09-09 15:37:32,801 - root - INFO - lr: 7.6084e-06 gnorm: 0.34 [1 day, 22:02:04<1 day, 3:24:54] +[titan] 2025-09-09 15:38:04,486 - root - INFO - step: 25075 loss: 2.7475 memory: 122.04GiB(87.57%) tps: 10,342 tflops: 492.88 mfu: 49.84% global_avg_ntp_loss: 0.7803 global_avg_top_loss: 1.9672 +[titan] 2025-09-09 15:38:04,487 - root - INFO - lr: 7.6051e-06 gnorm: 0.36 [1 day, 22:02:36<1 day, 3:24:20] +[titan] 2025-09-09 15:38:36,007 - root - INFO - step: 25080 loss: 3.2259 memory: 122.04GiB(87.57%) tps: 10,396 tflops: 495.47 mfu: 50.10% global_avg_ntp_loss: 1.0516 global_avg_top_loss: 2.1744 +[titan] 2025-09-09 15:38:36,007 - root - INFO - lr: 7.6018e-06 gnorm: 0.37 [1 day, 22:03:07<1 day, 3:23:46] +[titan] 2025-09-09 15:39:07,903 - root - INFO - step: 25085 loss: 2.8085 memory: 122.04GiB(87.57%) tps: 10,274 tflops: 489.63 mfu: 49.51% global_avg_ntp_loss: 0.8177 global_avg_top_loss: 1.9908 +[titan] 2025-09-09 15:39:07,903 - root - INFO - lr: 7.5985e-06 gnorm: 0.36 [1 day, 22:03:39<1 day, 3:23:12] +[titan] 2025-09-09 15:39:27,232 - root - INFO - Dumping profiler traces at step 25088 +[titan] 2025-09-09 15:39:27,290 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 15:39:39,964 - root - INFO - step: 25090 loss: 2.7552 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7837 global_avg_top_loss: 1.9715 +[titan] 2025-09-09 15:39:39,964 - root - INFO - lr: 7.5952e-06 gnorm: 0.35 [1 day, 22:04:11<1 day, 3:22:39] +[titan] 2025-09-09 15:40:11,634 - root - INFO - step: 25095 loss: 2.6368 memory: 122.04GiB(87.57%) tps: 10,347 tflops: 493.14 mfu: 49.86% global_avg_ntp_loss: 0.7343 global_avg_top_loss: 1.9025 +[titan] 2025-09-09 15:40:11,634 
- root - INFO - lr: 7.5919e-06 gnorm: 0.35 [1 day, 22:04:43<1 day, 3:22:05] +[titan] 2025-09-09 15:40:37,141 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-09-09 15:40:43,505 - root - INFO - step: 25100 loss: 2.7106 memory: 122.04GiB(87.57%) tps: 10,282 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7623 global_avg_top_loss: 1.9483 +[titan] 2025-09-09 15:40:43,505 - root - INFO - lr: 7.5886e-06 gnorm: 0.36 [1 day, 22:05:15<1 day, 3:21:31] +[titan] 2025-09-09 15:41:15,404 - root - INFO - step: 25105 loss: 2.8491 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.8388 global_avg_top_loss: 2.0103 +[titan] 2025-09-09 15:41:15,404 - root - INFO - lr: 7.5853e-06 gnorm: 0.36 [1 day, 22:05:47<1 day, 3:20:57] +[titan] 2025-09-09 15:41:47,217 - root - INFO - step: 25110 loss: 3.1185 memory: 122.04GiB(87.57%) tps: 10,300 tflops: 490.91 mfu: 49.64% global_avg_ntp_loss: 0.9999 global_avg_top_loss: 2.1186 +[titan] 2025-09-09 15:41:47,217 - root - INFO - lr: 7.5820e-06 gnorm: 0.36 [1 day, 22:06:18<1 day, 3:20:24] +[titan] 2025-09-09 15:42:19,236 - root - INFO - step: 25115 loss: 2.7550 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7815 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 15:42:19,237 - root - INFO - lr: 7.5787e-06 gnorm: 0.37 [1 day, 22:06:50<1 day, 3:19:50] +[titan] 2025-09-09 15:42:51,092 - root - INFO - step: 25120 loss: 2.6678 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.26 mfu: 49.57% global_avg_ntp_loss: 0.7426 global_avg_top_loss: 1.9252 +[titan] 2025-09-09 15:42:51,092 - root - INFO - lr: 7.5754e-06 gnorm: 0.35 [1 day, 22:07:22<1 day, 3:19:16] +[titan] 2025-09-09 15:43:22,873 - root - INFO - step: 25125 loss: 2.7577 memory: 122.04GiB(87.57%) tps: 10,311 tflops: 491.41 mfu: 49.69% global_avg_ntp_loss: 0.7896 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 15:43:22,873 - root - INFO - lr: 7.5721e-06 gnorm: 0.35 [1 day, 22:07:54<1 day, 
3:18:42] +[titan] 2025-09-09 15:43:55,009 - root - INFO - step: 25130 loss: 3.8224 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 1.3779 global_avg_top_loss: 2.4445 +[titan] 2025-09-09 15:43:55,009 - root - INFO - lr: 7.5688e-06 gnorm: 0.37 [1 day, 22:08:26<1 day, 3:18:09] +[titan] 2025-09-09 15:44:26,868 - root - INFO - step: 25135 loss: 2.7911 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.21 mfu: 49.57% global_avg_ntp_loss: 0.8004 global_avg_top_loss: 1.9907 +[titan] 2025-09-09 15:44:26,868 - root - INFO - lr: 7.5655e-06 gnorm: 0.36 [1 day, 22:08:58<1 day, 3:17:35] +[titan] 2025-09-09 15:44:58,746 - root - INFO - step: 25140 loss: 2.7238 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 15:44:58,746 - root - INFO - lr: 7.5622e-06 gnorm: 0.37 [1 day, 22:09:30<1 day, 3:17:01] +[titan] 2025-09-09 15:45:30,719 - root - INFO - step: 25145 loss: 2.6947 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.46 mfu: 49.39% global_avg_ntp_loss: 0.7596 global_avg_top_loss: 1.9351 +[titan] 2025-09-09 15:45:30,720 - root - INFO - lr: 7.5589e-06 gnorm: 0.39 [1 day, 22:10:02<1 day, 3:16:28] +[titan] 2025-09-09 15:45:56,231 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:46:02,680 - root - INFO - step: 25150 loss: 2.7373 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7793 global_avg_top_loss: 1.9580 +[titan] 2025-09-09 15:46:02,681 - root - INFO - lr: 7.5556e-06 gnorm: 0.37 [1 day, 22:10:34<1 day, 3:15:54] +[titan] 2025-09-09 15:46:34,515 - root - INFO - step: 25155 loss: 2.8187 memory: 122.04GiB(87.57%) tps: 10,294 tflops: 490.59 mfu: 49.60% global_avg_ntp_loss: 0.8108 global_avg_top_loss: 2.0079 +[titan] 2025-09-09 15:46:34,515 - root - INFO - lr: 7.5523e-06 gnorm: 0.36 [1 day, 22:11:06<1 day, 3:15:20] +[titan] 2025-09-09 15:47:06,566 - root - INFO - step: 25160 loss: 3.1470 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 1.0122 global_avg_top_loss: 2.1347 +[titan] 2025-09-09 15:47:06,566 - root - INFO - lr: 7.5490e-06 gnorm: 0.53 [1 day, 22:11:38<1 day, 3:14:46] +[titan] 2025-09-09 15:47:38,505 - root - INFO - step: 25165 loss: 2.7516 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9719 +[titan] 2025-09-09 15:47:38,505 - root - INFO - lr: 7.5457e-06 gnorm: 0.34 [1 day, 22:12:10<1 day, 3:14:13] +[titan] 2025-09-09 15:48:10,738 - root - INFO - step: 25170 loss: 2.6569 memory: 122.04GiB(87.57%) tps: 10,166 tflops: 484.51 mfu: 48.99% global_avg_ntp_loss: 0.7375 global_avg_top_loss: 1.9194 +[titan] 2025-09-09 15:48:10,739 - root - INFO - lr: 7.5424e-06 gnorm: 0.39 [1 day, 22:12:42<1 day, 3:13:39] +[titan] 2025-09-09 15:48:43,104 - root - INFO - step: 25175 loss: 2.7905 memory: 122.04GiB(87.57%) tps: 10,125 tflops: 482.54 mfu: 48.79% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 15:48:43,104 - root - INFO - lr: 7.5391e-06 gnorm: 0.38 [1 day, 22:13:14<1 day, 3:13:06] +[titan] 2025-09-09 15:49:15,524 - root - INFO - step: 25180 loss: 2.6982 memory: 122.04GiB(87.57%) tps: 10,107 tflops: 481.71 mfu: 48.71% global_avg_ntp_loss: 0.7601 
global_avg_top_loss: 1.9381 +[titan] 2025-09-09 15:49:15,525 - root - INFO - lr: 7.5358e-06 gnorm: 0.35 [1 day, 22:13:47<1 day, 3:12:32] +[titan] 2025-09-09 15:49:47,605 - root - INFO - step: 25185 loss: 2.7802 memory: 122.04GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7961 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 15:49:47,605 - root - INFO - lr: 7.5325e-06 gnorm: 0.35 [1 day, 22:14:19<1 day, 3:11:59] +[titan] 2025-09-09 15:50:19,580 - root - INFO - step: 25190 loss: 2.6150 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.39% global_avg_ntp_loss: 0.7175 global_avg_top_loss: 1.8975 +[titan] 2025-09-09 15:50:19,581 - root - INFO - lr: 7.5292e-06 gnorm: 0.35 [1 day, 22:14:51<1 day, 3:11:25] +[titan] 2025-09-09 15:50:51,496 - root - INFO - step: 25195 loss: 2.7843 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9866 +[titan] 2025-09-09 15:50:51,497 - root - INFO - lr: 7.5259e-06 gnorm: 0.36 [1 day, 22:15:23<1 day, 3:10:51] +[titan] 2025-09-09 15:51:17,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:51:23,453 - root - INFO - step: 25200 loss: 2.8446 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.8239 global_avg_top_loss: 2.0207 +[titan] 2025-09-09 15:51:23,454 - root - INFO - lr: 7.5226e-06 gnorm: 0.35 [1 day, 22:15:55<1 day, 3:10:18] +[titan] 2025-09-09 15:51:55,522 - root - INFO - step: 25205 loss: 2.7168 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9514 +[titan] 2025-09-09 15:51:55,522 - root - INFO - lr: 7.5194e-06 gnorm: 0.34 [1 day, 22:16:27<1 day, 3:09:44] +[titan] 2025-09-09 15:52:27,545 - root - INFO - step: 25210 loss: 3.2243 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 1.0547 global_avg_top_loss: 2.1697 +[titan] 2025-09-09 15:52:27,545 - root - INFO - lr: 7.5161e-06 gnorm: 0.36 [1 day, 22:16:59<1 day, 3:09:10] +[titan] 2025-09-09 15:52:59,589 - root - INFO - step: 25215 loss: 2.7330 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9592 +[titan] 2025-09-09 15:52:59,590 - root - INFO - lr: 7.5128e-06 gnorm: 0.35 [1 day, 22:17:31<1 day, 3:08:37] +[titan] 2025-09-09 15:53:31,498 - root - INFO - step: 25220 loss: 2.7844 memory: 122.04GiB(87.57%) tps: 10,269 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 15:53:31,499 - root - INFO - lr: 7.5095e-06 gnorm: 0.36 [1 day, 22:18:03<1 day, 3:08:03] +[titan] 2025-09-09 15:54:03,681 - root - INFO - step: 25225 loss: 3.2497 memory: 122.04GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 1.0622 global_avg_top_loss: 2.1876 +[titan] 2025-09-09 15:54:03,681 - root - INFO - lr: 7.5062e-06 gnorm: 0.38 [1 day, 22:18:35<1 day, 3:07:29] +[titan] 2025-09-09 15:54:35,599 - root - INFO - step: 25230 loss: 2.7190 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7677 
global_avg_top_loss: 1.9513 +[titan] 2025-09-09 15:54:35,600 - root - INFO - lr: 7.5029e-06 gnorm: 0.34 [1 day, 22:19:07<1 day, 3:06:56] +[titan] 2025-09-09 15:55:07,717 - root - INFO - step: 25235 loss: 2.7784 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.25 mfu: 49.17% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9836 +[titan] 2025-09-09 15:55:07,718 - root - INFO - lr: 7.4996e-06 gnorm: 0.36 [1 day, 22:19:39<1 day, 3:06:22] +[titan] 2025-09-09 15:55:39,841 - root - INFO - step: 25240 loss: 2.7457 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9665 +[titan] 2025-09-09 15:55:39,842 - root - INFO - lr: 7.4963e-06 gnorm: 0.37 [1 day, 22:20:11<1 day, 3:05:49] +[titan] 2025-09-09 15:56:11,940 - root - INFO - step: 25245 loss: 2.9717 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.20% global_avg_ntp_loss: 0.9030 global_avg_top_loss: 2.0687 +[titan] 2025-09-09 15:56:11,940 - root - INFO - lr: 7.4930e-06 gnorm: 0.37 [1 day, 22:20:43<1 day, 3:05:15] +[titan] 2025-09-09 15:56:37,249 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 15:56:43,757 - root - INFO - step: 25250 loss: 2.7741 memory: 122.04GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7955 global_avg_top_loss: 1.9786 +[titan] 2025-09-09 15:56:43,757 - root - INFO - lr: 7.4897e-06 gnorm: 0.35 [1 day, 22:21:15<1 day, 3:04:41] +[titan] 2025-09-09 15:57:15,958 - root - INFO - step: 25255 loss: 2.6937 memory: 122.04GiB(87.57%) tps: 10,177 tflops: 485.01 mfu: 49.04% global_avg_ntp_loss: 0.7581 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 15:57:15,958 - root - INFO - lr: 7.4865e-06 gnorm: 0.35 [1 day, 22:21:47<1 day, 3:04:08] +[titan] 2025-09-09 15:57:48,186 - root - INFO - step: 25260 loss: 2.7593 memory: 122.04GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9620 +[titan] 2025-09-09 15:57:48,187 - root - INFO - lr: 7.4832e-06 gnorm: 0.35 [1 day, 22:22:19<1 day, 3:03:34] +[titan] 2025-09-09 15:58:20,115 - root - INFO - step: 25265 loss: 2.7804 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9844 +[titan] 2025-09-09 15:58:20,116 - root - INFO - lr: 7.4799e-06 gnorm: 0.35 [1 day, 22:22:51<1 day, 3:03:00] +[titan] 2025-09-09 15:58:52,148 - root - INFO - step: 25270 loss: 2.6477 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.7338 global_avg_top_loss: 1.9139 +[titan] 2025-09-09 15:58:52,148 - root - INFO - lr: 7.4766e-06 gnorm: 0.37 [1 day, 22:23:23<1 day, 3:02:27] +[titan] 2025-09-09 15:59:24,266 - root - INFO - step: 25275 loss: 3.2774 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 1.0742 global_avg_top_loss: 2.2032 +[titan] 2025-09-09 15:59:24,266 - root - INFO - lr: 7.4733e-06 gnorm: 0.35 [1 day, 22:23:55<1 day, 3:01:53] +[titan] 2025-09-09 15:59:56,183 - root - INFO - step: 25280 loss: 2.6913 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.48% global_avg_ntp_loss: 0.7585 
global_avg_top_loss: 1.9327 +[titan] 2025-09-09 15:59:56,183 - root - INFO - lr: 7.4700e-06 gnorm: 0.34 [1 day, 22:24:27<1 day, 3:01:20] +[titan] 2025-09-09 16:00:28,156 - root - INFO - step: 25285 loss: 2.7369 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 16:00:28,157 - root - INFO - lr: 7.4668e-06 gnorm: 0.36 [1 day, 22:24:59<1 day, 3:00:46] +[titan] 2025-09-09 16:01:00,271 - root - INFO - step: 25290 loss: 3.2298 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.30 mfu: 49.17% global_avg_ntp_loss: 1.0548 global_avg_top_loss: 2.1750 +[titan] 2025-09-09 16:01:00,271 - root - INFO - lr: 7.4635e-06 gnorm: 0.38 [1 day, 22:25:31<1 day, 3:00:12] +[titan] 2025-09-09 16:01:32,358 - root - INFO - step: 25295 loss: 2.6449 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9141 +[titan] 2025-09-09 16:01:32,359 - root - INFO - lr: 7.4602e-06 gnorm: 0.36 [1 day, 22:26:04<1 day, 2:59:39] +[titan] 2025-09-09 16:01:58,088 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:02:04,546 - root - INFO - step: 25300 loss: 2.7873 memory: 122.04GiB(87.57%) tps: 10,181 tflops: 485.20 mfu: 49.06% global_avg_ntp_loss: 0.7996 global_avg_top_loss: 1.9877 +[titan] 2025-09-09 16:02:04,546 - root - INFO - lr: 7.4569e-06 gnorm: 0.37 [1 day, 22:26:36<1 day, 2:59:05] +[titan] 2025-09-09 16:02:36,423 - root - INFO - step: 25305 loss: 3.1935 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 1.0325 global_avg_top_loss: 2.1610 +[titan] 2025-09-09 16:02:36,424 - root - INFO - lr: 7.4536e-06 gnorm: 0.42 [1 day, 22:27:08<1 day, 2:58:31] +[titan] 2025-09-09 16:03:08,493 - root - INFO - step: 25310 loss: 2.7130 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7612 global_avg_top_loss: 1.9518 +[titan] 2025-09-09 16:03:08,493 - root - INFO - lr: 7.4503e-06 gnorm: 0.39 [1 day, 22:27:40<1 day, 2:57:58] +[titan] 2025-09-09 16:03:40,384 - root - INFO - step: 25315 loss: 2.7946 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8052 global_avg_top_loss: 1.9894 +[titan] 2025-09-09 16:03:40,384 - root - INFO - lr: 7.4471e-06 gnorm: 0.37 [1 day, 22:28:12<1 day, 2:57:24] +[titan] 2025-09-09 16:04:12,553 - root - INFO - step: 25320 loss: 2.7190 memory: 122.04GiB(87.57%) tps: 10,187 tflops: 485.48 mfu: 49.09% global_avg_ntp_loss: 0.7686 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 16:04:12,553 - root - INFO - lr: 7.4438e-06 gnorm: 0.37 [1 day, 22:28:44<1 day, 2:56:51] +[titan] 2025-09-09 16:04:44,447 - root - INFO - step: 25325 loss: 2.7165 memory: 122.04GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7663 global_avg_top_loss: 1.9502 +[titan] 2025-09-09 16:04:44,447 - root - INFO - lr: 7.4405e-06 gnorm: 0.36 [1 day, 22:29:16<1 day, 2:56:17] +[titan] 2025-09-09 16:05:16,590 - root - INFO - step: 25330 loss: 2.7882 memory: 122.04GiB(87.57%) tps: 10,195 tflops: 485.88 mfu: 49.13% global_avg_ntp_loss: 0.7978 
global_avg_top_loss: 1.9904 +[titan] 2025-09-09 16:05:16,590 - root - INFO - lr: 7.4372e-06 gnorm: 0.36 [1 day, 22:29:48<1 day, 2:55:43] +[titan] 2025-09-09 16:05:48,482 - root - INFO - step: 25335 loss: 2.6578 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.7428 global_avg_top_loss: 1.9151 +[titan] 2025-09-09 16:05:48,483 - root - INFO - lr: 7.4339e-06 gnorm: 0.34 [1 day, 22:30:20<1 day, 2:55:10] +[titan] 2025-09-09 16:06:20,500 - root - INFO - step: 25340 loss: 2.7553 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7829 global_avg_top_loss: 1.9724 +[titan] 2025-09-09 16:06:20,500 - root - INFO - lr: 7.4307e-06 gnorm: 0.50 [1 day, 22:30:52<1 day, 2:54:36] +[titan] 2025-09-09 16:06:52,374 - root - INFO - step: 25345 loss: 2.7815 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 489.97 mfu: 49.54% global_avg_ntp_loss: 0.7959 global_avg_top_loss: 1.9856 +[titan] 2025-09-09 16:06:52,374 - root - INFO - lr: 7.4274e-06 gnorm: 0.35 [1 day, 22:31:24<1 day, 2:54:02] +[titan] 2025-09-09 16:07:17,986 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:07:24,276 - root - INFO - step: 25350 loss: 2.9346 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.8872 global_avg_top_loss: 2.0474 +[titan] 2025-09-09 16:07:24,277 - root - INFO - lr: 7.4241e-06 gnorm: 0.35 [1 day, 22:31:55<1 day, 2:53:29] +[titan] 2025-09-09 16:07:56,252 - root - INFO - step: 25355 loss: 3.1781 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 1.0354 global_avg_top_loss: 2.1427 +[titan] 2025-09-09 16:07:56,252 - root - INFO - lr: 7.4208e-06 gnorm: 0.35 [1 day, 22:32:27<1 day, 2:52:55] +[titan] 2025-09-09 16:08:28,188 - root - INFO - step: 25360 loss: 2.7347 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9646 +[titan] 2025-09-09 16:08:28,189 - root - INFO - lr: 7.4176e-06 gnorm: 0.37 [1 day, 22:32:59<1 day, 2:52:21] +[titan] 2025-09-09 16:09:00,346 - root - INFO - step: 25365 loss: 2.6873 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.7526 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 16:09:00,346 - root - INFO - lr: 7.4143e-06 gnorm: 0.36 [1 day, 22:33:32<1 day, 2:51:48] +[titan] 2025-09-09 16:09:32,367 - root - INFO - step: 25370 loss: 3.2046 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 1.0395 global_avg_top_loss: 2.1651 +[titan] 2025-09-09 16:09:32,367 - root - INFO - lr: 7.4110e-06 gnorm: 0.36 [1 day, 22:34:04<1 day, 2:51:14] +[titan] 2025-09-09 16:10:04,450 - root - INFO - step: 25375 loss: 2.6200 memory: 122.04GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.7211 global_avg_top_loss: 1.8989 +[titan] 2025-09-09 16:10:04,450 - root - INFO - lr: 7.4077e-06 gnorm: 0.34 [1 day, 22:34:36<1 day, 2:50:40] +[titan] 2025-09-09 16:10:36,322 - root - INFO - step: 25380 loss: 2.7924 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.7981 
global_avg_top_loss: 1.9943 +[titan] 2025-09-09 16:10:36,323 - root - INFO - lr: 7.4045e-06 gnorm: 0.37 [1 day, 22:35:08<1 day, 2:50:07] +[titan] 2025-09-09 16:11:08,390 - root - INFO - step: 25385 loss: 3.2676 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 1.0810 global_avg_top_loss: 2.1866 +[titan] 2025-09-09 16:11:08,390 - root - INFO - lr: 7.4012e-06 gnorm: 0.39 [1 day, 22:35:40<1 day, 2:49:33] +[titan] 2025-09-09 16:11:40,275 - root - INFO - step: 25390 loss: 2.7992 memory: 122.04GiB(87.57%) tps: 10,277 tflops: 489.80 mfu: 49.52% global_avg_ntp_loss: 0.7962 global_avg_top_loss: 2.0030 +[titan] 2025-09-09 16:11:40,276 - root - INFO - lr: 7.3979e-06 gnorm: 1.10 [1 day, 22:36:11<1 day, 2:48:59] +[titan] 2025-09-09 16:12:12,338 - root - INFO - step: 25395 loss: 2.6125 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.09 mfu: 49.25% global_avg_ntp_loss: 0.7189 global_avg_top_loss: 1.8936 +[titan] 2025-09-09 16:12:12,338 - root - INFO - lr: 7.3947e-06 gnorm: 0.34 [1 day, 22:36:44<1 day, 2:48:26] +[titan] 2025-09-09 16:12:37,996 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:12:44,312 - root - INFO - step: 25400 loss: 2.6897 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7574 global_avg_top_loss: 1.9323 +[titan] 2025-09-09 16:12:44,312 - root - INFO - lr: 7.3914e-06 gnorm: 0.38 [1 day, 22:37:16<1 day, 2:47:52] +[titan] 2025-09-09 16:13:16,348 - root - INFO - step: 25405 loss: 2.6516 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7371 global_avg_top_loss: 1.9145 +[titan] 2025-09-09 16:13:16,348 - root - INFO - lr: 7.3881e-06 gnorm: 0.35 [1 day, 22:37:48<1 day, 2:47:19] +[titan] 2025-09-09 16:13:48,361 - root - INFO - step: 25410 loss: 2.8164 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.8323 global_avg_top_loss: 1.9841 +[titan] 2025-09-09 16:13:48,361 - root - INFO - lr: 7.3848e-06 gnorm: 0.40 [1 day, 22:38:20<1 day, 2:46:45] +[titan] 2025-09-09 16:14:20,324 - root - INFO - step: 25415 loss: 2.6447 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9130 +[titan] 2025-09-09 16:14:20,325 - root - INFO - lr: 7.3816e-06 gnorm: 0.56 [1 day, 22:38:52<1 day, 2:46:11] +[titan] 2025-09-09 16:14:52,409 - root - INFO - step: 25420 loss: 2.6779 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9234 +[titan] 2025-09-09 16:14:52,410 - root - INFO - lr: 7.3783e-06 gnorm: 0.35 [1 day, 22:39:24<1 day, 2:45:38] +[titan] 2025-09-09 16:15:24,375 - root - INFO - step: 25425 loss: 2.6319 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.58 mfu: 49.40% global_avg_ntp_loss: 0.7303 global_avg_top_loss: 1.9015 +[titan] 2025-09-09 16:15:24,375 - root - INFO - lr: 7.3750e-06 gnorm: 0.41 [1 day, 22:39:56<1 day, 2:45:04] +[titan] 2025-09-09 16:15:56,587 - root - INFO - step: 25430 loss: 2.7570 memory: 122.04GiB(87.57%) tps: 10,173 tflops: 484.83 mfu: 49.02% global_avg_ntp_loss: 0.7801 
global_avg_top_loss: 1.9769 +[titan] 2025-09-09 16:15:56,587 - root - INFO - lr: 7.3718e-06 gnorm: 0.37 [1 day, 22:40:28<1 day, 2:44:31] +[titan] 2025-09-09 16:16:28,929 - root - INFO - step: 25435 loss: 3.1302 memory: 122.04GiB(87.57%) tps: 10,132 tflops: 482.88 mfu: 48.83% global_avg_ntp_loss: 1.0074 global_avg_top_loss: 2.1228 +[titan] 2025-09-09 16:16:28,929 - root - INFO - lr: 7.3685e-06 gnorm: 0.38 [1 day, 22:41:00<1 day, 2:43:57] +[titan] 2025-09-09 16:17:00,957 - root - INFO - step: 25440 loss: 2.5854 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.61 mfu: 49.30% global_avg_ntp_loss: 0.7078 global_avg_top_loss: 1.8776 +[titan] 2025-09-09 16:17:00,958 - root - INFO - lr: 7.3652e-06 gnorm: 0.33 [1 day, 22:41:32<1 day, 2:43:23] +[titan] 2025-09-09 16:17:33,262 - root - INFO - step: 25445 loss: 2.7243 memory: 122.04GiB(87.57%) tps: 10,144 tflops: 483.44 mfu: 48.88% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9541 +[titan] 2025-09-09 16:17:33,263 - root - INFO - lr: 7.3620e-06 gnorm: 0.39 [1 day, 22:42:04<1 day, 2:42:50] +[titan] 2025-09-09 16:17:58,885 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:18:05,281 - root - INFO - step: 25450 loss: 3.2906 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 1.0822 global_avg_top_loss: 2.2084 +[titan] 2025-09-09 16:18:05,281 - root - INFO - lr: 7.3587e-06 gnorm: 0.37 [1 day, 22:42:36<1 day, 2:42:16] +[titan] 2025-09-09 16:18:37,229 - root - INFO - step: 25455 loss: 2.8179 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.8167 global_avg_top_loss: 2.0013 +[titan] 2025-09-09 16:18:37,229 - root - INFO - lr: 7.3554e-06 gnorm: 0.37 [1 day, 22:43:08<1 day, 2:41:43] +[titan] 2025-09-09 16:19:09,122 - root - INFO - step: 25460 loss: 2.7257 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.68 mfu: 49.51% global_avg_ntp_loss: 0.7717 global_avg_top_loss: 1.9540 +[titan] 2025-09-09 16:19:09,122 - root - INFO - lr: 7.3522e-06 gnorm: 0.43 [1 day, 22:43:40<1 day, 2:41:09] +[titan] 2025-09-09 16:19:41,167 - root - INFO - step: 25465 loss: 3.0916 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.9822 global_avg_top_loss: 2.1094 +[titan] 2025-09-09 16:19:41,167 - root - INFO - lr: 7.3489e-06 gnorm: 0.40 [1 day, 22:44:12<1 day, 2:40:35] +[titan] 2025-09-09 16:20:13,272 - root - INFO - step: 25470 loss: 2.7736 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 0.7890 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 16:20:13,273 - root - INFO - lr: 7.3457e-06 gnorm: 0.46 [1 day, 22:44:44<1 day, 2:40:02] +[titan] 2025-09-09 16:20:45,235 - root - INFO - step: 25475 loss: 2.7916 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.8181 global_avg_top_loss: 1.9735 +[titan] 2025-09-09 16:20:45,236 - root - INFO - lr: 7.3424e-06 gnorm: 0.59 [1 day, 22:45:16<1 day, 2:39:28] +[titan] 2025-09-09 16:21:17,231 - root - INFO - step: 25480 loss: 2.7603 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.11 mfu: 49.35% global_avg_ntp_loss: 0.7877 
global_avg_top_loss: 1.9726 +[titan] 2025-09-09 16:21:17,231 - root - INFO - lr: 7.3391e-06 gnorm: 0.37 [1 day, 22:45:48<1 day, 2:38:55] +[titan] 2025-09-09 16:21:49,241 - root - INFO - step: 25485 loss: 2.7340 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9601 +[titan] 2025-09-09 16:21:49,242 - root - INFO - lr: 7.3359e-06 gnorm: 0.37 [1 day, 22:46:20<1 day, 2:38:21] +[titan] 2025-09-09 16:22:21,439 - root - INFO - step: 25490 loss: 2.6630 memory: 122.04GiB(87.57%) tps: 10,177 tflops: 485.04 mfu: 49.04% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9215 +[titan] 2025-09-09 16:22:21,440 - root - INFO - lr: 7.3326e-06 gnorm: 0.36 [1 day, 22:46:53<1 day, 2:37:47] +[titan] 2025-09-09 16:22:53,657 - root - INFO - step: 25495 loss: 3.2009 memory: 122.04GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 1.0382 global_avg_top_loss: 2.1628 +[titan] 2025-09-09 16:22:53,657 - root - INFO - lr: 7.3293e-06 gnorm: 0.37 [1 day, 22:47:25<1 day, 2:37:14] +[titan] 2025-09-09 16:23:19,122 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:23:25,492 - root - INFO - step: 25500 loss: 2.7314 memory: 122.04GiB(87.57%) tps: 10,293 tflops: 490.58 mfu: 49.60% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9567 +[titan] 2025-09-09 16:23:25,492 - root - INFO - lr: 7.3261e-06 gnorm: 0.36 [1 day, 22:47:57<1 day, 2:36:40] +[titan] 2025-09-09 16:23:57,488 - root - INFO - step: 25505 loss: 2.6509 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7394 global_avg_top_loss: 1.9115 +[titan] 2025-09-09 16:23:57,489 - root - INFO - lr: 7.3228e-06 gnorm: 0.35 [1 day, 22:48:29<1 day, 2:36:07] +[titan] 2025-09-09 16:24:29,550 - root - INFO - step: 25510 loss: 2.8351 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.8257 global_avg_top_loss: 2.0094 +[titan] 2025-09-09 16:24:29,550 - root - INFO - lr: 7.3196e-06 gnorm: 0.38 [1 day, 22:49:01<1 day, 2:35:33] +[titan] 2025-09-09 16:25:01,466 - root - INFO - step: 25515 loss: 3.2158 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 1.0482 global_avg_top_loss: 2.1676 +[titan] 2025-09-09 16:25:01,466 - root - INFO - lr: 7.3163e-06 gnorm: 0.36 [1 day, 22:49:33<1 day, 2:34:59] +[titan] 2025-09-09 16:25:33,569 - root - INFO - step: 25520 loss: 2.7626 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 16:25:33,570 - root - INFO - lr: 7.3131e-06 gnorm: 0.43 [1 day, 22:50:05<1 day, 2:34:26] +[titan] 2025-09-09 16:26:05,485 - root - INFO - step: 25525 loss: 2.7257 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9477 +[titan] 2025-09-09 16:26:05,486 - root - INFO - lr: 7.3098e-06 gnorm: 0.39 [1 day, 22:50:37<1 day, 2:33:52] +[titan] 2025-09-09 16:26:37,376 - root - INFO - step: 25530 loss: 2.8116 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.71 mfu: 49.52% global_avg_ntp_loss: 0.8103 
global_avg_top_loss: 2.0013 +[titan] 2025-09-09 16:26:37,377 - root - INFO - lr: 7.3065e-06 gnorm: 0.37 [1 day, 22:51:09<1 day, 2:33:18] +[titan] 2025-09-09 16:27:09,382 - root - INFO - step: 25535 loss: 2.7881 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.8013 global_avg_top_loss: 1.9869 +[titan] 2025-09-09 16:27:09,382 - root - INFO - lr: 7.3033e-06 gnorm: 0.40 [1 day, 22:51:41<1 day, 2:32:45] +[titan] 2025-09-09 16:27:41,250 - root - INFO - step: 25540 loss: 2.7665 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.07 mfu: 49.55% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 16:27:41,250 - root - INFO - lr: 7.3000e-06 gnorm: 0.36 [1 day, 22:52:12<1 day, 2:32:11] +[titan] 2025-09-09 16:28:13,155 - root - INFO - step: 25545 loss: 2.6871 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 16:28:13,156 - root - INFO - lr: 7.2968e-06 gnorm: 0.35 [1 day, 22:52:44<1 day, 2:31:37] +[titan] 2025-09-09 16:28:38,632 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:28:44,999 - root - INFO - step: 25550 loss: 2.6822 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7496 global_avg_top_loss: 1.9326 +[titan] 2025-09-09 16:28:44,999 - root - INFO - lr: 7.2935e-06 gnorm: 0.37 [1 day, 22:53:16<1 day, 2:31:04] +[titan] 2025-09-09 16:29:16,906 - root - INFO - step: 25555 loss: 2.7471 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.7891 global_avg_top_loss: 1.9580 +[titan] 2025-09-09 16:29:16,906 - root - INFO - lr: 7.2903e-06 gnorm: 0.38 [1 day, 22:53:48<1 day, 2:30:30] +[titan] 2025-09-09 16:29:49,018 - root - INFO - step: 25560 loss: 2.8651 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.8428 global_avg_top_loss: 2.0224 +[titan] 2025-09-09 16:29:49,018 - root - INFO - lr: 7.2870e-06 gnorm: 0.37 [1 day, 22:54:20<1 day, 2:29:57] +[titan] 2025-09-09 16:30:20,955 - root - INFO - step: 25565 loss: 2.7229 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 489.01 mfu: 49.44% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9524 +[titan] 2025-09-09 16:30:20,955 - root - INFO - lr: 7.2838e-06 gnorm: 0.35 [1 day, 22:54:52<1 day, 2:29:23] +[titan] 2025-09-09 16:30:52,822 - root - INFO - step: 25570 loss: 2.7319 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9592 +[titan] 2025-09-09 16:30:52,823 - root - INFO - lr: 7.2805e-06 gnorm: 0.36 [1 day, 22:55:24<1 day, 2:28:49] +[titan] 2025-09-09 16:31:24,711 - root - INFO - step: 25575 loss: 3.2013 memory: 122.04GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 1.0378 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 16:31:24,711 - root - INFO - lr: 7.2773e-06 gnorm: 0.37 [1 day, 22:55:56<1 day, 2:28:16] +[titan] 2025-09-09 16:31:56,873 - root - INFO - step: 25580 loss: 2.7531 memory: 122.04GiB(87.57%) tps: 10,189 tflops: 485.59 mfu: 49.10% global_avg_ntp_loss: 0.7817 
global_avg_top_loss: 1.9714 +[titan] 2025-09-09 16:31:56,873 - root - INFO - lr: 7.2740e-06 gnorm: 0.37 [1 day, 22:56:28<1 day, 2:27:42] +[titan] 2025-09-09 16:32:28,799 - root - INFO - step: 25585 loss: 2.6784 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.18 mfu: 49.46% global_avg_ntp_loss: 0.7474 global_avg_top_loss: 1.9310 +[titan] 2025-09-09 16:32:28,799 - root - INFO - lr: 7.2708e-06 gnorm: 0.35 [1 day, 22:57:00<1 day, 2:27:08] +[titan] 2025-09-09 16:33:00,631 - root - INFO - step: 25590 loss: 2.6783 memory: 122.04GiB(87.57%) tps: 10,294 tflops: 490.61 mfu: 49.61% global_avg_ntp_loss: 0.7502 global_avg_top_loss: 1.9281 +[titan] 2025-09-09 16:33:00,632 - root - INFO - lr: 7.2675e-06 gnorm: 0.35 [1 day, 22:57:32<1 day, 2:26:35] +[titan] 2025-09-09 16:33:32,642 - root - INFO - step: 25595 loss: 3.1604 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 1.0208 global_avg_top_loss: 2.1396 +[titan] 2025-09-09 16:33:32,643 - root - INFO - lr: 7.2643e-06 gnorm: 0.35 [1 day, 22:58:04<1 day, 2:26:01] +[titan] 2025-09-09 16:33:58,235 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:34:04,695 - root - INFO - step: 25600 loss: 2.6280 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.25 mfu: 49.27% global_avg_ntp_loss: 0.7240 global_avg_top_loss: 1.9040 +[titan] 2025-09-09 16:34:04,695 - root - INFO - lr: 7.2610e-06 gnorm: 0.34 [1 day, 22:58:36<1 day, 2:25:27] +[titan] 2025-09-09 16:34:04,950 - root - INFO - Dumping profiler traces at step 25600 +[titan] 2025-09-09 16:34:05,010 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 16:34:36,890 - root - INFO - step: 25605 loss: 2.7729 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.09 mfu: 49.05% global_avg_ntp_loss: 0.7875 global_avg_top_loss: 1.9855 +[titan] 2025-09-09 16:34:36,890 - root - INFO - lr: 7.2578e-06 gnorm: 0.36 [1 day, 22:59:08<1 day, 2:24:54] +[titan] 2025-09-09 16:35:08,770 - root - INFO - step: 25610 loss: 2.7704 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7873 global_avg_top_loss: 1.9831 +[titan] 2025-09-09 16:35:08,770 - root - INFO - lr: 7.2545e-06 gnorm: 0.37 [1 day, 22:59:40<1 day, 2:24:20] +[titan] 2025-09-09 16:35:40,590 - root - INFO - step: 25615 loss: 2.6083 memory: 122.04GiB(87.57%) tps: 10,299 tflops: 490.85 mfu: 49.63% global_avg_ntp_loss: 0.7160 global_avg_top_loss: 1.8923 +[titan] 2025-09-09 16:35:40,591 - root - INFO - lr: 7.2513e-06 gnorm: 0.38 [1 day, 23:00:12<1 day, 2:23:47] +[titan] 2025-09-09 16:36:12,515 - root - INFO - step: 25620 loss: 2.8002 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.8017 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 16:36:12,516 - root - INFO - lr: 7.2480e-06 gnorm: 0.36 [1 day, 23:00:44<1 day, 2:23:13] +[titan] 2025-09-09 16:36:44,513 - root - INFO - step: 25625 loss: 2.7163 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 16:36:44,514 - root - INFO - lr: 7.2448e-06 gnorm: 0.35 [1 day, 23:01:16<1 day, 
2:22:39] +[titan] 2025-09-09 16:37:16,448 - root - INFO - step: 25630 loss: 2.7114 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.7650 global_avg_top_loss: 1.9465 +[titan] 2025-09-09 16:37:16,448 - root - INFO - lr: 7.2416e-06 gnorm: 0.37 [1 day, 23:01:48<1 day, 2:22:06] +[titan] 2025-09-09 16:37:48,514 - root - INFO - step: 25635 loss: 2.7578 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7860 global_avg_top_loss: 1.9718 +[titan] 2025-09-09 16:37:48,514 - root - INFO - lr: 7.2383e-06 gnorm: 0.36 [1 day, 23:02:20<1 day, 2:21:32] +[titan] 2025-09-09 16:38:20,467 - root - INFO - step: 25640 loss: 2.6960 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.75 mfu: 49.42% global_avg_ntp_loss: 0.7554 global_avg_top_loss: 1.9406 +[titan] 2025-09-09 16:38:20,468 - root - INFO - lr: 7.2351e-06 gnorm: 0.35 [1 day, 23:02:52<1 day, 2:20:58] +[titan] 2025-09-09 16:38:52,615 - root - INFO - step: 25645 loss: 2.7464 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.7800 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 16:38:52,615 - root - INFO - lr: 7.2318e-06 gnorm: 0.36 [1 day, 23:03:24<1 day, 2:20:25] +[titan] 2025-09-09 16:39:18,125 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:39:24,527 - root - INFO - step: 25650 loss: 2.8335 memory: 122.04GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.8255 global_avg_top_loss: 2.0080 +[titan] 2025-09-09 16:39:24,527 - root - INFO - lr: 7.2286e-06 gnorm: 0.40 [1 day, 23:03:56<1 day, 2:19:51] +[titan] 2025-09-09 16:39:56,644 - root - INFO - step: 25655 loss: 3.2557 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 1.0758 global_avg_top_loss: 2.1799 +[titan] 2025-09-09 16:39:56,645 - root - INFO - lr: 7.2253e-06 gnorm: 0.35 [1 day, 23:04:28<1 day, 2:19:18] +[titan] 2025-09-09 16:40:28,606 - root - INFO - step: 25660 loss: 2.7666 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.63 mfu: 49.41% global_avg_ntp_loss: 0.7880 global_avg_top_loss: 1.9786 +[titan] 2025-09-09 16:40:28,606 - root - INFO - lr: 7.2221e-06 gnorm: 0.37 [1 day, 23:05:00<1 day, 2:18:44] +[titan] 2025-09-09 16:41:00,483 - root - INFO - step: 25665 loss: 2.8095 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.92 mfu: 49.54% global_avg_ntp_loss: 0.8120 global_avg_top_loss: 1.9974 +[titan] 2025-09-09 16:41:00,484 - root - INFO - lr: 7.2189e-06 gnorm: 0.38 [1 day, 23:05:32<1 day, 2:18:10] +[titan] 2025-09-09 16:41:32,557 - root - INFO - step: 25670 loss: 2.8191 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.23% global_avg_ntp_loss: 0.8116 global_avg_top_loss: 2.0074 +[titan] 2025-09-09 16:41:32,557 - root - INFO - lr: 7.2156e-06 gnorm: 0.38 [1 day, 23:06:04<1 day, 2:17:37] +[titan] 2025-09-09 16:42:04,691 - root - INFO - step: 25675 loss: 2.8360 memory: 122.04GiB(87.57%) tps: 10,198 tflops: 486.01 mfu: 49.14% global_avg_ntp_loss: 0.8426 global_avg_top_loss: 1.9935 +[titan] 2025-09-09 16:42:04,691 - root - INFO - lr: 7.2124e-06 gnorm: 0.36 [1 day, 23:06:36<1 day, 2:17:03] +[titan] 2025-09-09 16:42:36,428 - root - INFO - step: 25680 loss: 2.6169 memory: 122.04GiB(87.57%) tps: 10,325 tflops: 492.08 mfu: 49.76% global_avg_ntp_loss: 0.7155 
global_avg_top_loss: 1.9014 +[titan] 2025-09-09 16:42:36,428 - root - INFO - lr: 7.2091e-06 gnorm: 0.51 [1 day, 23:07:08<1 day, 2:16:30] +[titan] 2025-09-09 16:43:08,422 - root - INFO - step: 25685 loss: 2.7441 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7760 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 16:43:08,422 - root - INFO - lr: 7.2059e-06 gnorm: 0.36 [1 day, 23:07:40<1 day, 2:15:56] +[titan] 2025-09-09 16:43:40,546 - root - INFO - step: 25690 loss: 2.7775 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9846 +[titan] 2025-09-09 16:43:40,546 - root - INFO - lr: 7.2027e-06 gnorm: 0.37 [1 day, 23:08:12<1 day, 2:15:22] +[titan] 2025-09-09 16:44:12,481 - root - INFO - step: 25695 loss: 2.6731 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7430 global_avg_top_loss: 1.9301 +[titan] 2025-09-09 16:44:12,481 - root - INFO - lr: 7.1994e-06 gnorm: 0.42 [1 day, 23:08:44<1 day, 2:14:49] +[titan] 2025-09-09 16:44:38,153 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:44:44,521 - root - INFO - step: 25700 loss: 2.7730 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.44 mfu: 49.29% global_avg_ntp_loss: 0.7941 global_avg_top_loss: 1.9789 +[titan] 2025-09-09 16:44:44,521 - root - INFO - lr: 7.1962e-06 gnorm: 0.38 [1 day, 23:09:16<1 day, 2:14:15] +[titan] 2025-09-09 16:45:16,478 - root - INFO - step: 25705 loss: 2.7261 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9531 +[titan] 2025-09-09 16:45:16,478 - root - INFO - lr: 7.1930e-06 gnorm: 0.37 [1 day, 23:09:48<1 day, 2:13:42] +[titan] 2025-09-09 16:45:48,423 - root - INFO - step: 25710 loss: 2.7508 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9686 +[titan] 2025-09-09 16:45:48,424 - root - INFO - lr: 7.1897e-06 gnorm: 0.37 [1 day, 23:10:20<1 day, 2:13:08] +[titan] 2025-09-09 16:46:20,281 - root - INFO - step: 25715 loss: 2.6996 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.7619 global_avg_top_loss: 1.9377 +[titan] 2025-09-09 16:46:20,281 - root - INFO - lr: 7.1865e-06 gnorm: 0.36 [1 day, 23:10:51<1 day, 2:12:34] +[titan] 2025-09-09 16:46:52,243 - root - INFO - step: 25720 loss: 2.7088 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 16:46:52,244 - root - INFO - lr: 7.1833e-06 gnorm: 3.44 [1 day, 23:11:23<1 day, 2:12:01] +[titan] 2025-09-09 16:47:24,367 - root - INFO - step: 25725 loss: 2.6939 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.17 mfu: 49.16% global_avg_ntp_loss: 0.7550 global_avg_top_loss: 1.9389 +[titan] 2025-09-09 16:47:24,367 - root - INFO - lr: 7.1800e-06 gnorm: 0.37 [1 day, 23:11:55<1 day, 2:11:27] +[titan] 2025-09-09 16:47:56,420 - root - INFO - step: 25730 loss: 2.7190 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.24 mfu: 49.27% global_avg_ntp_loss: 0.7672 
global_avg_top_loss: 1.9518 +[titan] 2025-09-09 16:47:56,420 - root - INFO - lr: 7.1768e-06 gnorm: 0.38 [1 day, 23:12:28<1 day, 2:10:54] +[titan] 2025-09-09 16:48:28,327 - root - INFO - step: 25735 loss: 3.1863 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.46 mfu: 49.49% global_avg_ntp_loss: 1.0377 global_avg_top_loss: 2.1486 +[titan] 2025-09-09 16:48:28,327 - root - INFO - lr: 7.1736e-06 gnorm: 0.37 [1 day, 23:12:59<1 day, 2:10:20] +[titan] 2025-09-09 16:49:00,377 - root - INFO - step: 25740 loss: 2.7238 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7691 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 16:49:00,377 - root - INFO - lr: 7.1703e-06 gnorm: 0.36 [1 day, 23:13:32<1 day, 2:09:46] +[titan] 2025-09-09 16:49:32,665 - root - INFO - step: 25745 loss: 2.8013 memory: 122.04GiB(87.57%) tps: 10,149 tflops: 483.70 mfu: 48.91% global_avg_ntp_loss: 0.8027 global_avg_top_loss: 1.9986 +[titan] 2025-09-09 16:49:32,665 - root - INFO - lr: 7.1671e-06 gnorm: 0.46 [1 day, 23:14:04<1 day, 2:09:13] +[titan] 2025-09-09 16:49:58,132 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:50:04,487 - root - INFO - step: 25750 loss: 2.9044 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.77 mfu: 49.62% global_avg_ntp_loss: 0.8640 global_avg_top_loss: 2.0404 +[titan] 2025-09-09 16:50:04,487 - root - INFO - lr: 7.1639e-06 gnorm: 0.36 [1 day, 23:14:36<1 day, 2:08:39] +[titan] 2025-09-09 16:50:36,447 - root - INFO - step: 25755 loss: 2.7662 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9803 +[titan] 2025-09-09 16:50:36,447 - root - INFO - lr: 7.1606e-06 gnorm: 0.35 [1 day, 23:15:08<1 day, 2:08:06] +[titan] 2025-09-09 16:51:08,410 - root - INFO - step: 25760 loss: 2.6984 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.61 mfu: 49.40% global_avg_ntp_loss: 0.7640 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 16:51:08,410 - root - INFO - lr: 7.1574e-06 gnorm: 0.38 [1 day, 23:15:40<1 day, 2:07:32] +[titan] 2025-09-09 16:51:40,427 - root - INFO - step: 25765 loss: 2.7768 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9820 +[titan] 2025-09-09 16:51:40,428 - root - INFO - lr: 7.1542e-06 gnorm: 0.37 [1 day, 23:16:12<1 day, 2:06:58] +[titan] 2025-09-09 16:52:12,494 - root - INFO - step: 25770 loss: 2.6964 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7578 global_avg_top_loss: 1.9386 +[titan] 2025-09-09 16:52:12,494 - root - INFO - lr: 7.1510e-06 gnorm: 0.36 [1 day, 23:16:44<1 day, 2:06:25] +[titan] 2025-09-09 16:52:44,570 - root - INFO - step: 25775 loss: 2.8399 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.8329 global_avg_top_loss: 2.0070 +[titan] 2025-09-09 16:52:44,570 - root - INFO - lr: 7.1477e-06 gnorm: 0.42 [1 day, 23:17:16<1 day, 2:05:51] +[titan] 2025-09-09 16:53:16,626 - root - INFO - step: 25780 loss: 2.7932 memory: 122.04GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.8052 
global_avg_top_loss: 1.9880 +[titan] 2025-09-09 16:53:16,626 - root - INFO - lr: 7.1445e-06 gnorm: 0.42 [1 day, 23:17:48<1 day, 2:05:18] +[titan] 2025-09-09 16:53:48,539 - root - INFO - step: 25785 loss: 2.7354 memory: 122.04GiB(87.57%) tps: 10,268 tflops: 489.38 mfu: 49.48% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 16:53:48,539 - root - INFO - lr: 7.1413e-06 gnorm: 0.36 [1 day, 23:18:20<1 day, 2:04:44] +[titan] 2025-09-09 16:54:20,638 - root - INFO - step: 25790 loss: 2.7536 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.19% global_avg_ntp_loss: 0.7840 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 16:54:20,638 - root - INFO - lr: 7.1381e-06 gnorm: 0.35 [1 day, 23:18:52<1 day, 2:04:11] +[titan] 2025-09-09 16:54:52,828 - root - INFO - step: 25795 loss: 2.6883 memory: 122.04GiB(87.57%) tps: 10,180 tflops: 485.15 mfu: 49.05% global_avg_ntp_loss: 0.7527 global_avg_top_loss: 1.9357 +[titan] 2025-09-09 16:54:52,829 - root - INFO - lr: 7.1348e-06 gnorm: 0.40 [1 day, 23:19:24<1 day, 2:03:37] +[titan] 2025-09-09 16:55:18,493 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 16:55:25,011 - root - INFO - step: 25800 loss: 2.7297 memory: 122.04GiB(87.57%) tps: 10,182 tflops: 485.28 mfu: 49.07% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9598 +[titan] 2025-09-09 16:55:25,011 - root - INFO - lr: 7.1316e-06 gnorm: 0.37 [1 day, 23:19:56<1 day, 2:03:04] +[titan] 2025-09-09 16:55:57,111 - root - INFO - step: 25805 loss: 2.6895 memory: 122.04GiB(87.57%) tps: 10,208 tflops: 486.51 mfu: 49.19% global_avg_ntp_loss: 0.7543 global_avg_top_loss: 1.9352 +[titan] 2025-09-09 16:55:57,112 - root - INFO - lr: 7.1284e-06 gnorm: 0.39 [1 day, 23:20:28<1 day, 2:02:30] +[titan] 2025-09-09 16:56:29,125 - root - INFO - step: 25810 loss: 2.7379 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 16:56:29,125 - root - INFO - lr: 7.1252e-06 gnorm: 0.36 [1 day, 23:21:00<1 day, 2:01:57] +[titan] 2025-09-09 16:57:01,095 - root - INFO - step: 25815 loss: 2.7877 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7999 global_avg_top_loss: 1.9878 +[titan] 2025-09-09 16:57:01,096 - root - INFO - lr: 7.1219e-06 gnorm: 0.39 [1 day, 23:21:32<1 day, 2:01:23] +[titan] 2025-09-09 16:57:33,368 - root - INFO - step: 25820 loss: 2.6703 memory: 122.04GiB(87.57%) tps: 10,154 tflops: 483.93 mfu: 48.93% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9253 +[titan] 2025-09-09 16:57:33,368 - root - INFO - lr: 7.1187e-06 gnorm: 0.38 [1 day, 23:22:04<1 day, 2:00:50] +[titan] 2025-09-09 16:58:05,416 - root - INFO - step: 25825 loss: 2.7347 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.31 mfu: 49.27% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 16:58:05,416 - root - INFO - lr: 7.1155e-06 gnorm: 0.36 [1 day, 23:22:37<1 day, 2:00:16] +[titan] 2025-09-09 16:58:37,439 - root - INFO - step: 25830 loss: 2.5870 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7084 
global_avg_top_loss: 1.8786 +[titan] 2025-09-09 16:58:37,439 - root - INFO - lr: 7.1123e-06 gnorm: 0.37 [1 day, 23:23:09<1 day, 1:59:42] +[titan] 2025-09-09 16:59:09,624 - root - INFO - step: 25835 loss: 2.7255 memory: 122.04GiB(87.57%) tps: 10,181 tflops: 485.24 mfu: 49.06% global_avg_ntp_loss: 0.7672 global_avg_top_loss: 1.9583 +[titan] 2025-09-09 16:59:09,624 - root - INFO - lr: 7.1091e-06 gnorm: 0.39 [1 day, 23:23:41<1 day, 1:59:09] +[titan] 2025-09-09 16:59:41,635 - root - INFO - step: 25840 loss: 2.8682 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.8473 global_avg_top_loss: 2.0209 +[titan] 2025-09-09 16:59:41,636 - root - INFO - lr: 7.1058e-06 gnorm: 0.40 [1 day, 23:24:13<1 day, 1:58:35] +[titan] 2025-09-09 17:00:13,784 - root - INFO - step: 25845 loss: 2.7899 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.7967 global_avg_top_loss: 1.9932 +[titan] 2025-09-09 17:00:13,785 - root - INFO - lr: 7.1026e-06 gnorm: 0.38 [1 day, 23:24:45<1 day, 1:58:02] +[titan] 2025-09-09 17:00:39,706 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:00:46,109 - root - INFO - step: 25850 loss: 2.6656 memory: 122.04GiB(87.57%) tps: 10,137 tflops: 483.14 mfu: 48.85% global_avg_ntp_loss: 0.7446 global_avg_top_loss: 1.9209 +[titan] 2025-09-09 17:00:46,110 - root - INFO - lr: 7.0994e-06 gnorm: 0.36 [1 day, 23:25:17<1 day, 1:57:28] +[titan] 2025-09-09 17:01:17,993 - root - INFO - step: 25855 loss: 2.7729 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7911 global_avg_top_loss: 1.9819 +[titan] 2025-09-09 17:01:17,993 - root - INFO - lr: 7.0962e-06 gnorm: 0.38 [1 day, 23:25:49<1 day, 1:56:55] +[titan] 2025-09-09 17:01:49,974 - root - INFO - step: 25860 loss: 2.7198 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7680 global_avg_top_loss: 1.9517 +[titan] 2025-09-09 17:01:49,974 - root - INFO - lr: 7.0930e-06 gnorm: 0.37 [1 day, 23:26:21<1 day, 1:56:21] +[titan] 2025-09-09 17:02:21,804 - root - INFO - step: 25865 loss: 2.6994 memory: 122.04GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 17:02:21,804 - root - INFO - lr: 7.0898e-06 gnorm: 0.36 [1 day, 23:26:53<1 day, 1:55:48] +[titan] 2025-09-09 17:02:53,718 - root - INFO - step: 25870 loss: 2.7622 memory: 122.04GiB(87.57%) tps: 10,268 tflops: 489.36 mfu: 49.48% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 17:02:53,718 - root - INFO - lr: 7.0865e-06 gnorm: 0.39 [1 day, 23:27:25<1 day, 1:55:14] +[titan] 2025-09-09 17:03:25,968 - root - INFO - step: 25875 loss: 2.6897 memory: 122.04GiB(87.57%) tps: 10,161 tflops: 484.27 mfu: 48.97% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9343 +[titan] 2025-09-09 17:03:25,968 - root - INFO - lr: 7.0833e-06 gnorm: 0.35 [1 day, 23:27:57<1 day, 1:54:40] +[titan] 2025-09-09 17:03:58,066 - root - INFO - step: 25880 loss: 2.7137 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.54 mfu: 49.20% global_avg_ntp_loss: 0.7647 
global_avg_top_loss: 1.9489 +[titan] 2025-09-09 17:03:58,067 - root - INFO - lr: 7.0801e-06 gnorm: 0.36 [1 day, 23:28:29<1 day, 1:54:07] +[titan] 2025-09-09 17:04:30,040 - root - INFO - step: 25885 loss: 2.6279 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7299 global_avg_top_loss: 1.8980 +[titan] 2025-09-09 17:04:30,041 - root - INFO - lr: 7.0769e-06 gnorm: 0.37 [1 day, 23:29:01<1 day, 1:53:33] +[titan] 2025-09-09 17:05:02,081 - root - INFO - step: 25890 loss: 2.7596 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7935 global_avg_top_loss: 1.9662 +[titan] 2025-09-09 17:05:02,082 - root - INFO - lr: 7.0737e-06 gnorm: 0.37 [1 day, 23:29:33<1 day, 1:53:00] +[titan] 2025-09-09 17:05:33,879 - root - INFO - step: 25895 loss: 2.7432 memory: 122.04GiB(87.57%) tps: 10,305 tflops: 491.14 mfu: 49.66% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9658 +[titan] 2025-09-09 17:05:33,880 - root - INFO - lr: 7.0705e-06 gnorm: 0.37 [1 day, 23:30:05<1 day, 1:52:26] +[titan] 2025-09-09 17:05:59,651 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:06:06,077 - root - INFO - step: 25900 loss: 2.7325 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.05 mfu: 49.04% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 17:06:06,077 - root - INFO - lr: 7.0673e-06 gnorm: 0.56 [1 day, 23:30:37<1 day, 1:51:53] +[titan] 2025-09-09 17:06:38,040 - root - INFO - step: 25905 loss: 2.8397 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 0.8264 global_avg_top_loss: 2.0133 +[titan] 2025-09-09 17:06:38,041 - root - INFO - lr: 7.0641e-06 gnorm: 0.37 [1 day, 23:31:09<1 day, 1:51:19] +[titan] 2025-09-09 17:07:10,117 - root - INFO - step: 25910 loss: 2.7564 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9749 +[titan] 2025-09-09 17:07:10,117 - root - INFO - lr: 7.0608e-06 gnorm: 0.36 [1 day, 23:31:41<1 day, 1:50:46] +[titan] 2025-09-09 17:07:42,188 - root - INFO - step: 25915 loss: 3.1628 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 1.0202 global_avg_top_loss: 2.1426 +[titan] 2025-09-09 17:07:42,188 - root - INFO - lr: 7.0576e-06 gnorm: 0.36 [1 day, 23:32:13<1 day, 1:50:12] +[titan] 2025-09-09 17:08:14,263 - root - INFO - step: 25920 loss: 2.8122 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.8137 global_avg_top_loss: 1.9985 +[titan] 2025-09-09 17:08:14,263 - root - INFO - lr: 7.0544e-06 gnorm: 0.36 [1 day, 23:32:45<1 day, 1:49:38] +[titan] 2025-09-09 17:08:46,254 - root - INFO - step: 25925 loss: 2.8404 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8278 global_avg_top_loss: 2.0126 +[titan] 2025-09-09 17:08:46,254 - root - INFO - lr: 7.0512e-06 gnorm: 0.66 [1 day, 23:33:17<1 day, 1:49:05] +[titan] 2025-09-09 17:09:18,395 - root - INFO - step: 25930 loss: 2.7268 memory: 122.04GiB(87.57%) tps: 10,195 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.7700 
global_avg_top_loss: 1.9569 +[titan] 2025-09-09 17:09:18,395 - root - INFO - lr: 7.0480e-06 gnorm: 0.37 [1 day, 23:33:49<1 day, 1:48:31] +[titan] 2025-09-09 17:09:50,352 - root - INFO - step: 25935 loss: 2.6415 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9098 +[titan] 2025-09-09 17:09:50,353 - root - INFO - lr: 7.0448e-06 gnorm: 0.36 [1 day, 23:34:21<1 day, 1:47:58] +[titan] 2025-09-09 17:10:22,460 - root - INFO - step: 25940 loss: 2.7190 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.40 mfu: 49.18% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9514 +[titan] 2025-09-09 17:10:22,461 - root - INFO - lr: 7.0416e-06 gnorm: 0.37 [1 day, 23:34:54<1 day, 1:47:24] +[titan] 2025-09-09 17:10:54,645 - root - INFO - step: 25945 loss: 2.7431 memory: 122.04GiB(87.57%) tps: 10,182 tflops: 485.25 mfu: 49.06% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 17:10:54,646 - root - INFO - lr: 7.0384e-06 gnorm: 0.37 [1 day, 23:35:26<1 day, 1:46:51] +[titan] 2025-09-09 17:11:20,180 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:11:26,613 - root - INFO - step: 25950 loss: 2.6404 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7345 global_avg_top_loss: 1.9060 +[titan] 2025-09-09 17:11:26,613 - root - INFO - lr: 7.0352e-06 gnorm: 0.36 [1 day, 23:35:58<1 day, 1:46:17] +[titan] 2025-09-09 17:11:58,761 - root - INFO - step: 25955 loss: 2.7990 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.8003 global_avg_top_loss: 1.9987 +[titan] 2025-09-09 17:11:58,761 - root - INFO - lr: 7.0320e-06 gnorm: 0.37 [1 day, 23:36:30<1 day, 1:45:44] +[titan] 2025-09-09 17:12:30,683 - root - INFO - step: 25960 loss: 2.7329 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9635 +[titan] 2025-09-09 17:12:30,684 - root - INFO - lr: 7.0288e-06 gnorm: 0.37 [1 day, 23:37:02<1 day, 1:45:10] +[titan] 2025-09-09 17:13:02,662 - root - INFO - step: 25965 loss: 2.6478 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 17:13:02,662 - root - INFO - lr: 7.0256e-06 gnorm: 0.38 [1 day, 23:37:34<1 day, 1:44:37] +[titan] 2025-09-09 17:13:34,659 - root - INFO - step: 25970 loss: 2.6278 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7235 global_avg_top_loss: 1.9042 +[titan] 2025-09-09 17:13:34,659 - root - INFO - lr: 7.0224e-06 gnorm: 0.36 [1 day, 23:38:06<1 day, 1:44:03] +[titan] 2025-09-09 17:14:06,778 - root - INFO - step: 25975 loss: 3.0415 memory: 122.04GiB(87.57%) tps: 10,202 tflops: 486.24 mfu: 49.16% global_avg_ntp_loss: 0.9439 global_avg_top_loss: 2.0976 +[titan] 2025-09-09 17:14:06,778 - root - INFO - lr: 7.0192e-06 gnorm: 0.37 [1 day, 23:38:38<1 day, 1:43:30] +[titan] 2025-09-09 17:14:38,819 - root - INFO - step: 25980 loss: 2.6871 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7530 
global_avg_top_loss: 1.9341 +[titan] 2025-09-09 17:14:38,819 - root - INFO - lr: 7.0160e-06 gnorm: 0.36 [1 day, 23:39:10<1 day, 1:42:56] +[titan] 2025-09-09 17:15:10,777 - root - INFO - step: 25985 loss: 2.6717 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7479 global_avg_top_loss: 1.9238 +[titan] 2025-09-09 17:15:10,777 - root - INFO - lr: 7.0128e-06 gnorm: 0.36 [1 day, 23:39:42<1 day, 1:42:22] +[titan] 2025-09-09 17:15:42,889 - root - INFO - step: 25990 loss: 2.7102 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.34 mfu: 49.18% global_avg_ntp_loss: 0.7640 global_avg_top_loss: 1.9462 +[titan] 2025-09-09 17:15:42,889 - root - INFO - lr: 7.0096e-06 gnorm: 0.37 [1 day, 23:40:14<1 day, 1:41:49] +[titan] 2025-09-09 17:16:15,003 - root - INFO - step: 25995 loss: 3.1320 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 1.0082 global_avg_top_loss: 2.1237 +[titan] 2025-09-09 17:16:15,003 - root - INFO - lr: 7.0064e-06 gnorm: 0.37 [1 day, 23:40:46<1 day, 1:41:15] +[titan] 2025-09-09 17:16:40,779 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:16:47,196 - root - INFO - step: 26000 loss: 2.6892 memory: 122.04GiB(87.57%) tps: 10,179 tflops: 485.11 mfu: 49.05% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 17:16:47,196 - root - INFO - lr: 7.0032e-06 gnorm: 0.35 [1 day, 23:41:18<1 day, 1:40:42] +[titan] 2025-09-09 17:17:19,262 - root - INFO - step: 26005 loss: 2.6976 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 17:17:19,262 - root - INFO - lr: 7.0000e-06 gnorm: 0.38 [1 day, 23:41:50<1 day, 1:40:08] +[titan] 2025-09-09 17:17:51,312 - root - INFO - step: 26010 loss: 2.7888 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7977 global_avg_top_loss: 1.9911 +[titan] 2025-09-09 17:17:51,312 - root - INFO - lr: 6.9968e-06 gnorm: 0.37 [1 day, 23:42:22<1 day, 1:39:35] +[titan] 2025-09-09 17:18:23,360 - root - INFO - step: 26015 loss: 2.7930 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.8097 global_avg_top_loss: 1.9833 +[titan] 2025-09-09 17:18:23,360 - root - INFO - lr: 6.9936e-06 gnorm: 0.48 [1 day, 23:42:54<1 day, 1:39:01] +[titan] 2025-09-09 17:18:55,360 - root - INFO - step: 26020 loss: 2.7541 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.7820 global_avg_top_loss: 1.9721 +[titan] 2025-09-09 17:18:55,360 - root - INFO - lr: 6.9904e-06 gnorm: 0.36 [1 day, 23:43:26<1 day, 1:38:28] +[titan] 2025-09-09 17:19:27,445 - root - INFO - step: 26025 loss: 2.7449 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7743 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 17:19:27,446 - root - INFO - lr: 6.9872e-06 gnorm: 0.45 [1 day, 23:43:59<1 day, 1:37:54] +[titan] 2025-09-09 17:19:59,479 - root - INFO - step: 26030 loss: 2.7256 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.53 mfu: 49.30% global_avg_ntp_loss: 0.7765 
global_avg_top_loss: 1.9492 +[titan] 2025-09-09 17:19:59,479 - root - INFO - lr: 6.9840e-06 gnorm: 0.38 [1 day, 23:44:31<1 day, 1:37:21] +[titan] 2025-09-09 17:20:31,585 - root - INFO - step: 26035 loss: 2.7881 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 0.7991 global_avg_top_loss: 1.9889 +[titan] 2025-09-09 17:20:31,586 - root - INFO - lr: 6.9808e-06 gnorm: 0.41 [1 day, 23:45:03<1 day, 1:36:47] +[titan] 2025-09-09 17:21:03,561 - root - INFO - step: 26040 loss: 2.7484 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 17:21:03,562 - root - INFO - lr: 6.9776e-06 gnorm: 0.37 [1 day, 23:45:35<1 day, 1:36:14] +[titan] 2025-09-09 17:21:35,665 - root - INFO - step: 26045 loss: 3.0163 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.9318 global_avg_top_loss: 2.0845 +[titan] 2025-09-09 17:21:35,666 - root - INFO - lr: 6.9744e-06 gnorm: 0.36 [1 day, 23:46:07<1 day, 1:35:40] +[titan] 2025-09-09 17:22:01,268 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:22:07,694 - root - INFO - step: 26050 loss: 2.7443 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.7822 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 17:22:07,694 - root - INFO - lr: 6.9712e-06 gnorm: 0.36 [1 day, 23:46:39<1 day, 1:35:07] +[titan] 2025-09-09 17:22:39,719 - root - INFO - step: 26055 loss: 2.7208 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7660 global_avg_top_loss: 1.9549 +[titan] 2025-09-09 17:22:39,719 - root - INFO - lr: 6.9680e-06 gnorm: 0.38 [1 day, 23:47:11<1 day, 1:34:33] +[titan] 2025-09-09 17:23:11,858 - root - INFO - step: 26060 loss: 2.6359 memory: 122.04GiB(87.57%) tps: 10,196 tflops: 485.93 mfu: 49.13% global_avg_ntp_loss: 0.7269 global_avg_top_loss: 1.9090 +[titan] 2025-09-09 17:23:11,859 - root - INFO - lr: 6.9648e-06 gnorm: 0.50 [1 day, 23:47:43<1 day, 1:34:00] +[titan] 2025-09-09 17:23:43,902 - root - INFO - step: 26065 loss: 2.7613 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.38 mfu: 49.28% global_avg_ntp_loss: 0.7847 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 17:23:43,902 - root - INFO - lr: 6.9616e-06 gnorm: 0.40 [1 day, 23:48:15<1 day, 1:33:26] +[titan] 2025-09-09 17:24:15,916 - root - INFO - step: 26070 loss: 2.7550 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9740 +[titan] 2025-09-09 17:24:15,916 - root - INFO - lr: 6.9585e-06 gnorm: 0.38 [1 day, 23:48:47<1 day, 1:32:52] +[titan] 2025-09-09 17:24:47,927 - root - INFO - step: 26075 loss: 3.1116 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.9942 global_avg_top_loss: 2.1173 +[titan] 2025-09-09 17:24:47,927 - root - INFO - lr: 6.9553e-06 gnorm: 0.38 [1 day, 23:49:19<1 day, 1:32:19] +[titan] 2025-09-09 17:25:20,262 - root - INFO - step: 26080 loss: 2.7241 memory: 122.04GiB(87.57%) tps: 10,134 tflops: 482.98 mfu: 48.84% global_avg_ntp_loss: 0.7668 
global_avg_top_loss: 1.9573 +[titan] 2025-09-09 17:25:20,263 - root - INFO - lr: 6.9521e-06 gnorm: 0.38 [1 day, 23:49:51<1 day, 1:31:46] +[titan] 2025-09-09 17:25:52,316 - root - INFO - step: 26085 loss: 2.6729 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.26% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9225 +[titan] 2025-09-09 17:25:52,316 - root - INFO - lr: 6.9489e-06 gnorm: 0.37 [1 day, 23:50:23<1 day, 1:31:12] +[titan] 2025-09-09 17:26:24,324 - root - INFO - step: 26090 loss: 2.7296 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7711 global_avg_top_loss: 1.9585 +[titan] 2025-09-09 17:26:24,325 - root - INFO - lr: 6.9457e-06 gnorm: 0.37 [1 day, 23:50:55<1 day, 1:30:38] +[titan] 2025-09-09 17:26:56,361 - root - INFO - step: 26095 loss: 2.7303 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.49 mfu: 49.29% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9551 +[titan] 2025-09-09 17:26:56,361 - root - INFO - lr: 6.9425e-06 gnorm: 0.37 [1 day, 23:51:27<1 day, 1:30:05] +[titan] 2025-09-09 17:27:22,101 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:27:28,468 - root - INFO - step: 26100 loss: 2.7222 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.43 mfu: 49.18% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9546 +[titan] 2025-09-09 17:27:28,468 - root - INFO - lr: 6.9393e-06 gnorm: 0.37 [1 day, 23:52:00<1 day, 1:29:31] +[titan] 2025-09-09 17:28:00,388 - root - INFO - step: 26105 loss: 2.6318 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9036 +[titan] 2025-09-09 17:28:00,388 - root - INFO - lr: 6.9361e-06 gnorm: 0.35 [1 day, 23:52:31<1 day, 1:28:58] +[titan] 2025-09-09 17:28:32,441 - root - INFO - step: 26110 loss: 2.9489 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.24 mfu: 49.27% global_avg_ntp_loss: 0.8720 global_avg_top_loss: 2.0769 +[titan] 2025-09-09 17:28:32,441 - root - INFO - lr: 6.9330e-06 gnorm: 0.38 [1 day, 23:53:03<1 day, 1:28:24] +[titan] 2025-09-09 17:28:45,506 - root - INFO - Dumping profiler traces at step 26112 +[titan] 2025-09-09 17:28:45,565 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 17:29:04,741 - root - INFO - step: 26115 loss: 2.6586 memory: 122.04GiB(87.57%) tps: 10,145 tflops: 483.51 mfu: 48.89% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9181 +[titan] 2025-09-09 17:29:04,742 - root - INFO - lr: 6.9298e-06 gnorm: 0.41 [1 day, 23:53:36<1 day, 1:27:51] +[titan] 2025-09-09 17:29:36,777 - root - INFO - step: 26120 loss: 2.7465 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7785 global_avg_top_loss: 1.9680 +[titan] 2025-09-09 17:29:36,777 - root - INFO - lr: 6.9266e-06 gnorm: 0.40 [1 day, 23:54:08<1 day, 1:27:17] +[titan] 2025-09-09 17:30:08,933 - root - INFO - step: 26125 loss: 3.2059 memory: 122.04GiB(87.57%) tps: 10,191 tflops: 485.67 mfu: 49.11% global_avg_ntp_loss: 1.0409 global_avg_top_loss: 2.1650 +[titan] 2025-09-09 17:30:08,933 - root - INFO - lr: 6.9234e-06 gnorm: 0.39 [1 day, 23:54:40<1 day, 
1:26:44] +[titan] 2025-09-09 17:30:41,084 - root - INFO - step: 26130 loss: 2.7242 memory: 122.04GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 0.7707 global_avg_top_loss: 1.9535 +[titan] 2025-09-09 17:30:41,085 - root - INFO - lr: 6.9202e-06 gnorm: 0.39 [1 day, 23:55:12<1 day, 1:26:10] +[titan] 2025-09-09 17:31:12,963 - root - INFO - step: 26135 loss: 2.6939 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.90 mfu: 49.53% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 17:31:12,963 - root - INFO - lr: 6.9170e-06 gnorm: 0.37 [1 day, 23:55:44<1 day, 1:25:37] +[titan] 2025-09-09 17:31:44,751 - root - INFO - step: 26140 loss: 2.7265 memory: 122.04GiB(87.57%) tps: 10,310 tflops: 491.36 mfu: 49.68% global_avg_ntp_loss: 0.7773 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 17:31:44,751 - root - INFO - lr: 6.9139e-06 gnorm: 0.38 [1 day, 23:56:16<1 day, 1:25:03] +[titan] 2025-09-09 17:32:16,788 - root - INFO - step: 26145 loss: 2.7751 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.48 mfu: 49.29% global_avg_ntp_loss: 0.7924 global_avg_top_loss: 1.9827 +[titan] 2025-09-09 17:32:16,789 - root - INFO - lr: 6.9107e-06 gnorm: 0.36 [1 day, 23:56:48<1 day, 1:24:30] +[titan] 2025-09-09 17:32:42,415 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:32:48,788 - root - INFO - step: 26150 loss: 2.7013 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.05 mfu: 49.35% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 17:32:48,789 - root - INFO - lr: 6.9075e-06 gnorm: 0.35 [1 day, 23:57:20<1 day, 1:23:56] +[titan] 2025-09-09 17:33:20,777 - root - INFO - step: 26155 loss: 3.1825 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.21 mfu: 49.36% global_avg_ntp_loss: 1.0251 global_avg_top_loss: 2.1575 +[titan] 2025-09-09 17:33:20,777 - root - INFO - lr: 6.9043e-06 gnorm: 0.51 [1 day, 23:57:52<1 day, 1:23:23] +[titan] 2025-09-09 17:33:52,528 - root - INFO - step: 26160 loss: 2.7225 memory: 122.04GiB(87.57%) tps: 10,321 tflops: 491.87 mfu: 49.73% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9513 +[titan] 2025-09-09 17:33:52,529 - root - INFO - lr: 6.9011e-06 gnorm: 0.37 [1 day, 23:58:24<1 day, 1:22:49] +[titan] 2025-09-09 17:34:24,606 - root - INFO - step: 26165 loss: 2.6600 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.86 mfu: 49.23% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9150 +[titan] 2025-09-09 17:34:24,607 - root - INFO - lr: 6.8980e-06 gnorm: 0.37 [1 day, 23:58:56<1 day, 1:22:15] +[titan] 2025-09-09 17:34:56,581 - root - INFO - step: 26170 loss: 2.6813 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7488 global_avg_top_loss: 1.9324 +[titan] 2025-09-09 17:34:56,581 - root - INFO - lr: 6.8948e-06 gnorm: 0.38 [1 day, 23:59:28<1 day, 1:21:42] +[titan] 2025-09-09 17:35:28,771 - root - INFO - step: 26175 loss: 2.7274 memory: 122.04GiB(87.57%) tps: 10,180 tflops: 485.17 mfu: 49.06% global_avg_ntp_loss: 0.7659 global_avg_top_loss: 1.9615 +[titan] 2025-09-09 17:35:28,771 - root - INFO - lr: 6.8916e-06 gnorm: 0.37 [2 days, 0:00:00<1 day, 1:21:08] +[titan] 2025-09-09 17:36:00,718 - root - INFO - step: 26180 loss: 2.6162 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7187 
global_avg_top_loss: 1.8974 +[titan] 2025-09-09 17:36:00,719 - root - INFO - lr: 6.8884e-06 gnorm: 0.37 [2 days, 0:00:32<1 day, 1:20:35] +[titan] 2025-09-09 17:36:32,747 - root - INFO - step: 26185 loss: 2.7548 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7824 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 17:36:32,748 - root - INFO - lr: 6.8853e-06 gnorm: 0.42 [2 days, 0:01:04<1 day, 1:20:01] +[titan] 2025-09-09 17:37:04,588 - root - INFO - step: 26190 loss: 2.9840 memory: 122.04GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.8906 global_avg_top_loss: 2.0935 +[titan] 2025-09-09 17:37:04,588 - root - INFO - lr: 6.8821e-06 gnorm: 0.46 [2 days, 0:01:36<1 day, 1:19:28] +[titan] 2025-09-09 17:37:36,421 - root - INFO - step: 26195 loss: 2.7330 memory: 122.04GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9616 +[titan] 2025-09-09 17:37:36,422 - root - INFO - lr: 6.8789e-06 gnorm: 0.37 [2 days, 0:02:07<1 day, 1:18:54] +[titan] 2025-09-09 17:38:01,898 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:38:08,364 - root - INFO - step: 26200 loss: 2.8158 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.92 mfu: 49.44% global_avg_ntp_loss: 0.8180 global_avg_top_loss: 1.9978 +[titan] 2025-09-09 17:38:08,365 - root - INFO - lr: 6.8757e-06 gnorm: 0.46 [2 days, 0:02:39<1 day, 1:18:21] +[titan] 2025-09-09 17:38:40,478 - root - INFO - step: 26205 loss: 3.2506 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.32 mfu: 49.17% global_avg_ntp_loss: 1.0605 global_avg_top_loss: 2.1902 +[titan] 2025-09-09 17:38:40,478 - root - INFO - lr: 6.8726e-06 gnorm: 0.37 [2 days, 0:03:12<1 day, 1:17:47] +[titan] 2025-09-09 17:39:12,409 - root - INFO - step: 26210 loss: 2.7501 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7823 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 17:39:12,409 - root - INFO - lr: 6.8694e-06 gnorm: 0.40 [2 days, 0:03:43<1 day, 1:17:13] +[titan] 2025-09-09 17:39:44,557 - root - INFO - step: 26215 loss: 2.7479 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.80 mfu: 49.12% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9665 +[titan] 2025-09-09 17:39:44,557 - root - INFO - lr: 6.8662e-06 gnorm: 0.36 [2 days, 0:04:16<1 day, 1:16:40] +[titan] 2025-09-09 17:40:16,476 - root - INFO - step: 26220 loss: 2.7353 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9600 +[titan] 2025-09-09 17:40:16,476 - root - INFO - lr: 6.8631e-06 gnorm: 0.41 [2 days, 0:04:47<1 day, 1:16:06] +[titan] 2025-09-09 17:40:48,348 - root - INFO - step: 26225 loss: 2.6104 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7270 global_avg_top_loss: 1.8834 +[titan] 2025-09-09 17:40:48,348 - root - INFO - lr: 6.8599e-06 gnorm: 0.40 [2 days, 0:05:19<1 day, 1:15:33] +[titan] 2025-09-09 17:41:20,515 - root - INFO - step: 26230 loss: 2.7211 memory: 122.04GiB(87.57%) tps: 10,187 tflops: 485.52 mfu: 49.09% global_avg_ntp_loss: 0.7662 
global_avg_top_loss: 1.9548 +[titan] 2025-09-09 17:41:20,515 - root - INFO - lr: 6.8567e-06 gnorm: 0.37 [2 days, 0:05:52<1 day, 1:14:59] +[titan] 2025-09-09 17:41:52,524 - root - INFO - step: 26235 loss: 3.1629 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 1.0223 global_avg_top_loss: 2.1405 +[titan] 2025-09-09 17:41:52,525 - root - INFO - lr: 6.8535e-06 gnorm: 0.39 [2 days, 0:06:24<1 day, 1:14:26] +[titan] 2025-09-09 17:42:24,723 - root - INFO - step: 26240 loss: 2.6700 memory: 122.04GiB(87.57%) tps: 10,177 tflops: 485.03 mfu: 49.04% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9224 +[titan] 2025-09-09 17:42:24,724 - root - INFO - lr: 6.8504e-06 gnorm: 0.36 [2 days, 0:06:56<1 day, 1:13:52] +[titan] 2025-09-09 17:42:56,549 - root - INFO - step: 26245 loss: 2.8124 memory: 122.04GiB(87.57%) tps: 10,296 tflops: 490.72 mfu: 49.62% global_avg_ntp_loss: 0.8050 global_avg_top_loss: 2.0074 +[titan] 2025-09-09 17:42:56,549 - root - INFO - lr: 6.8472e-06 gnorm: 0.45 [2 days, 0:07:28<1 day, 1:13:19] +[titan] 2025-09-09 17:43:22,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:43:28,510 - root - INFO - step: 26250 loss: 2.7200 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9529 +[titan] 2025-09-09 17:43:28,510 - root - INFO - lr: 6.8440e-06 gnorm: 0.39 [2 days, 0:08:00<1 day, 1:12:45] +[titan] 2025-09-09 17:44:00,493 - root - INFO - step: 26255 loss: 2.7502 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9704 +[titan] 2025-09-09 17:44:00,494 - root - INFO - lr: 6.8409e-06 gnorm: 0.40 [2 days, 0:08:32<1 day, 1:12:12] +[titan] 2025-09-09 17:44:32,454 - root - INFO - step: 26260 loss: 2.7866 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 17:44:32,455 - root - INFO - lr: 6.8377e-06 gnorm: 0.38 [2 days, 0:09:03<1 day, 1:11:38] +[titan] 2025-09-09 17:45:04,432 - root - INFO - step: 26265 loss: 2.7524 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9748 +[titan] 2025-09-09 17:45:04,432 - root - INFO - lr: 6.8345e-06 gnorm: 0.44 [2 days, 0:09:35<1 day, 1:11:05] +[titan] 2025-09-09 17:45:36,550 - root - INFO - step: 26270 loss: 3.0611 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.9269 global_avg_top_loss: 2.1343 +[titan] 2025-09-09 17:45:36,550 - root - INFO - lr: 6.8314e-06 gnorm: 0.47 [2 days, 0:10:08<1 day, 1:10:31] +[titan] 2025-09-09 17:46:08,719 - root - INFO - step: 26275 loss: 2.7708 memory: 122.04GiB(87.57%) tps: 10,186 tflops: 485.47 mfu: 49.09% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9801 +[titan] 2025-09-09 17:46:08,719 - root - INFO - lr: 6.8282e-06 gnorm: 0.44 [2 days, 0:10:40<1 day, 1:09:58] +[titan] 2025-09-09 17:46:40,775 - root - INFO - step: 26280 loss: 2.7453 memory: 122.04GiB(87.57%) tps: 10,222 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.7787 
global_avg_top_loss: 1.9666 +[titan] 2025-09-09 17:46:40,775 - root - INFO - lr: 6.8251e-06 gnorm: 0.38 [2 days, 0:11:12<1 day, 1:09:24] +[titan] 2025-09-09 17:47:12,851 - root - INFO - step: 26285 loss: 3.2434 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.88 mfu: 49.23% global_avg_ntp_loss: 1.0574 global_avg_top_loss: 2.1860 +[titan] 2025-09-09 17:47:12,851 - root - INFO - lr: 6.8219e-06 gnorm: 0.36 [2 days, 0:11:44<1 day, 1:08:51] +[titan] 2025-09-09 17:47:44,915 - root - INFO - step: 26290 loss: 2.6759 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7502 global_avg_top_loss: 1.9257 +[titan] 2025-09-09 17:47:44,916 - root - INFO - lr: 6.8187e-06 gnorm: 0.37 [2 days, 0:12:16<1 day, 1:08:17] +[titan] 2025-09-09 17:48:16,760 - root - INFO - step: 26295 loss: 2.7558 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.43 mfu: 49.59% global_avg_ntp_loss: 0.7835 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 17:48:16,760 - root - INFO - lr: 6.8156e-06 gnorm: 0.37 [2 days, 0:12:48<1 day, 1:07:44] +[titan] 2025-09-09 17:48:42,599 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:48:48,994 - root - INFO - step: 26300 loss: 2.6701 memory: 122.04GiB(87.57%) tps: 10,166 tflops: 484.50 mfu: 48.99% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9216 +[titan] 2025-09-09 17:48:48,994 - root - INFO - lr: 6.8124e-06 gnorm: 0.39 [2 days, 0:13:20<1 day, 1:07:10] +[titan] 2025-09-09 17:49:21,003 - root - INFO - step: 26305 loss: 2.6700 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7552 global_avg_top_loss: 1.9149 +[titan] 2025-09-09 17:49:21,003 - root - INFO - lr: 6.8093e-06 gnorm: 0.36 [2 days, 0:13:52<1 day, 1:06:37] +[titan] 2025-09-09 17:49:53,125 - root - INFO - step: 26310 loss: 2.7345 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.19 mfu: 49.16% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9608 +[titan] 2025-09-09 17:49:53,126 - root - INFO - lr: 6.8061e-06 gnorm: 0.38 [2 days, 0:14:24<1 day, 1:06:03] +[titan] 2025-09-09 17:50:25,099 - root - INFO - step: 26315 loss: 2.5116 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.6761 global_avg_top_loss: 1.8355 +[titan] 2025-09-09 17:50:25,099 - root - INFO - lr: 6.8029e-06 gnorm: 0.36 [2 days, 0:14:56<1 day, 1:05:30] +[titan] 2025-09-09 17:50:57,141 - root - INFO - step: 26320 loss: 2.6805 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7471 global_avg_top_loss: 1.9334 +[titan] 2025-09-09 17:50:57,141 - root - INFO - lr: 6.7998e-06 gnorm: 0.36 [2 days, 0:15:28<1 day, 1:04:56] +[titan] 2025-09-09 17:51:28,971 - root - INFO - step: 26325 loss: 2.7257 memory: 122.04GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7710 global_avg_top_loss: 1.9547 +[titan] 2025-09-09 17:51:28,972 - root - INFO - lr: 6.7966e-06 gnorm: 0.37 [2 days, 0:16:00<1 day, 1:04:23] +[titan] 2025-09-09 17:52:01,087 - root - INFO - step: 26330 loss: 2.8199 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.28 mfu: 49.17% global_avg_ntp_loss: 0.8124 
global_avg_top_loss: 2.0075 +[titan] 2025-09-09 17:52:01,088 - root - INFO - lr: 6.7935e-06 gnorm: 0.38 [2 days, 0:16:32<1 day, 1:03:49] +[titan] 2025-09-09 17:52:33,236 - root - INFO - step: 26335 loss: 2.7084 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.79 mfu: 49.12% global_avg_ntp_loss: 0.7606 global_avg_top_loss: 1.9478 +[titan] 2025-09-09 17:52:33,236 - root - INFO - lr: 6.7903e-06 gnorm: 0.36 [2 days, 0:17:04<1 day, 1:03:16] +[titan] 2025-09-09 17:53:05,068 - root - INFO - step: 26340 loss: 2.7015 memory: 122.04GiB(87.57%) tps: 10,294 tflops: 490.62 mfu: 49.61% global_avg_ntp_loss: 0.7610 global_avg_top_loss: 1.9405 +[titan] 2025-09-09 17:53:05,068 - root - INFO - lr: 6.7872e-06 gnorm: 0.36 [2 days, 0:17:36<1 day, 1:02:42] +[titan] 2025-09-09 17:53:37,161 - root - INFO - step: 26345 loss: 2.6775 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7480 global_avg_top_loss: 1.9296 +[titan] 2025-09-09 17:53:37,161 - root - INFO - lr: 6.7840e-06 gnorm: 0.37 [2 days, 0:18:08<1 day, 1:02:09] +[titan] 2025-09-09 17:54:02,816 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:54:09,188 - root - INFO - step: 26350 loss: 2.7072 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.63 mfu: 49.31% global_avg_ntp_loss: 0.7623 global_avg_top_loss: 1.9449 +[titan] 2025-09-09 17:54:09,188 - root - INFO - lr: 6.7808e-06 gnorm: 0.37 [2 days, 0:18:40<1 day, 1:01:35] +[titan] 2025-09-09 17:54:41,133 - root - INFO - step: 26355 loss: 2.7382 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7775 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 17:54:41,134 - root - INFO - lr: 6.7777e-06 gnorm: 0.37 [2 days, 0:19:12<1 day, 1:01:01] +[titan] 2025-09-09 17:55:13,224 - root - INFO - step: 26360 loss: 2.7167 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.67 mfu: 49.21% global_avg_ntp_loss: 0.7659 global_avg_top_loss: 1.9509 +[titan] 2025-09-09 17:55:13,224 - root - INFO - lr: 6.7745e-06 gnorm: 0.39 [2 days, 0:19:44<1 day, 1:00:28] +[titan] 2025-09-09 17:55:45,334 - root - INFO - step: 26365 loss: 3.2179 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.37 mfu: 49.18% global_avg_ntp_loss: 1.0477 global_avg_top_loss: 2.1702 +[titan] 2025-09-09 17:55:45,334 - root - INFO - lr: 6.7714e-06 gnorm: 0.36 [2 days, 0:20:16<1 day, 0:59:55] +[titan] 2025-09-09 17:56:17,293 - root - INFO - step: 26370 loss: 2.7177 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7681 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 17:56:17,293 - root - INFO - lr: 6.7682e-06 gnorm: 0.37 [2 days, 0:20:48<1 day, 0:59:21] +[titan] 2025-09-09 17:56:49,398 - root - INFO - step: 26375 loss: 2.6157 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.44 mfu: 49.19% global_avg_ntp_loss: 0.7332 global_avg_top_loss: 1.8825 +[titan] 2025-09-09 17:56:49,398 - root - INFO - lr: 6.7651e-06 gnorm: 0.38 [2 days, 0:21:20<1 day, 0:58:48] +[titan] 2025-09-09 17:57:21,626 - root - INFO - step: 26380 loss: 2.6597 memory: 122.04GiB(87.57%) tps: 10,168 tflops: 484.60 mfu: 49.00% global_avg_ntp_loss: 0.7400 
global_avg_top_loss: 1.9197 +[titan] 2025-09-09 17:57:21,626 - root - INFO - lr: 6.7619e-06 gnorm: 0.36 [2 days, 0:21:53<1 day, 0:58:14] +[titan] 2025-09-09 17:57:53,624 - root - INFO - step: 26385 loss: 2.6869 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7589 global_avg_top_loss: 1.9280 +[titan] 2025-09-09 17:57:53,625 - root - INFO - lr: 6.7588e-06 gnorm: 0.37 [2 days, 0:22:25<1 day, 0:57:41] +[titan] 2025-09-09 17:58:25,650 - root - INFO - step: 26390 loss: 2.8947 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.8609 global_avg_top_loss: 2.0338 +[titan] 2025-09-09 17:58:25,651 - root - INFO - lr: 6.7556e-06 gnorm: 0.36 [2 days, 0:22:57<1 day, 0:57:07] +[titan] 2025-09-09 17:58:57,689 - root - INFO - step: 26395 loss: 2.5015 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.6745 global_avg_top_loss: 1.8270 +[titan] 2025-09-09 17:58:57,690 - root - INFO - lr: 6.7525e-06 gnorm: 0.36 [2 days, 0:23:29<1 day, 0:56:34] +[titan] 2025-09-09 17:59:23,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 17:59:29,796 - root - INFO - step: 26400 loss: 2.7635 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7863 global_avg_top_loss: 1.9771 +[titan] 2025-09-09 17:59:29,797 - root - INFO - lr: 6.7493e-06 gnorm: 0.38 [2 days, 0:24:01<1 day, 0:56:00] +[titan] 2025-09-09 18:00:01,816 - root - INFO - step: 26405 loss: 2.6588 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7360 global_avg_top_loss: 1.9228 +[titan] 2025-09-09 18:00:01,817 - root - INFO - lr: 6.7462e-06 gnorm: 0.37 [2 days, 0:24:33<1 day, 0:55:27] +[titan] 2025-09-09 18:00:34,083 - root - INFO - step: 26410 loss: 2.7292 memory: 122.04GiB(87.57%) tps: 10,156 tflops: 484.02 mfu: 48.94% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9563 +[titan] 2025-09-09 18:00:34,083 - root - INFO - lr: 6.7431e-06 gnorm: 0.43 [2 days, 0:25:05<1 day, 0:54:53] +[titan] 2025-09-09 18:01:06,220 - root - INFO - step: 26415 loss: 2.7316 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.7731 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 18:01:06,220 - root - INFO - lr: 6.7399e-06 gnorm: 0.36 [2 days, 0:25:37<1 day, 0:54:20] +[titan] 2025-09-09 18:01:38,095 - root - INFO - step: 26420 loss: 2.7657 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.96 mfu: 49.54% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9851 +[titan] 2025-09-09 18:01:38,095 - root - INFO - lr: 6.7368e-06 gnorm: 0.39 [2 days, 0:26:09<1 day, 0:53:46] +[titan] 2025-09-09 18:02:10,202 - root - INFO - step: 26425 loss: 2.7568 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7904 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 18:02:10,202 - root - INFO - lr: 6.7336e-06 gnorm: 0.37 [2 days, 0:26:41<1 day, 0:53:13] +[titan] 2025-09-09 18:02:42,082 - root - INFO - step: 26430 loss: 2.8021 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.7979 
global_avg_top_loss: 2.0042 +[titan] 2025-09-09 18:02:42,082 - root - INFO - lr: 6.7305e-06 gnorm: 1.15 [2 days, 0:27:13<1 day, 0:52:39] +[titan] 2025-09-09 18:03:13,873 - root - INFO - step: 26435 loss: 2.8537 memory: 122.04GiB(87.57%) tps: 10,308 tflops: 491.25 mfu: 49.67% global_avg_ntp_loss: 0.8392 global_avg_top_loss: 2.0145 +[titan] 2025-09-09 18:03:13,873 - root - INFO - lr: 6.7273e-06 gnorm: 0.38 [2 days, 0:27:45<1 day, 0:52:06] +[titan] 2025-09-09 18:03:45,896 - root - INFO - step: 26440 loss: 2.6853 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 18:03:45,896 - root - INFO - lr: 6.7242e-06 gnorm: 0.38 [2 days, 0:28:17<1 day, 0:51:32] +[titan] 2025-09-09 18:04:17,896 - root - INFO - step: 26445 loss: 2.7965 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.8035 global_avg_top_loss: 1.9930 +[titan] 2025-09-09 18:04:17,897 - root - INFO - lr: 6.7211e-06 gnorm: 0.37 [2 days, 0:28:49<1 day, 0:50:59] +[titan] 2025-09-09 18:04:43,414 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:04:49,848 - root - INFO - step: 26450 loss: 2.7367 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.79 mfu: 49.42% global_avg_ntp_loss: 0.7727 global_avg_top_loss: 1.9640 +[titan] 2025-09-09 18:04:49,848 - root - INFO - lr: 6.7179e-06 gnorm: 0.36 [2 days, 0:29:21<1 day, 0:50:25] +[titan] 2025-09-09 18:05:21,850 - root - INFO - step: 26455 loss: 2.6576 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.35% global_avg_ntp_loss: 0.7378 global_avg_top_loss: 1.9199 +[titan] 2025-09-09 18:05:21,850 - root - INFO - lr: 6.7148e-06 gnorm: 0.37 [2 days, 0:29:53<1 day, 0:49:52] +[titan] 2025-09-09 18:05:53,703 - root - INFO - step: 26460 loss: 2.7638 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.29 mfu: 49.57% global_avg_ntp_loss: 0.7850 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 18:05:53,704 - root - INFO - lr: 6.7116e-06 gnorm: 0.79 [2 days, 0:30:25<1 day, 0:49:18] +[titan] 2025-09-09 18:06:25,662 - root - INFO - step: 26465 loss: 2.7646 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 18:06:25,662 - root - INFO - lr: 6.7085e-06 gnorm: 0.37 [2 days, 0:30:57<1 day, 0:48:44] +[titan] 2025-09-09 18:06:57,520 - root - INFO - step: 26470 loss: 2.6511 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7346 global_avg_top_loss: 1.9166 +[titan] 2025-09-09 18:06:57,520 - root - INFO - lr: 6.7054e-06 gnorm: 0.36 [2 days, 0:31:28<1 day, 0:48:11] +[titan] 2025-09-09 18:07:29,449 - root - INFO - step: 26475 loss: 2.7796 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.13 mfu: 49.46% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9842 +[titan] 2025-09-09 18:07:29,450 - root - INFO - lr: 6.7022e-06 gnorm: 0.39 [2 days, 0:32:00<1 day, 0:47:37] +[titan] 2025-09-09 18:08:01,415 - root - INFO - step: 26480 loss: 2.6602 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.56 mfu: 49.40% global_avg_ntp_loss: 0.7415 
global_avg_top_loss: 1.9188 +[titan] 2025-09-09 18:08:01,416 - root - INFO - lr: 6.6991e-06 gnorm: 0.40 [2 days, 0:32:32<1 day, 0:47:04] +[titan] 2025-09-09 18:08:33,352 - root - INFO - step: 26485 loss: 2.7112 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.7618 global_avg_top_loss: 1.9494 +[titan] 2025-09-09 18:08:33,353 - root - INFO - lr: 6.6959e-06 gnorm: 0.39 [2 days, 0:33:04<1 day, 0:46:30] +[titan] 2025-09-09 18:09:05,385 - root - INFO - step: 26490 loss: 2.7449 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.55 mfu: 49.30% global_avg_ntp_loss: 0.7773 global_avg_top_loss: 1.9675 +[titan] 2025-09-09 18:09:05,385 - root - INFO - lr: 6.6928e-06 gnorm: 0.36 [2 days, 0:33:36<1 day, 0:45:57] +[titan] 2025-09-09 18:09:37,198 - root - INFO - step: 26495 loss: 2.7375 memory: 122.04GiB(87.57%) tps: 10,300 tflops: 490.91 mfu: 49.64% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9619 +[titan] 2025-09-09 18:09:37,199 - root - INFO - lr: 6.6897e-06 gnorm: 0.36 [2 days, 0:34:08<1 day, 0:45:23] +[titan] 2025-09-09 18:10:02,774 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:10:09,221 - root - INFO - step: 26500 loss: 2.7651 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.70 mfu: 49.31% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9770 +[titan] 2025-09-09 18:10:09,221 - root - INFO - lr: 6.6865e-06 gnorm: 0.37 [2 days, 0:34:40<1 day, 0:44:50] +[titan] 2025-09-09 18:10:41,146 - root - INFO - step: 26505 loss: 2.7141 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7685 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 18:10:41,147 - root - INFO - lr: 6.6834e-06 gnorm: 0.39 [2 days, 0:35:12<1 day, 0:44:16] +[titan] 2025-09-09 18:11:13,150 - root - INFO - step: 26510 loss: 2.7420 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7779 global_avg_top_loss: 1.9641 +[titan] 2025-09-09 18:11:13,151 - root - INFO - lr: 6.6803e-06 gnorm: 0.37 [2 days, 0:35:44<1 day, 0:43:43] +[titan] 2025-09-09 18:11:45,192 - root - INFO - step: 26515 loss: 3.1672 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.41 mfu: 49.28% global_avg_ntp_loss: 1.0187 global_avg_top_loss: 2.1484 +[titan] 2025-09-09 18:11:45,192 - root - INFO - lr: 6.6771e-06 gnorm: 0.41 [2 days, 0:36:16<1 day, 0:43:09] +[titan] 2025-09-09 18:12:17,093 - root - INFO - step: 26520 loss: 2.6628 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.7401 global_avg_top_loss: 1.9227 +[titan] 2025-09-09 18:12:17,093 - root - INFO - lr: 6.6740e-06 gnorm: 0.37 [2 days, 0:36:48<1 day, 0:42:36] +[titan] 2025-09-09 18:12:49,050 - root - INFO - step: 26525 loss: 2.6877 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 18:12:49,050 - root - INFO - lr: 6.6709e-06 gnorm: 0.39 [2 days, 0:37:20<1 day, 0:42:02] +[titan] 2025-09-09 18:13:20,965 - root - INFO - step: 26530 loss: 2.7931 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.34 mfu: 49.48% global_avg_ntp_loss: 0.8094 
global_avg_top_loss: 1.9837 +[titan] 2025-09-09 18:13:20,965 - root - INFO - lr: 6.6678e-06 gnorm: 0.38 [2 days, 0:37:52<1 day, 0:41:29] +[titan] 2025-09-09 18:13:52,774 - root - INFO - step: 26535 loss: 2.5594 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.6996 global_avg_top_loss: 1.8598 +[titan] 2025-09-09 18:13:52,775 - root - INFO - lr: 6.6646e-06 gnorm: 0.39 [2 days, 0:38:24<1 day, 0:40:55] +[titan] 2025-09-09 18:14:24,706 - root - INFO - step: 26540 loss: 2.7469 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9688 +[titan] 2025-09-09 18:14:24,707 - root - INFO - lr: 6.6615e-06 gnorm: 0.36 [2 days, 0:38:56<1 day, 0:40:21] +[titan] 2025-09-09 18:14:56,756 - root - INFO - step: 26545 loss: 2.7816 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9832 +[titan] 2025-09-09 18:14:56,757 - root - INFO - lr: 6.6584e-06 gnorm: 0.36 [2 days, 0:39:28<1 day, 0:39:48] +[titan] 2025-09-09 18:15:22,337 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:15:28,768 - root - INFO - step: 26550 loss: 2.7103 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9497 +[titan] 2025-09-09 18:15:28,768 - root - INFO - lr: 6.6552e-06 gnorm: 0.39 [2 days, 0:40:00<1 day, 0:39:14] +[titan] 2025-09-09 18:16:00,543 - root - INFO - step: 26555 loss: 2.7149 memory: 122.04GiB(87.57%) tps: 10,313 tflops: 491.51 mfu: 49.70% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9536 +[titan] 2025-09-09 18:16:00,543 - root - INFO - lr: 6.6521e-06 gnorm: 0.37 [2 days, 0:40:31<1 day, 0:38:41] +[titan] 2025-09-09 18:16:32,527 - root - INFO - step: 26560 loss: 2.6688 memory: 122.04GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.7442 global_avg_top_loss: 1.9246 +[titan] 2025-09-09 18:16:32,528 - root - INFO - lr: 6.6490e-06 gnorm: 0.35 [2 days, 0:41:03<1 day, 0:38:07] +[titan] 2025-09-09 18:17:04,428 - root - INFO - step: 26565 loss: 3.0258 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 0.9468 global_avg_top_loss: 2.0790 +[titan] 2025-09-09 18:17:04,429 - root - INFO - lr: 6.6459e-06 gnorm: 0.36 [2 days, 0:41:35<1 day, 0:37:34] +[titan] 2025-09-09 18:17:36,436 - root - INFO - step: 26570 loss: 2.7843 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.8151 global_avg_top_loss: 1.9691 +[titan] 2025-09-09 18:17:36,436 - root - INFO - lr: 6.6427e-06 gnorm: 0.52 [2 days, 0:42:07<1 day, 0:37:00] +[titan] 2025-09-09 18:18:08,371 - root - INFO - step: 26575 loss: 2.7543 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7811 global_avg_top_loss: 1.9732 +[titan] 2025-09-09 18:18:08,371 - root - INFO - lr: 6.6396e-06 gnorm: 0.36 [2 days, 0:42:39<1 day, 0:36:27] +[titan] 2025-09-09 18:18:40,351 - root - INFO - step: 26580 loss: 2.5754 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7003 
global_avg_top_loss: 1.8751 +[titan] 2025-09-09 18:18:40,351 - root - INFO - lr: 6.6365e-06 gnorm: 0.38 [2 days, 0:43:11<1 day, 0:35:53] +[titan] 2025-09-09 18:19:12,027 - root - INFO - step: 26585 loss: 2.7384 memory: 122.04GiB(87.57%) tps: 10,345 tflops: 493.04 mfu: 49.85% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 18:19:12,027 - root - INFO - lr: 6.6334e-06 gnorm: 0.37 [2 days, 0:43:43<1 day, 0:35:20] +[titan] 2025-09-09 18:19:43,955 - root - INFO - step: 26590 loss: 2.6997 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.14 mfu: 49.46% global_avg_ntp_loss: 0.7598 global_avg_top_loss: 1.9399 +[titan] 2025-09-09 18:19:43,955 - root - INFO - lr: 6.6303e-06 gnorm: 0.37 [2 days, 0:44:15<1 day, 0:34:46] +[titan] 2025-09-09 18:20:15,942 - root - INFO - step: 26595 loss: 2.7426 memory: 122.04GiB(87.57%) tps: 10,245 tflops: 488.25 mfu: 49.37% global_avg_ntp_loss: 0.7755 global_avg_top_loss: 1.9671 +[titan] 2025-09-09 18:20:15,942 - root - INFO - lr: 6.6271e-06 gnorm: 0.38 [2 days, 0:44:47<1 day, 0:34:13] +[titan] 2025-09-09 18:20:41,522 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:20:47,897 - root - INFO - step: 26600 loss: 2.7555 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7882 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 18:20:47,897 - root - INFO - lr: 6.6240e-06 gnorm: 0.37 [2 days, 0:45:19<1 day, 0:33:39] +[titan] 2025-09-09 18:21:19,953 - root - INFO - step: 26605 loss: 2.7358 memory: 122.04GiB(87.57%) tps: 10,222 tflops: 487.19 mfu: 49.26% global_avg_ntp_loss: 0.7759 global_avg_top_loss: 1.9599 +[titan] 2025-09-09 18:21:19,954 - root - INFO - lr: 6.6209e-06 gnorm: 0.36 [2 days, 0:45:51<1 day, 0:33:06] +[titan] 2025-09-09 18:21:51,968 - root - INFO - step: 26610 loss: 2.7057 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9432 +[titan] 2025-09-09 18:21:51,969 - root - INFO - lr: 6.6178e-06 gnorm: 0.36 [2 days, 0:46:23<1 day, 0:32:32] +[titan] 2025-09-09 18:22:23,977 - root - INFO - step: 26615 loss: 2.6111 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.8934 +[titan] 2025-09-09 18:22:23,977 - root - INFO - lr: 6.6147e-06 gnorm: 0.36 [2 days, 0:46:55<1 day, 0:31:59] +[titan] 2025-09-09 18:22:55,882 - root - INFO - step: 26620 loss: 2.7573 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7826 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 18:22:55,883 - root - INFO - lr: 6.6115e-06 gnorm: 0.37 [2 days, 0:47:27<1 day, 0:31:25] +[titan] 2025-09-09 18:23:21,666 - root - INFO - Dumping profiler traces at step 26624 +[titan] 2025-09-09 18:23:21,726 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 18:23:28,076 - root - INFO - step: 26625 loss: 2.6706 memory: 122.04GiB(87.57%) tps: 10,179 tflops: 485.10 mfu: 49.05% global_avg_ntp_loss: 0.7474 global_avg_top_loss: 1.9232 +[titan] 2025-09-09 18:23:28,077 - root - INFO - lr: 6.6084e-06 gnorm: 0.36 [2 days, 0:47:59<1 day, 
0:30:52] +[titan] 2025-09-09 18:23:59,881 - root - INFO - step: 26630 loss: 2.6770 memory: 122.04GiB(87.57%) tps: 10,303 tflops: 491.05 mfu: 49.65% global_avg_ntp_loss: 0.7498 global_avg_top_loss: 1.9273 +[titan] 2025-09-09 18:23:59,881 - root - INFO - lr: 6.6053e-06 gnorm: 0.36 [2 days, 0:48:31<1 day, 0:30:18] +[titan] 2025-09-09 18:24:31,904 - root - INFO - step: 26635 loss: 2.7035 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 18:24:31,905 - root - INFO - lr: 6.6022e-06 gnorm: 0.35 [2 days, 0:49:03<1 day, 0:29:45] +[titan] 2025-09-09 18:25:03,947 - root - INFO - step: 26640 loss: 2.7697 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9778 +[titan] 2025-09-09 18:25:03,947 - root - INFO - lr: 6.5991e-06 gnorm: 0.36 [2 days, 0:49:35<1 day, 0:29:11] +[titan] 2025-09-09 18:25:35,714 - root - INFO - step: 26645 loss: 2.6643 memory: 122.04GiB(87.57%) tps: 10,315 tflops: 491.63 mfu: 49.71% global_avg_ntp_loss: 0.7427 global_avg_top_loss: 1.9217 +[titan] 2025-09-09 18:25:35,714 - root - INFO - lr: 6.5960e-06 gnorm: 0.36 [2 days, 0:50:07<1 day, 0:28:38] +[titan] 2025-09-09 18:26:01,148 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:26:07,551 - root - INFO - step: 26650 loss: 2.7395 memory: 122.04GiB(87.57%) tps: 10,293 tflops: 490.54 mfu: 49.60% global_avg_ntp_loss: 0.7801 global_avg_top_loss: 1.9594 +[titan] 2025-09-09 18:26:07,552 - root - INFO - lr: 6.5929e-06 gnorm: 0.37 [2 days, 0:50:38<1 day, 0:28:04] +[titan] 2025-09-09 18:26:39,577 - root - INFO - step: 26655 loss: 2.7005 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9450 +[titan] 2025-09-09 18:26:39,577 - root - INFO - lr: 6.5897e-06 gnorm: 0.36 [2 days, 0:51:11<1 day, 0:27:30] +[titan] 2025-09-09 18:27:11,661 - root - INFO - step: 26660 loss: 2.5784 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.76 mfu: 49.22% global_avg_ntp_loss: 0.7040 global_avg_top_loss: 1.8744 +[titan] 2025-09-09 18:27:11,662 - root - INFO - lr: 6.5866e-06 gnorm: 0.57 [2 days, 0:51:43<1 day, 0:26:57] +[titan] 2025-09-09 18:27:43,857 - root - INFO - step: 26665 loss: 2.7274 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7693 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 18:27:43,857 - root - INFO - lr: 6.5835e-06 gnorm: 0.50 [2 days, 0:52:15<1 day, 0:26:24] +[titan] 2025-09-09 18:28:15,625 - root - INFO - step: 26670 loss: 2.7489 memory: 122.04GiB(87.57%) tps: 10,315 tflops: 491.60 mfu: 49.71% global_avg_ntp_loss: 0.7797 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 18:28:15,626 - root - INFO - lr: 6.5804e-06 gnorm: 0.37 [2 days, 0:52:47<1 day, 0:25:50] +[titan] 2025-09-09 18:28:47,736 - root - INFO - step: 26675 loss: 2.6154 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.8938 +[titan] 2025-09-09 18:28:47,736 - root - INFO - lr: 6.5773e-06 gnorm: 0.37 [2 days, 0:53:19<1 day, 0:25:17] +[titan] 2025-09-09 18:29:19,704 - root - INFO - step: 26680 loss: 2.6659 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7406 
global_avg_top_loss: 1.9253 +[titan] 2025-09-09 18:29:19,705 - root - INFO - lr: 6.5742e-06 gnorm: 0.37 [2 days, 0:53:51<1 day, 0:24:43] +[titan] 2025-09-09 18:29:51,641 - root - INFO - step: 26685 loss: 2.7422 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7777 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 18:29:51,641 - root - INFO - lr: 6.5711e-06 gnorm: 0.37 [2 days, 0:54:23<1 day, 0:24:10] +[titan] 2025-09-09 18:30:23,887 - root - INFO - step: 26690 loss: 2.8764 memory: 122.04GiB(87.57%) tps: 10,162 tflops: 484.32 mfu: 48.97% global_avg_ntp_loss: 0.8292 global_avg_top_loss: 2.0471 +[titan] 2025-09-09 18:30:23,887 - root - INFO - lr: 6.5680e-06 gnorm: 0.42 [2 days, 0:54:55<1 day, 0:23:36] +[titan] 2025-09-09 18:30:55,929 - root - INFO - step: 26695 loss: 2.6844 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7501 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 18:30:55,930 - root - INFO - lr: 6.5649e-06 gnorm: 0.40 [2 days, 0:55:27<1 day, 0:23:03] +[titan] 2025-09-09 18:31:21,543 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:31:27,901 - root - INFO - step: 26700 loss: 2.7301 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.48 mfu: 49.39% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9588 +[titan] 2025-09-09 18:31:27,902 - root - INFO - lr: 6.5618e-06 gnorm: 0.39 [2 days, 0:55:59<1 day, 0:22:29] +[titan] 2025-09-09 18:31:59,878 - root - INFO - step: 26705 loss: 2.7289 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.40 mfu: 49.38% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 18:31:59,879 - root - INFO - lr: 6.5587e-06 gnorm: 0.39 [2 days, 0:56:31<1 day, 0:21:56] +[titan] 2025-09-09 18:32:31,845 - root - INFO - step: 26710 loss: 2.7364 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7807 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 18:32:31,846 - root - INFO - lr: 6.5556e-06 gnorm: 0.38 [2 days, 0:57:03<1 day, 0:21:22] +[titan] 2025-09-09 18:33:03,927 - root - INFO - step: 26715 loss: 2.6721 memory: 122.04GiB(87.57%) tps: 10,214 tflops: 486.81 mfu: 49.22% global_avg_ntp_loss: 0.7460 global_avg_top_loss: 1.9261 +[titan] 2025-09-09 18:33:03,927 - root - INFO - lr: 6.5525e-06 gnorm: 0.43 [2 days, 0:57:35<1 day, 0:20:49] +[titan] 2025-09-09 18:33:35,894 - root - INFO - step: 26720 loss: 2.6865 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.7515 global_avg_top_loss: 1.9351 +[titan] 2025-09-09 18:33:35,895 - root - INFO - lr: 6.5493e-06 gnorm: 0.39 [2 days, 0:58:07<1 day, 0:20:15] +[titan] 2025-09-09 18:34:07,755 - root - INFO - step: 26725 loss: 2.6890 memory: 122.04GiB(87.57%) tps: 10,285 tflops: 490.18 mfu: 49.56% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9345 +[titan] 2025-09-09 18:34:07,755 - root - INFO - lr: 6.5462e-06 gnorm: 0.39 [2 days, 0:58:39<1 day, 0:19:42] +[titan] 2025-09-09 18:34:39,712 - root - INFO - step: 26730 loss: 2.7492 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7823 
global_avg_top_loss: 1.9668 +[titan] 2025-09-09 18:34:39,713 - root - INFO - lr: 6.5431e-06 gnorm: 0.42 [2 days, 0:59:11<1 day, 0:19:08] +[titan] 2025-09-09 18:35:11,503 - root - INFO - step: 26735 loss: 2.7350 memory: 122.04GiB(87.57%) tps: 10,308 tflops: 491.26 mfu: 49.67% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 18:35:11,504 - root - INFO - lr: 6.5400e-06 gnorm: 0.35 [2 days, 0:59:42<1 day, 0:18:35] +[titan] 2025-09-09 18:35:43,530 - root - INFO - step: 26740 loss: 2.6251 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.8956 +[titan] 2025-09-09 18:35:43,530 - root - INFO - lr: 6.5369e-06 gnorm: 0.51 [2 days, 1:00:14<1 day, 0:18:01] +[titan] 2025-09-09 18:36:15,427 - root - INFO - step: 26745 loss: 2.7441 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.61 mfu: 49.51% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9669 +[titan] 2025-09-09 18:36:15,428 - root - INFO - lr: 6.5338e-06 gnorm: 0.44 [2 days, 1:00:46<1 day, 0:17:28] +[titan] 2025-09-09 18:36:40,904 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:36:47,249 - root - INFO - step: 26750 loss: 2.7808 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.7984 global_avg_top_loss: 1.9824 +[titan] 2025-09-09 18:36:47,249 - root - INFO - lr: 6.5307e-06 gnorm: 0.39 [2 days, 1:01:18<1 day, 0:16:54] +[titan] 2025-09-09 18:37:19,394 - root - INFO - step: 26755 loss: 2.7120 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.7616 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 18:37:19,395 - root - INFO - lr: 6.5276e-06 gnorm: 0.37 [2 days, 1:01:50<1 day, 0:16:21] +[titan] 2025-09-09 18:37:51,277 - root - INFO - step: 26760 loss: 2.7289 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.84 mfu: 49.53% global_avg_ntp_loss: 0.7720 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 18:37:51,278 - root - INFO - lr: 6.5245e-06 gnorm: 0.36 [2 days, 1:02:22<1 day, 0:15:47] +[titan] 2025-09-09 18:38:23,337 - root - INFO - step: 26765 loss: 2.9793 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.9055 global_avg_top_loss: 2.0738 +[titan] 2025-09-09 18:38:23,337 - root - INFO - lr: 6.5215e-06 gnorm: 0.38 [2 days, 1:02:54<1 day, 0:15:14] +[titan] 2025-09-09 18:38:55,034 - root - INFO - step: 26770 loss: 2.6750 memory: 122.04GiB(87.57%) tps: 10,338 tflops: 492.71 mfu: 49.82% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9288 +[titan] 2025-09-09 18:38:55,034 - root - INFO - lr: 6.5184e-06 gnorm: 0.48 [2 days, 1:03:26<1 day, 0:14:40] +[titan] 2025-09-09 18:39:27,104 - root - INFO - step: 26775 loss: 2.8126 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 486.97 mfu: 49.24% global_avg_ntp_loss: 0.8093 global_avg_top_loss: 2.0033 +[titan] 2025-09-09 18:39:27,105 - root - INFO - lr: 6.5153e-06 gnorm: 0.38 [2 days, 1:03:58<1 day, 0:14:07] +[titan] 2025-09-09 18:39:59,118 - root - INFO - step: 26780 loss: 2.7232 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7691 
global_avg_top_loss: 1.9541 +[titan] 2025-09-09 18:39:59,118 - root - INFO - lr: 6.5122e-06 gnorm: 0.38 [2 days, 1:04:30<1 day, 0:13:33] +[titan] 2025-09-09 18:40:30,999 - root - INFO - step: 26785 loss: 2.6975 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 18:40:30,999 - root - INFO - lr: 6.5091e-06 gnorm: 0.37 [2 days, 1:05:02<1 day, 0:13:00] +[titan] 2025-09-09 18:41:02,990 - root - INFO - step: 26790 loss: 2.8991 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8509 global_avg_top_loss: 2.0482 +[titan] 2025-09-09 18:41:02,990 - root - INFO - lr: 6.5060e-06 gnorm: 0.41 [2 days, 1:05:34<1 day, 0:12:26] +[titan] 2025-09-09 18:41:34,969 - root - INFO - step: 26795 loss: 2.7226 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7696 global_avg_top_loss: 1.9530 +[titan] 2025-09-09 18:41:34,969 - root - INFO - lr: 6.5029e-06 gnorm: 0.37 [2 days, 1:06:06<1 day, 0:11:53] +[titan] 2025-09-09 18:42:00,507 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:42:06,982 - root - INFO - step: 26800 loss: 2.7583 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7973 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 18:42:06,982 - root - INFO - lr: 6.4998e-06 gnorm: 0.45 [2 days, 1:06:38<1 day, 0:11:19] +[titan] 2025-09-09 18:42:38,835 - root - INFO - step: 26805 loss: 2.6549 memory: 122.04GiB(87.57%) tps: 10,288 tflops: 490.30 mfu: 49.58% global_avg_ntp_loss: 0.7381 global_avg_top_loss: 1.9168 +[titan] 2025-09-09 18:42:38,835 - root - INFO - lr: 6.4967e-06 gnorm: 0.35 [2 days, 1:07:10<1 day, 0:10:46] +[titan] 2025-09-09 18:43:10,862 - root - INFO - step: 26810 loss: 2.7149 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 0.7648 global_avg_top_loss: 1.9501 +[titan] 2025-09-09 18:43:10,862 - root - INFO - lr: 6.4936e-06 gnorm: 0.35 [2 days, 1:07:42<1 day, 0:10:12] +[titan] 2025-09-09 18:43:43,062 - root - INFO - step: 26815 loss: 2.7017 memory: 122.04GiB(87.57%) tps: 10,177 tflops: 485.02 mfu: 49.04% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9426 +[titan] 2025-09-09 18:43:43,062 - root - INFO - lr: 6.4905e-06 gnorm: 0.36 [2 days, 1:08:14<1 day, 0:09:39] +[titan] 2025-09-09 18:44:14,872 - root - INFO - step: 26820 loss: 2.6268 memory: 122.04GiB(87.57%) tps: 10,301 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.9046 +[titan] 2025-09-09 18:44:14,872 - root - INFO - lr: 6.4874e-06 gnorm: 0.55 [2 days, 1:08:46<1 day, 0:09:05] +[titan] 2025-09-09 18:44:46,895 - root - INFO - step: 26825 loss: 2.6928 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7572 global_avg_top_loss: 1.9356 +[titan] 2025-09-09 18:44:46,895 - root - INFO - lr: 6.4843e-06 gnorm: 0.38 [2 days, 1:09:18<1 day, 0:08:32] +[titan] 2025-09-09 18:45:18,819 - root - INFO - step: 26830 loss: 2.6896 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.21 mfu: 49.46% global_avg_ntp_loss: 0.7526 
global_avg_top_loss: 1.9371 +[titan] 2025-09-09 18:45:18,820 - root - INFO - lr: 6.4813e-06 gnorm: 0.37 [2 days, 1:09:50<1 day, 0:07:58] +[titan] 2025-09-09 18:45:50,797 - root - INFO - step: 26835 loss: 2.6914 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.7555 global_avg_top_loss: 1.9359 +[titan] 2025-09-09 18:45:50,797 - root - INFO - lr: 6.4782e-06 gnorm: 0.36 [2 days, 1:10:22<1 day, 0:07:25] +[titan] 2025-09-09 18:46:22,809 - root - INFO - step: 26840 loss: 2.7312 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7715 global_avg_top_loss: 1.9596 +[titan] 2025-09-09 18:46:22,809 - root - INFO - lr: 6.4751e-06 gnorm: 0.37 [2 days, 1:10:54<1 day, 0:06:51] +[titan] 2025-09-09 18:46:54,845 - root - INFO - step: 26845 loss: 2.7830 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7965 global_avg_top_loss: 1.9865 +[titan] 2025-09-09 18:46:54,845 - root - INFO - lr: 6.4720e-06 gnorm: 0.37 [2 days, 1:11:26<1 day, 0:06:18] +[titan] 2025-09-09 18:47:20,438 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:47:26,953 - root - INFO - step: 26850 loss: 2.8378 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.8144 global_avg_top_loss: 2.0234 +[titan] 2025-09-09 18:47:26,953 - root - INFO - lr: 6.4689e-06 gnorm: 1.00 [2 days, 1:11:58<1 day, 0:05:45] +[titan] 2025-09-09 18:47:58,978 - root - INFO - step: 26855 loss: 3.2085 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.66 mfu: 49.31% global_avg_ntp_loss: 1.0421 global_avg_top_loss: 2.1664 +[titan] 2025-09-09 18:47:58,978 - root - INFO - lr: 6.4658e-06 gnorm: 0.42 [2 days, 1:12:30<1 day, 0:05:11] +[titan] 2025-09-09 18:48:30,675 - root - INFO - step: 26860 loss: 2.6858 memory: 122.04GiB(87.57%) tps: 10,338 tflops: 492.71 mfu: 49.82% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 18:48:30,676 - root - INFO - lr: 6.4627e-06 gnorm: 0.36 [2 days, 1:13:02<1 day, 0:04:38] +[titan] 2025-09-09 18:49:02,689 - root - INFO - step: 26865 loss: 2.5476 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.6916 global_avg_top_loss: 1.8560 +[titan] 2025-09-09 18:49:02,690 - root - INFO - lr: 6.4597e-06 gnorm: 0.35 [2 days, 1:13:34<1 day, 0:04:04] +[titan] 2025-09-09 18:49:34,520 - root - INFO - step: 26870 loss: 2.6289 memory: 122.04GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.8976 +[titan] 2025-09-09 18:49:34,520 - root - INFO - lr: 6.4566e-06 gnorm: 0.36 [2 days, 1:14:05<1 day, 0:03:31] +[titan] 2025-09-09 18:50:06,556 - root - INFO - step: 26875 loss: 2.7057 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.50 mfu: 49.29% global_avg_ntp_loss: 0.7613 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 18:50:06,556 - root - INFO - lr: 6.4535e-06 gnorm: 0.38 [2 days, 1:14:37<1 day, 0:02:57] +[titan] 2025-09-09 18:50:38,478 - root - INFO - step: 26880 loss: 2.8484 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.23 mfu: 49.47% global_avg_ntp_loss: 0.8308 
global_avg_top_loss: 2.0176 +[titan] 2025-09-09 18:50:38,479 - root - INFO - lr: 6.4504e-06 gnorm: 0.37 [2 days, 1:15:09<1 day, 0:02:24] +[titan] 2025-09-09 18:51:10,479 - root - INFO - step: 26885 loss: 2.6820 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9331 +[titan] 2025-09-09 18:51:10,480 - root - INFO - lr: 6.4473e-06 gnorm: 0.38 [2 days, 1:15:41<1 day, 0:01:50] +[titan] 2025-09-09 18:51:42,515 - root - INFO - step: 26890 loss: 2.7534 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7844 global_avg_top_loss: 1.9691 +[titan] 2025-09-09 18:51:42,515 - root - INFO - lr: 6.4443e-06 gnorm: 0.37 [2 days, 1:16:13<1 day, 0:01:17] +[titan] 2025-09-09 18:52:14,305 - root - INFO - step: 26895 loss: 2.6009 memory: 122.04GiB(87.57%) tps: 10,308 tflops: 491.26 mfu: 49.67% global_avg_ntp_loss: 0.7154 global_avg_top_loss: 1.8856 +[titan] 2025-09-09 18:52:14,306 - root - INFO - lr: 6.4412e-06 gnorm: 0.35 [2 days, 1:16:45<1 day, 0:00:43] +[titan] 2025-09-09 18:52:40,051 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:52:46,410 - root - INFO - step: 26900 loss: 2.8136 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.45 mfu: 49.19% global_avg_ntp_loss: 0.8116 global_avg_top_loss: 2.0020 +[titan] 2025-09-09 18:52:46,411 - root - INFO - lr: 6.4381e-06 gnorm: 0.38 [2 days, 1:17:17<1 day, 0:00:10] +[titan] 2025-09-09 18:53:18,400 - root - INFO - step: 26905 loss: 2.7392 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7787 global_avg_top_loss: 1.9605 +[titan] 2025-09-09 18:53:18,400 - root - INFO - lr: 6.4350e-06 gnorm: 0.38 [2 days, 1:17:49<23:59:36] +[titan] 2025-09-09 18:53:50,345 - root - INFO - step: 26910 loss: 2.7703 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7903 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 18:53:50,345 - root - INFO - lr: 6.4320e-06 gnorm: 0.37 [2 days, 1:18:21<23:59:03] +[titan] 2025-09-09 18:54:22,055 - root - INFO - step: 26915 loss: 2.6668 memory: 122.04GiB(87.57%) tps: 10,334 tflops: 492.50 mfu: 49.80% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9232 +[titan] 2025-09-09 18:54:22,056 - root - INFO - lr: 6.4289e-06 gnorm: 0.39 [2 days, 1:18:53<23:58:29] +[titan] 2025-09-09 18:54:53,974 - root - INFO - step: 26920 loss: 2.7720 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7904 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 18:54:53,975 - root - INFO - lr: 6.4258e-06 gnorm: 0.37 [2 days, 1:19:25<23:57:56] +[titan] 2025-09-09 18:55:26,066 - root - INFO - step: 26925 loss: 2.7029 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9431 +[titan] 2025-09-09 18:55:26,066 - root - INFO - lr: 6.4227e-06 gnorm: 0.40 [2 days, 1:19:57<23:57:22] +[titan] 2025-09-09 18:55:58,132 - root - INFO - step: 26930 loss: 2.7287 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7702 global_avg_top_loss: 1.9585 +[titan] 
2025-09-09 18:55:58,132 - root - INFO - lr: 6.4197e-06 gnorm: 0.39 [2 days, 1:20:29<23:56:49] +[titan] 2025-09-09 18:56:30,182 - root - INFO - step: 26935 loss: 2.8694 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.8479 global_avg_top_loss: 2.0215 +[titan] 2025-09-09 18:56:30,183 - root - INFO - lr: 6.4166e-06 gnorm: 0.38 [2 days, 1:21:01<23:56:15] +[titan] 2025-09-09 18:57:02,151 - root - INFO - step: 26940 loss: 2.7370 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 18:57:02,152 - root - INFO - lr: 6.4135e-06 gnorm: 0.37 [2 days, 1:21:33<23:55:42] +[titan] 2025-09-09 18:57:34,348 - root - INFO - step: 26945 loss: 2.7522 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.05 mfu: 49.04% global_avg_ntp_loss: 0.7980 global_avg_top_loss: 1.9542 +[titan] 2025-09-09 18:57:34,349 - root - INFO - lr: 6.4104e-06 gnorm: 0.38 [2 days, 1:22:05<23:55:09] +[titan] 2025-09-09 18:57:59,826 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 18:58:06,251 - root - INFO - step: 26950 loss: 3.0763 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.9866 global_avg_top_loss: 2.0897 +[titan] 2025-09-09 18:58:06,252 - root - INFO - lr: 6.4074e-06 gnorm: 0.43 [2 days, 1:22:37<23:54:35] +[titan] 2025-09-09 18:58:38,209 - root - INFO - step: 26955 loss: 2.6407 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 18:58:38,210 - root - INFO - lr: 6.4043e-06 gnorm: 0.36 [2 days, 1:23:09<23:54:02] +[titan] 2025-09-09 18:59:10,199 - root - INFO - step: 26960 loss: 2.6080 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7167 global_avg_top_loss: 1.8912 +[titan] 2025-09-09 18:59:10,200 - root - INFO - lr: 6.4012e-06 gnorm: 0.37 [2 days, 1:23:41<23:53:28] +[titan] 2025-09-09 18:59:42,158 - root - INFO - step: 26965 loss: 2.6799 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7459 global_avg_top_loss: 1.9340 +[titan] 2025-09-09 18:59:42,158 - root - INFO - lr: 6.3982e-06 gnorm: 0.36 [2 days, 1:24:13<23:52:55] +[titan] 2025-09-09 19:00:14,177 - root - INFO - step: 26970 loss: 2.6982 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 19:00:14,177 - root - INFO - lr: 6.3951e-06 gnorm: 0.36 [2 days, 1:24:45<23:52:21] +[titan] 2025-09-09 19:00:46,238 - root - INFO - step: 26975 loss: 2.6967 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7569 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 19:00:46,238 - root - INFO - lr: 6.3920e-06 gnorm: 0.37 [2 days, 1:25:17<23:51:48] +[titan] 2025-09-09 19:01:18,133 - root - INFO - step: 26980 loss: 2.8244 memory: 122.04GiB(87.57%) tps: 10,274 tflops: 489.65 mfu: 49.51% global_avg_ntp_loss: 0.8155 global_avg_top_loss: 2.0090 +[titan] 
2025-09-09 19:01:18,133 - root - INFO - lr: 6.3890e-06 gnorm: 0.37 [2 days, 1:25:49<23:51:14] +[titan] 2025-09-09 19:01:49,981 - root - INFO - step: 26985 loss: 2.7032 memory: 122.04GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7608 global_avg_top_loss: 1.9424 +[titan] 2025-09-09 19:01:49,981 - root - INFO - lr: 6.3859e-06 gnorm: 0.37 [2 days, 1:26:21<23:50:41] +[titan] 2025-09-09 19:02:22,023 - root - INFO - step: 26990 loss: 2.7053 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.40 mfu: 49.28% global_avg_ntp_loss: 0.7472 global_avg_top_loss: 1.9582 +[titan] 2025-09-09 19:02:22,024 - root - INFO - lr: 6.3828e-06 gnorm: 1.18 [2 days, 1:26:53<23:50:07] +[titan] 2025-09-09 19:02:54,069 - root - INFO - step: 26995 loss: 2.6801 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7467 global_avg_top_loss: 1.9334 +[titan] 2025-09-09 19:02:54,070 - root - INFO - lr: 6.3798e-06 gnorm: 0.35 [2 days, 1:27:25<23:49:34] +[titan] 2025-09-09 19:03:19,665 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:03:26,079 - root - INFO - step: 27000 loss: 2.6747 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7454 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 19:03:26,080 - root - INFO - lr: 6.3767e-06 gnorm: 0.36 [2 days, 1:27:57<23:49:00] +[titan] 2025-09-09 19:03:57,995 - root - INFO - step: 27005 loss: 3.0726 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.9287 global_avg_top_loss: 2.1439 +[titan] 2025-09-09 19:03:57,996 - root - INFO - lr: 6.3736e-06 gnorm: 0.39 [2 days, 1:28:29<23:48:27] +[titan] 2025-09-09 19:04:30,067 - root - INFO - step: 27010 loss: 2.7375 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 19:04:30,068 - root - INFO - lr: 6.3706e-06 gnorm: 0.37 [2 days, 1:29:01<23:47:54] +[titan] 2025-09-09 19:05:02,102 - root - INFO - step: 27015 loss: 2.8582 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.8349 global_avg_top_loss: 2.0232 +[titan] 2025-09-09 19:05:02,103 - root - INFO - lr: 6.3675e-06 gnorm: 0.38 [2 days, 1:29:33<23:47:20] +[titan] 2025-09-09 19:05:34,130 - root - INFO - step: 27020 loss: 2.6838 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.62 mfu: 49.30% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 19:05:34,131 - root - INFO - lr: 6.3645e-06 gnorm: 0.36 [2 days, 1:30:05<23:46:47] +[titan] 2025-09-09 19:06:06,082 - root - INFO - step: 27025 loss: 2.6730 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.78 mfu: 49.42% global_avg_ntp_loss: 0.7428 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 19:06:06,083 - root - INFO - lr: 6.3614e-06 gnorm: 0.38 [2 days, 1:30:37<23:46:13] +[titan] 2025-09-09 19:06:37,883 - root - INFO - step: 27030 loss: 3.1885 memory: 122.04GiB(87.57%) tps: 10,304 tflops: 491.10 mfu: 49.66% global_avg_ntp_loss: 1.0375 global_avg_top_loss: 2.1509 +[titan] 
2025-09-09 19:06:37,884 - root - INFO - lr: 6.3583e-06 gnorm: 0.36 [2 days, 1:31:09<23:45:40] +[titan] 2025-09-09 19:07:09,881 - root - INFO - step: 27035 loss: 2.7117 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7632 global_avg_top_loss: 1.9485 +[titan] 2025-09-09 19:07:09,882 - root - INFO - lr: 6.3553e-06 gnorm: 0.38 [2 days, 1:31:41<23:45:06] +[titan] 2025-09-09 19:07:41,905 - root - INFO - step: 27040 loss: 2.6774 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 19:07:41,906 - root - INFO - lr: 6.3522e-06 gnorm: 0.37 [2 days, 1:32:13<23:44:33] +[titan] 2025-09-09 19:08:13,946 - root - INFO - step: 27045 loss: 2.7450 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.29% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9694 +[titan] 2025-09-09 19:08:13,946 - root - INFO - lr: 6.3492e-06 gnorm: 0.40 [2 days, 1:32:45<23:43:59] +[titan] 2025-09-09 19:08:39,584 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:08:46,031 - root - INFO - step: 27050 loss: 2.6478 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.75 mfu: 49.22% global_avg_ntp_loss: 0.7343 global_avg_top_loss: 1.9134 +[titan] 2025-09-09 19:08:46,031 - root - INFO - lr: 6.3461e-06 gnorm: 0.42 [2 days, 1:33:17<23:43:26] +[titan] 2025-09-09 19:09:17,964 - root - INFO - step: 27055 loss: 2.7008 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.7598 global_avg_top_loss: 1.9409 +[titan] 2025-09-09 19:09:17,965 - root - INFO - lr: 6.3431e-06 gnorm: 0.39 [2 days, 1:33:49<23:42:53] +[titan] 2025-09-09 19:09:50,017 - root - INFO - step: 27060 loss: 2.7380 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.27% global_avg_ntp_loss: 0.7725 global_avg_top_loss: 1.9655 +[titan] 2025-09-09 19:09:50,018 - root - INFO - lr: 6.3400e-06 gnorm: 0.38 [2 days, 1:34:21<23:42:19] +[titan] 2025-09-09 19:10:22,223 - root - INFO - step: 27065 loss: 2.6734 memory: 122.04GiB(87.57%) tps: 10,175 tflops: 484.93 mfu: 49.03% global_avg_ntp_loss: 0.7467 global_avg_top_loss: 1.9267 +[titan] 2025-09-09 19:10:22,223 - root - INFO - lr: 6.3369e-06 gnorm: 0.39 [2 days, 1:34:53<23:41:46] +[titan] 2025-09-09 19:10:54,268 - root - INFO - step: 27070 loss: 2.6952 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7551 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 19:10:54,268 - root - INFO - lr: 6.3339e-06 gnorm: 0.36 [2 days, 1:35:25<23:41:12] +[titan] 2025-09-09 19:11:26,435 - root - INFO - step: 27075 loss: 2.7463 memory: 122.04GiB(87.57%) tps: 10,187 tflops: 485.50 mfu: 49.09% global_avg_ntp_loss: 0.7781 global_avg_top_loss: 1.9681 +[titan] 2025-09-09 19:11:26,436 - root - INFO - lr: 6.3308e-06 gnorm: 0.42 [2 days, 1:35:57<23:40:39] +[titan] 2025-09-09 19:11:58,375 - root - INFO - step: 27080 loss: 2.7435 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7763 global_avg_top_loss: 1.9671 +[titan] 
2025-09-09 19:11:58,375 - root - INFO - lr: 6.3278e-06 gnorm: 0.39 [2 days, 1:36:29<23:40:06] +[titan] 2025-09-09 19:12:30,534 - root - INFO - step: 27085 loss: 2.6053 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.7191 global_avg_top_loss: 1.8862 +[titan] 2025-09-09 19:12:30,534 - root - INFO - lr: 6.3247e-06 gnorm: 0.37 [2 days, 1:37:01<23:39:32] +[titan] 2025-09-09 19:13:02,558 - root - INFO - step: 27090 loss: 2.7293 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.7670 global_avg_top_loss: 1.9623 +[titan] 2025-09-09 19:13:02,559 - root - INFO - lr: 6.3217e-06 gnorm: 0.37 [2 days, 1:37:33<23:38:59] +[titan] 2025-09-09 19:13:34,565 - root - INFO - step: 27095 loss: 3.6952 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 1.3170 global_avg_top_loss: 2.3782 +[titan] 2025-09-09 19:13:34,565 - root - INFO - lr: 6.3186e-06 gnorm: 0.40 [2 days, 1:38:05<23:38:25] +[titan] 2025-09-09 19:14:00,107 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:14:06,437 - root - INFO - step: 27100 loss: 2.7371 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.7732 global_avg_top_loss: 1.9639 +[titan] 2025-09-09 19:14:06,437 - root - INFO - lr: 6.3156e-06 gnorm: 0.39 [2 days, 1:38:37<23:37:52] +[titan] 2025-09-09 19:14:38,535 - root - INFO - step: 27105 loss: 2.7704 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 0.7909 global_avg_top_loss: 1.9795 +[titan] 2025-09-09 19:14:38,535 - root - INFO - lr: 6.3125e-06 gnorm: 0.40 [2 days, 1:39:09<23:37:18] +[titan] 2025-09-09 19:15:10,418 - root - INFO - step: 27110 loss: 3.1899 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 1.0342 global_avg_top_loss: 2.1557 +[titan] 2025-09-09 19:15:10,419 - root - INFO - lr: 6.3095e-06 gnorm: 0.37 [2 days, 1:39:41<23:36:45] +[titan] 2025-09-09 19:15:42,484 - root - INFO - step: 27115 loss: 2.9980 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.9298 global_avg_top_loss: 2.0682 +[titan] 2025-09-09 19:15:42,484 - root - INFO - lr: 6.3064e-06 gnorm: 0.36 [2 days, 1:40:13<23:36:12] +[titan] 2025-09-09 19:16:14,529 - root - INFO - step: 27120 loss: 2.7013 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 19:16:14,529 - root - INFO - lr: 6.3034e-06 gnorm: 0.36 [2 days, 1:40:45<23:35:38] +[titan] 2025-09-09 19:16:46,604 - root - INFO - step: 27125 loss: 2.7012 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7581 global_avg_top_loss: 1.9430 +[titan] 2025-09-09 19:16:46,605 - root - INFO - lr: 6.3004e-06 gnorm: 0.36 [2 days, 1:41:17<23:35:05] +[titan] 2025-09-09 19:17:18,649 - root - INFO - step: 27130 loss: 2.7045 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.37 mfu: 49.28% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9391 +[titan] 
2025-09-09 19:17:18,649 - root - INFO - lr: 6.2973e-06 gnorm: 0.38 [2 days, 1:41:49<23:34:31] +[titan] 2025-09-09 19:17:50,655 - root - INFO - step: 27135 loss: 2.9253 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.95 mfu: 49.34% global_avg_ntp_loss: 0.8866 global_avg_top_loss: 2.0387 +[titan] 2025-09-09 19:17:50,655 - root - INFO - lr: 6.2943e-06 gnorm: 0.37 [2 days, 1:42:21<23:33:58] +[titan] 2025-09-09 19:17:57,331 - root - INFO - Dumping profiler traces at step 27136 +[titan] 2025-09-09 19:17:57,401 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 19:18:22,920 - root - INFO - step: 27140 loss: 2.6909 memory: 122.04GiB(87.57%) tps: 10,156 tflops: 484.04 mfu: 48.94% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9369 +[titan] 2025-09-09 19:18:22,920 - root - INFO - lr: 6.2912e-06 gnorm: 0.43 [2 days, 1:42:54<23:33:25] +[titan] 2025-09-09 19:18:55,015 - root - INFO - step: 27145 loss: 2.6698 memory: 122.04GiB(87.57%) tps: 10,210 tflops: 486.59 mfu: 49.20% global_avg_ntp_loss: 0.7440 global_avg_top_loss: 1.9258 +[titan] 2025-09-09 19:18:55,015 - root - INFO - lr: 6.2882e-06 gnorm: 0.40 [2 days, 1:43:26<23:32:51] +[titan] 2025-09-09 19:19:20,804 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:19:27,192 - root - INFO - step: 27150 loss: 2.7734 memory: 122.04GiB(87.57%) tps: 10,184 tflops: 485.37 mfu: 49.08% global_avg_ntp_loss: 0.7875 global_avg_top_loss: 1.9859 +[titan] 2025-09-09 19:19:27,192 - root - INFO - lr: 6.2851e-06 gnorm: 0.38 [2 days, 1:43:58<23:32:18] +[titan] 2025-09-09 19:19:59,034 - root - INFO - step: 27155 loss: 2.6800 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.47 mfu: 49.59% global_avg_ntp_loss: 0.7478 global_avg_top_loss: 1.9323 +[titan] 2025-09-09 19:19:59,034 - root - INFO - lr: 6.2821e-06 gnorm: 0.40 [2 days, 1:44:30<23:31:44] +[titan] 2025-09-09 19:20:31,023 - root - INFO - step: 27160 loss: 2.7391 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9645 +[titan] 2025-09-09 19:20:31,024 - root - INFO - lr: 6.2791e-06 gnorm: 0.36 [2 days, 1:45:02<23:31:11] +[titan] 2025-09-09 19:21:02,982 - root - INFO - step: 27165 loss: 3.0957 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.9706 global_avg_top_loss: 2.1251 +[titan] 2025-09-09 19:21:02,982 - root - INFO - lr: 6.2760e-06 gnorm: 0.37 [2 days, 1:45:34<23:30:37] +[titan] 2025-09-09 19:21:34,913 - root - INFO - step: 27170 loss: 2.6701 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.10 mfu: 49.45% global_avg_ntp_loss: 0.7452 global_avg_top_loss: 1.9249 +[titan] 2025-09-09 19:21:34,913 - root - INFO - lr: 6.2730e-06 gnorm: 0.37 [2 days, 1:46:06<23:30:04] +[titan] 2025-09-09 19:22:06,814 - root - INFO - step: 27175 loss: 3.6108 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.56 mfu: 49.50% global_avg_ntp_loss: 1.2809 global_avg_top_loss: 2.3299 +[titan] 2025-09-09 19:22:06,814 - root - INFO - lr: 6.2700e-06 gnorm: 0.44 [2 days, 1:46:38<23:29:30] +[titan] 2025-09-09 19:22:38,918 - root - INFO - step: 27180 loss: 2.7138 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7626 global_avg_top_loss: 1.9512 +[titan] 
2025-09-09 19:22:38,918 - root - INFO - lr: 6.2669e-06 gnorm: 0.40 [2 days, 1:47:10<23:28:57] +[titan] 2025-09-09 19:23:11,081 - root - INFO - step: 27185 loss: 2.7029 memory: 122.04GiB(87.57%) tps: 10,188 tflops: 485.57 mfu: 49.10% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9430 +[titan] 2025-09-09 19:23:11,082 - root - INFO - lr: 6.2639e-06 gnorm: 0.39 [2 days, 1:47:42<23:28:24] +[titan] 2025-09-09 19:23:43,028 - root - INFO - step: 27190 loss: 2.7070 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.85 mfu: 49.43% global_avg_ntp_loss: 0.7610 global_avg_top_loss: 1.9460 +[titan] 2025-09-09 19:23:43,029 - root - INFO - lr: 6.2608e-06 gnorm: 0.37 [2 days, 1:48:14<23:27:50] +[titan] 2025-09-09 19:24:15,006 - root - INFO - step: 27195 loss: 2.6947 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9398 +[titan] 2025-09-09 19:24:15,006 - root - INFO - lr: 6.2578e-06 gnorm: 0.37 [2 days, 1:48:46<23:27:17] +[titan] 2025-09-09 19:24:40,596 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:24:47,045 - root - INFO - step: 27200 loss: 2.7372 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9625 +[titan] 2025-09-09 19:24:47,046 - root - INFO - lr: 6.2548e-06 gnorm: 0.41 [2 days, 1:49:18<23:26:43] +[titan] 2025-09-09 19:25:19,281 - root - INFO - step: 27205 loss: 2.6635 memory: 122.04GiB(87.57%) tps: 10,166 tflops: 484.48 mfu: 48.99% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9211 +[titan] 2025-09-09 19:25:19,281 - root - INFO - lr: 6.2517e-06 gnorm: 0.37 [2 days, 1:49:50<23:26:10] +[titan] 2025-09-09 19:25:51,302 - root - INFO - step: 27210 loss: 2.7562 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 0.7837 global_avg_top_loss: 1.9725 +[titan] 2025-09-09 19:25:51,302 - root - INFO - lr: 6.2487e-06 gnorm: 0.45 [2 days, 1:50:22<23:25:37] +[titan] 2025-09-09 19:26:23,452 - root - INFO - step: 27215 loss: 2.6512 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.77 mfu: 49.12% global_avg_ntp_loss: 0.7339 global_avg_top_loss: 1.9173 +[titan] 2025-09-09 19:26:23,452 - root - INFO - lr: 6.2457e-06 gnorm: 0.36 [2 days, 1:50:54<23:25:03] +[titan] 2025-09-09 19:26:55,409 - root - INFO - step: 27220 loss: 2.6973 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.70 mfu: 49.41% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9415 +[titan] 2025-09-09 19:26:55,409 - root - INFO - lr: 6.2426e-06 gnorm: 0.36 [2 days, 1:51:26<23:24:30] +[titan] 2025-09-09 19:27:27,275 - root - INFO - step: 27225 loss: 2.6883 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7520 global_avg_top_loss: 1.9364 +[titan] 2025-09-09 19:27:27,276 - root - INFO - lr: 6.2396e-06 gnorm: 0.41 [2 days, 1:51:58<23:23:56] +[titan] 2025-09-09 19:27:59,387 - root - INFO - step: 27230 loss: 2.7478 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.35 mfu: 49.18% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9704 +[titan] 
2025-09-09 19:27:59,387 - root - INFO - lr: 6.2366e-06 gnorm: 0.38 [2 days, 1:52:30<23:23:23] +[titan] 2025-09-09 19:28:31,434 - root - INFO - step: 27235 loss: 2.6302 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.27% global_avg_ntp_loss: 0.7288 global_avg_top_loss: 1.9014 +[titan] 2025-09-09 19:28:31,434 - root - INFO - lr: 6.2336e-06 gnorm: 0.40 [2 days, 1:53:02<23:22:50] +[titan] 2025-09-09 19:29:03,280 - root - INFO - step: 27240 loss: 2.6055 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.8919 +[titan] 2025-09-09 19:29:03,280 - root - INFO - lr: 6.2305e-06 gnorm: 0.38 [2 days, 1:53:34<23:22:16] +[titan] 2025-09-09 19:29:35,315 - root - INFO - step: 27245 loss: 2.7772 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7943 global_avg_top_loss: 1.9829 +[titan] 2025-09-09 19:29:35,315 - root - INFO - lr: 6.2275e-06 gnorm: 0.38 [2 days, 1:54:06<23:21:43] +[titan] 2025-09-09 19:30:00,910 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:30:07,285 - root - INFO - step: 27250 loss: 2.7578 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.50 mfu: 49.39% global_avg_ntp_loss: 0.7853 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 19:30:07,285 - root - INFO - lr: 6.2245e-06 gnorm: 0.40 [2 days, 1:54:38<23:21:09] +[titan] 2025-09-09 19:30:39,306 - root - INFO - step: 27255 loss: 3.7046 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.3229 global_avg_top_loss: 2.3816 +[titan] 2025-09-09 19:30:39,307 - root - INFO - lr: 6.2215e-06 gnorm: 0.37 [2 days, 1:55:10<23:20:36] +[titan] 2025-09-09 19:31:11,557 - root - INFO - step: 27260 loss: 2.6905 memory: 122.04GiB(87.57%) tps: 10,161 tflops: 484.26 mfu: 48.96% global_avg_ntp_loss: 0.7532 global_avg_top_loss: 1.9373 +[titan] 2025-09-09 19:31:11,557 - root - INFO - lr: 6.2184e-06 gnorm: 0.40 [2 days, 1:55:42<23:20:03] +[titan] 2025-09-09 19:31:43,626 - root - INFO - step: 27265 loss: 2.7020 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9457 +[titan] 2025-09-09 19:31:43,626 - root - INFO - lr: 6.2154e-06 gnorm: 0.38 [2 days, 1:56:14<23:19:29] +[titan] 2025-09-09 19:32:15,717 - root - INFO - step: 27270 loss: 3.1301 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 1.0078 global_avg_top_loss: 2.1223 +[titan] 2025-09-09 19:32:15,717 - root - INFO - lr: 6.2124e-06 gnorm: 0.39 [2 days, 1:56:47<23:18:56] +[titan] 2025-09-09 19:32:47,847 - root - INFO - step: 27275 loss: 2.7305 memory: 122.04GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7718 global_avg_top_loss: 1.9586 +[titan] 2025-09-09 19:32:47,848 - root - INFO - lr: 6.2094e-06 gnorm: 0.37 [2 days, 1:57:19<23:18:22] +[titan] 2025-09-09 19:33:19,745 - root - INFO - step: 27280 loss: 2.7766 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.7970 global_avg_top_loss: 1.9796 +[titan] 
2025-09-09 19:33:19,746 - root - INFO - lr: 6.2063e-06 gnorm: 0.43 [2 days, 1:57:51<23:17:49] +[titan] 2025-09-09 19:33:51,888 - root - INFO - step: 27285 loss: 2.7533 memory: 122.04GiB(87.57%) tps: 10,195 tflops: 485.88 mfu: 49.13% global_avg_ntp_loss: 0.7798 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 19:33:51,888 - root - INFO - lr: 6.2033e-06 gnorm: 0.37 [2 days, 1:58:23<23:17:16] +[titan] 2025-09-09 19:34:23,928 - root - INFO - step: 27290 loss: 2.9562 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.29% global_avg_ntp_loss: 0.9042 global_avg_top_loss: 2.0520 +[titan] 2025-09-09 19:34:23,928 - root - INFO - lr: 6.2003e-06 gnorm: 0.37 [2 days, 1:58:55<23:16:42] +[titan] 2025-09-09 19:34:55,792 - root - INFO - step: 27295 loss: 2.6356 memory: 122.04GiB(87.57%) tps: 10,284 tflops: 490.13 mfu: 49.56% global_avg_ntp_loss: 0.7307 global_avg_top_loss: 1.9048 +[titan] 2025-09-09 19:34:55,792 - root - INFO - lr: 6.1973e-06 gnorm: 0.41 [2 days, 1:59:27<23:16:09] +[titan] 2025-09-09 19:35:21,453 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:35:27,806 - root - INFO - step: 27300 loss: 2.6964 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.83 mfu: 49.33% global_avg_ntp_loss: 0.7595 global_avg_top_loss: 1.9370 +[titan] 2025-09-09 19:35:27,806 - root - INFO - lr: 6.1943e-06 gnorm: 0.37 [2 days, 1:59:59<23:15:35] +[titan] 2025-09-09 19:35:59,660 - root - INFO - step: 27305 loss: 2.7542 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.27 mfu: 49.57% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9683 +[titan] 2025-09-09 19:35:59,661 - root - INFO - lr: 6.1912e-06 gnorm: 0.37 [2 days, 2:00:30<23:15:02] +[titan] 2025-09-09 19:36:31,539 - root - INFO - step: 27310 loss: 2.7520 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.91 mfu: 49.54% global_avg_ntp_loss: 0.7791 global_avg_top_loss: 1.9729 +[titan] 2025-09-09 19:36:31,539 - root - INFO - lr: 6.1882e-06 gnorm: 0.38 [2 days, 2:01:02<23:14:28] +[titan] 2025-09-09 19:37:03,781 - root - INFO - step: 27315 loss: 2.6413 memory: 122.04GiB(87.57%) tps: 10,163 tflops: 484.37 mfu: 48.98% global_avg_ntp_loss: 0.7354 global_avg_top_loss: 1.9059 +[titan] 2025-09-09 19:37:03,782 - root - INFO - lr: 6.1852e-06 gnorm: 0.38 [2 days, 2:01:35<23:13:55] +[titan] 2025-09-09 19:37:35,889 - root - INFO - step: 27320 loss: 2.7525 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.41 mfu: 49.18% global_avg_ntp_loss: 0.7833 global_avg_top_loss: 1.9692 +[titan] 2025-09-09 19:37:35,889 - root - INFO - lr: 6.1822e-06 gnorm: 0.38 [2 days, 2:02:07<23:13:22] +[titan] 2025-09-09 19:38:08,291 - root - INFO - step: 27325 loss: 2.7362 memory: 122.04GiB(87.57%) tps: 10,113 tflops: 481.98 mfu: 48.73% global_avg_ntp_loss: 0.7729 global_avg_top_loss: 1.9632 +[titan] 2025-09-09 19:38:08,292 - root - INFO - lr: 6.1792e-06 gnorm: 0.37 [2 days, 2:02:39<23:12:48] +[titan] 2025-09-09 19:38:40,424 - root - INFO - step: 27330 loss: 2.7705 memory: 122.04GiB(87.57%) tps: 10,198 tflops: 486.03 mfu: 49.14% global_avg_ntp_loss: 0.7971 global_avg_top_loss: 1.9734 +[titan] 
2025-09-09 19:38:40,424 - root - INFO - lr: 6.1762e-06 gnorm: 0.37 [2 days, 2:03:11<23:12:15] +[titan] 2025-09-09 19:39:12,592 - root - INFO - step: 27335 loss: 3.6722 memory: 122.04GiB(87.57%) tps: 10,187 tflops: 485.49 mfu: 49.09% global_avg_ntp_loss: 1.3130 global_avg_top_loss: 2.3592 +[titan] 2025-09-09 19:39:12,593 - root - INFO - lr: 6.1731e-06 gnorm: 0.42 [2 days, 2:03:43<23:11:42] +[titan] 2025-09-09 19:39:44,432 - root - INFO - step: 27340 loss: 2.6653 memory: 122.04GiB(87.57%) tps: 10,292 tflops: 490.49 mfu: 49.59% global_avg_ntp_loss: 0.7394 global_avg_top_loss: 1.9259 +[titan] 2025-09-09 19:39:44,433 - root - INFO - lr: 6.1701e-06 gnorm: 0.38 [2 days, 2:04:15<23:11:08] +[titan] 2025-09-09 19:40:16,442 - root - INFO - step: 27345 loss: 2.7383 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.90 mfu: 49.33% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9637 +[titan] 2025-09-09 19:40:16,443 - root - INFO - lr: 6.1671e-06 gnorm: 0.37 [2 days, 2:04:47<23:10:35] +[titan] 2025-09-09 19:40:42,013 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:40:48,530 - root - INFO - step: 27350 loss: 3.2440 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 1.0583 global_avg_top_loss: 2.1858 +[titan] 2025-09-09 19:40:48,531 - root - INFO - lr: 6.1641e-06 gnorm: 0.38 [2 days, 2:05:19<23:10:02] +[titan] 2025-09-09 19:41:20,522 - root - INFO - step: 27355 loss: 3.2478 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 1.0625 global_avg_top_loss: 2.1853 +[titan] 2025-09-09 19:41:20,523 - root - INFO - lr: 6.1611e-06 gnorm: 0.39 [2 days, 2:05:51<23:09:28] +[titan] 2025-09-09 19:41:52,521 - root - INFO - step: 27360 loss: 2.7529 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7827 global_avg_top_loss: 1.9702 +[titan] 2025-09-09 19:41:52,522 - root - INFO - lr: 6.1581e-06 gnorm: 0.39 [2 days, 2:06:23<23:08:55] +[titan] 2025-09-09 19:42:24,369 - root - INFO - step: 27365 loss: 2.6660 memory: 122.04GiB(87.57%) tps: 10,289 tflops: 490.38 mfu: 49.58% global_avg_ntp_loss: 0.7412 global_avg_top_loss: 1.9247 +[titan] 2025-09-09 19:42:24,370 - root - INFO - lr: 6.1551e-06 gnorm: 0.37 [2 days, 2:06:55<23:08:21] +[titan] 2025-09-09 19:42:56,638 - root - INFO - step: 27370 loss: 2.6101 memory: 122.04GiB(87.57%) tps: 10,155 tflops: 483.99 mfu: 48.94% global_avg_ntp_loss: 0.7181 global_avg_top_loss: 1.8921 +[titan] 2025-09-09 19:42:56,638 - root - INFO - lr: 6.1521e-06 gnorm: 0.36 [2 days, 2:07:27<23:07:48] +[titan] 2025-09-09 19:43:28,559 - root - INFO - step: 27375 loss: 2.7820 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9888 +[titan] 2025-09-09 19:43:28,560 - root - INFO - lr: 6.1491e-06 gnorm: 0.55 [2 days, 2:07:59<23:07:14] +[titan] 2025-09-09 19:44:00,626 - root - INFO - step: 27380 loss: 2.7627 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7883 global_avg_top_loss: 1.9744 +[titan] 
2025-09-09 19:44:00,627 - root - INFO - lr: 6.1461e-06 gnorm: 0.39 [2 days, 2:08:31<23:06:41] +[titan] 2025-09-09 19:44:32,575 - root - INFO - step: 27385 loss: 2.6749 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7450 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 19:44:32,576 - root - INFO - lr: 6.1431e-06 gnorm: 0.38 [2 days, 2:09:03<23:06:08] +[titan] 2025-09-09 19:45:04,823 - root - INFO - step: 27390 loss: 2.7189 memory: 122.04GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.7633 global_avg_top_loss: 1.9556 +[titan] 2025-09-09 19:45:04,823 - root - INFO - lr: 6.1400e-06 gnorm: 0.40 [2 days, 2:09:36<23:05:34] +[titan] 2025-09-09 19:45:36,726 - root - INFO - step: 27395 loss: 2.6272 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7262 global_avg_top_loss: 1.9010 +[titan] 2025-09-09 19:45:36,727 - root - INFO - lr: 6.1370e-06 gnorm: 0.38 [2 days, 2:10:08<23:05:01] +[titan] 2025-09-09 19:46:02,254 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:46:08,670 - root - INFO - step: 27400 loss: 3.2022 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.90 mfu: 49.43% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1635 +[titan] 2025-09-09 19:46:08,671 - root - INFO - lr: 6.1340e-06 gnorm: 0.39 [2 days, 2:10:39<23:04:28] +[titan] 2025-09-09 19:46:40,734 - root - INFO - step: 27405 loss: 2.7130 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.08 mfu: 49.25% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9487 +[titan] 2025-09-09 19:46:40,734 - root - INFO - lr: 6.1310e-06 gnorm: 0.37 [2 days, 2:11:12<23:03:54] +[titan] 2025-09-09 19:47:12,686 - root - INFO - step: 27410 loss: 2.7198 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9552 +[titan] 2025-09-09 19:47:12,686 - root - INFO - lr: 6.1280e-06 gnorm: 0.37 [2 days, 2:11:43<23:03:21] +[titan] 2025-09-09 19:47:44,812 - root - INFO - step: 27415 loss: 3.0966 memory: 122.04GiB(87.57%) tps: 10,200 tflops: 486.13 mfu: 49.15% global_avg_ntp_loss: 0.9709 global_avg_top_loss: 2.1257 +[titan] 2025-09-09 19:47:44,812 - root - INFO - lr: 6.1250e-06 gnorm: 0.38 [2 days, 2:12:16<23:02:47] +[titan] 2025-09-09 19:48:16,814 - root - INFO - step: 27420 loss: 2.7303 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7730 global_avg_top_loss: 1.9573 +[titan] 2025-09-09 19:48:16,815 - root - INFO - lr: 6.1220e-06 gnorm: 0.37 [2 days, 2:12:48<23:02:14] +[titan] 2025-09-09 19:48:48,824 - root - INFO - step: 27425 loss: 2.6604 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7384 global_avg_top_loss: 1.9220 +[titan] 2025-09-09 19:48:48,825 - root - INFO - lr: 6.1190e-06 gnorm: 0.36 [2 days, 2:13:20<23:01:41] +[titan] 2025-09-09 19:49:20,980 - root - INFO - step: 27430 loss: 3.2045 memory: 122.04GiB(87.57%) tps: 10,191 tflops: 485.68 mfu: 49.11% global_avg_ntp_loss: 1.0378 global_avg_top_loss: 2.1667 +[titan] 
2025-09-09 19:49:20,980 - root - INFO - lr: 6.1160e-06 gnorm: 0.39 [2 days, 2:13:52<23:01:07] +[titan] 2025-09-09 19:49:53,147 - root - INFO - step: 27435 loss: 2.6938 memory: 122.04GiB(87.57%) tps: 10,187 tflops: 485.51 mfu: 49.09% global_avg_ntp_loss: 0.7552 global_avg_top_loss: 1.9387 +[titan] 2025-09-09 19:49:53,148 - root - INFO - lr: 6.1130e-06 gnorm: 0.37 [2 days, 2:14:24<23:00:34] +[titan] 2025-09-09 19:50:25,231 - root - INFO - step: 27440 loss: 2.7039 memory: 122.04GiB(87.57%) tps: 10,214 tflops: 486.77 mfu: 49.22% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 19:50:25,231 - root - INFO - lr: 6.1100e-06 gnorm: 0.38 [2 days, 2:14:56<23:00:01] +[titan] 2025-09-09 19:50:57,335 - root - INFO - step: 27445 loss: 2.6224 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.46 mfu: 49.19% global_avg_ntp_loss: 0.7239 global_avg_top_loss: 1.8986 +[titan] 2025-09-09 19:50:57,335 - root - INFO - lr: 6.1070e-06 gnorm: 0.36 [2 days, 2:15:28<22:59:27] +[titan] 2025-09-09 19:51:23,014 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 19:51:29,568 - root - INFO - step: 27450 loss: 2.7362 memory: 122.04GiB(87.57%) tps: 10,166 tflops: 484.52 mfu: 48.99% global_avg_ntp_loss: 0.7747 global_avg_top_loss: 1.9615 +[titan] 2025-09-09 19:51:29,568 - root - INFO - lr: 6.1040e-06 gnorm: 0.38 [2 days, 2:16:00<22:58:54] +[titan] 2025-09-09 19:52:01,614 - root - INFO - step: 27455 loss: 2.7540 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7794 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 19:52:01,614 - root - INFO - lr: 6.1011e-06 gnorm: 0.50 [2 days, 2:16:32<22:58:20] +[titan] 2025-09-09 19:52:33,610 - root - INFO - step: 27460 loss: 2.7341 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7713 global_avg_top_loss: 1.9628 +[titan] 2025-09-09 19:52:33,611 - root - INFO - lr: 6.0981e-06 gnorm: 0.39 [2 days, 2:17:04<22:57:47] +[titan] 2025-09-09 19:53:05,588 - root - INFO - step: 27465 loss: 2.6968 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.39 mfu: 49.38% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 19:53:05,588 - root - INFO - lr: 6.0951e-06 gnorm: 0.39 [2 days, 2:17:36<22:57:14] +[titan] 2025-09-09 19:53:37,733 - root - INFO - step: 27470 loss: 2.7709 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.7893 global_avg_top_loss: 1.9816 +[titan] 2025-09-09 19:53:37,734 - root - INFO - lr: 6.0921e-06 gnorm: 0.40 [2 days, 2:18:08<22:56:40] +[titan] 2025-09-09 19:54:09,955 - root - INFO - step: 27475 loss: 2.6824 memory: 122.04GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.7459 global_avg_top_loss: 1.9365 +[titan] 2025-09-09 19:54:09,956 - root - INFO - lr: 6.0891e-06 gnorm: 0.37 [2 days, 2:18:41<22:56:07] +[titan] 2025-09-09 19:54:42,002 - root - INFO - step: 27480 loss: 2.7159 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7651 global_avg_top_loss: 1.9508 +[titan] 
2025-09-09 19:54:42,002 - root - INFO - lr: 6.0861e-06 gnorm: 0.37 [2 days, 2:19:13<22:55:34] +[titan] 2025-09-09 19:55:14,137 - root - INFO - step: 27485 loss: 2.6776 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 486.00 mfu: 49.14% global_avg_ntp_loss: 0.7357 global_avg_top_loss: 1.9419 +[titan] 2025-09-09 19:55:14,137 - root - INFO - lr: 6.0831e-06 gnorm: 0.66 [2 days, 2:19:45<22:55:00] +[titan] 2025-09-09 19:55:46,147 - root - INFO - step: 27490 loss: 2.7540 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9734 +[titan] 2025-09-09 19:55:46,148 - root - INFO - lr: 6.0801e-06 gnorm: 0.37 [2 days, 2:20:17<22:54:27] +[titan] 2025-09-09 19:56:18,093 - root - INFO - step: 27495 loss: 2.6953 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.88 mfu: 49.43% global_avg_ntp_loss: 0.7540 global_avg_top_loss: 1.9413 +[titan] 2025-09-09 19:56:18,093 - root - INFO - lr: 6.0771e-06 gnorm: 0.38 [2 days, 2:20:49<22:53:54] +[titan] 2025-09-09 19:56:43,689 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 19:56:50,109 - root - INFO - step: 27500 loss: 2.6832 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.80 mfu: 49.32% global_avg_ntp_loss: 0.7485 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 19:56:50,110 - root - INFO - lr: 6.0741e-06 gnorm: 0.37 [2 days, 2:21:21<22:53:20] +[titan] 2025-09-09 19:57:22,269 - root - INFO - step: 27505 loss: 2.6244 memory: 122.04GiB(87.57%) tps: 10,189 tflops: 485.61 mfu: 49.10% global_avg_ntp_loss: 0.7235 global_avg_top_loss: 1.9009 +[titan] 2025-09-09 19:57:22,270 - root - INFO - lr: 6.0711e-06 gnorm: 0.36 [2 days, 2:21:53<22:52:47] +[titan] 2025-09-09 19:57:54,296 - root - INFO - step: 27510 loss: 3.1628 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.64 mfu: 49.31% global_avg_ntp_loss: 1.0215 global_avg_top_loss: 2.1413 +[titan] 2025-09-09 19:57:54,296 - root - INFO - lr: 6.0682e-06 gnorm: 0.37 [2 days, 2:22:25<22:52:13] +[titan] 2025-09-09 19:58:26,527 - root - INFO - step: 27515 loss: 2.7030 memory: 122.04GiB(87.57%) tps: 10,167 tflops: 484.55 mfu: 48.99% global_avg_ntp_loss: 0.7594 global_avg_top_loss: 1.9436 +[titan] 2025-09-09 19:58:26,527 - root - INFO - lr: 6.0652e-06 gnorm: 0.44 [2 days, 2:22:57<22:51:40] +[titan] 2025-09-09 19:58:58,587 - root - INFO - step: 27520 loss: 2.6953 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.13 mfu: 49.25% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9416 +[titan] 2025-09-09 19:58:58,587 - root - INFO - lr: 6.0622e-06 gnorm: 0.41 [2 days, 2:23:29<22:51:07] +[titan] 2025-09-09 19:59:30,617 - root - INFO - step: 27525 loss: 2.6840 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.59 mfu: 49.30% global_avg_ntp_loss: 0.7492 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 19:59:30,617 - root - INFO - lr: 6.0592e-06 gnorm: 0.38 [2 days, 2:24:01<22:50:33] +[titan] 2025-09-09 20:00:02,763 - root - INFO - step: 27530 loss: 2.7763 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.83 mfu: 49.12% global_avg_ntp_loss: 0.7919 global_avg_top_loss: 1.9844 +[titan] 
2025-09-09 20:00:02,763 - root - INFO - lr: 6.0562e-06 gnorm: 0.38 [2 days, 2:24:34<22:50:00] +[titan] 2025-09-09 20:00:34,732 - root - INFO - step: 27535 loss: 2.7301 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.51 mfu: 49.39% global_avg_ntp_loss: 0.7703 global_avg_top_loss: 1.9598 +[titan] 2025-09-09 20:00:34,733 - root - INFO - lr: 6.0532e-06 gnorm: 0.42 [2 days, 2:25:05<22:49:27] +[titan] 2025-09-09 20:01:06,792 - root - INFO - step: 27540 loss: 2.7005 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.7577 global_avg_top_loss: 1.9428 +[titan] 2025-09-09 20:01:06,792 - root - INFO - lr: 6.0503e-06 gnorm: 0.38 [2 days, 2:25:38<22:48:53] +[titan] 2025-09-09 20:01:38,813 - root - INFO - step: 27545 loss: 3.2007 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.72 mfu: 49.31% global_avg_ntp_loss: 1.0387 global_avg_top_loss: 2.1619 +[titan] 2025-09-09 20:01:38,813 - root - INFO - lr: 6.0473e-06 gnorm: 0.37 [2 days, 2:26:10<22:48:20] +[titan] 2025-09-09 20:02:04,423 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. 
+[titan] 2025-09-09 20:02:10,844 - root - INFO - step: 27550 loss: 2.7095 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.57 mfu: 49.30% global_avg_ntp_loss: 0.7634 global_avg_top_loss: 1.9461 +[titan] 2025-09-09 20:02:10,845 - root - INFO - lr: 6.0443e-06 gnorm: 0.36 [2 days, 2:26:42<22:47:47] +[titan] 2025-09-09 20:02:43,093 - root - INFO - step: 27555 loss: 2.6413 memory: 122.04GiB(87.57%) tps: 10,161 tflops: 484.28 mfu: 48.97% global_avg_ntp_loss: 0.7298 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 20:02:43,094 - root - INFO - lr: 6.0413e-06 gnorm: 0.38 [2 days, 2:27:14<22:47:13] +[titan] 2025-09-09 20:03:15,158 - root - INFO - step: 27560 loss: 2.7785 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.05 mfu: 49.25% global_avg_ntp_loss: 0.7945 global_avg_top_loss: 1.9840 +[titan] 2025-09-09 20:03:15,159 - root - INFO - lr: 6.0383e-06 gnorm: 0.38 [2 days, 2:27:46<22:46:40] +[titan] 2025-09-09 20:03:47,318 - root - INFO - step: 27565 loss: 2.6802 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.7481 global_avg_top_loss: 1.9320 +[titan] 2025-09-09 20:03:47,318 - root - INFO - lr: 6.0354e-06 gnorm: 0.36 [2 days, 2:28:18<22:46:07] +[titan] 2025-09-09 20:04:19,472 - root - INFO - step: 27570 loss: 2.7005 memory: 122.04GiB(87.57%) tps: 10,191 tflops: 485.71 mfu: 49.11% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9425 +[titan] 2025-09-09 20:04:19,472 - root - INFO - lr: 6.0324e-06 gnorm: 0.62 [2 days, 2:28:50<22:45:33] +[titan] 2025-09-09 20:04:51,441 - root - INFO - step: 27575 loss: 2.7155 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.52 mfu: 49.40% global_avg_ntp_loss: 0.7669 global_avg_top_loss: 1.9486 +[titan] 2025-09-09 20:04:51,441 - root - INFO - lr: 6.0294e-06 gnorm: 0.37 [2 days, 2:29:22<22:45:00] +[titan] 2025-09-09 20:05:23,276 - root - INFO - step: 27580 loss: 2.7277 memory: 122.04GiB(87.57%) tps: 10,293 tflops: 490.57 mfu: 49.60% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9577 +[titan] 
2025-09-09 20:05:23,277 - root - INFO - lr: 6.0264e-06 gnorm: 0.36 [2 days, 2:29:54<22:44:26] +[titan] 2025-09-09 20:05:55,367 - root - INFO - step: 27585 loss: 2.7860 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.8004 global_avg_top_loss: 1.9856 +[titan] 2025-09-09 20:05:55,368 - root - INFO - lr: 6.0235e-06 gnorm: 0.42 [2 days, 2:30:26<22:43:53] +[titan] 2025-09-09 20:06:27,596 - root - INFO - step: 27590 loss: 2.6917 memory: 122.04GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.7537 global_avg_top_loss: 1.9379 +[titan] 2025-09-09 20:06:27,597 - root - INFO - lr: 6.0205e-06 gnorm: 0.38 [2 days, 2:30:58<22:43:20] +[titan] 2025-09-09 20:06:59,531 - root - INFO - step: 27595 loss: 2.7512 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7786 global_avg_top_loss: 1.9726 +[titan] 2025-09-09 20:06:59,532 - root - INFO - lr: 6.0175e-06 gnorm: 0.38 [2 days, 2:31:30<22:42:46] +[titan] 2025-09-09 20:07:25,377 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:07:31,769 - root - INFO - step: 27600 loss: 2.6837 memory: 122.04GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 20:07:31,769 - root - INFO - lr: 6.0145e-06 gnorm: 0.38 [2 days, 2:32:03<22:42:13] +[titan] 2025-09-09 20:08:03,866 - root - INFO - step: 27605 loss: 2.7257 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.57 mfu: 49.20% global_avg_ntp_loss: 0.7721 global_avg_top_loss: 1.9536 +[titan] 2025-09-09 20:08:03,867 - root - INFO - lr: 6.0116e-06 gnorm: 0.37 [2 days, 2:32:35<22:41:40] +[titan] 2025-09-09 20:08:35,733 - root - INFO - step: 27610 loss: 2.7324 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.09 mfu: 49.55% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9570 +[titan] 2025-09-09 20:08:35,733 - root - INFO - lr: 6.0086e-06 gnorm: 0.39 [2 days, 2:33:06<22:41:06] +[titan] 2025-09-09 20:09:07,693 - root - INFO - step: 27615 loss: 2.7002 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9454 +[titan] 2025-09-09 20:09:07,694 - root - INFO - lr: 6.0056e-06 gnorm: 0.45 [2 days, 2:33:38<22:40:33] +[titan] 2025-09-09 20:09:39,612 - root - INFO - step: 27620 loss: 2.7725 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.28 mfu: 49.47% global_avg_ntp_loss: 0.7925 global_avg_top_loss: 1.9799 +[titan] 2025-09-09 20:09:39,613 - root - INFO - lr: 6.0027e-06 gnorm: 0.38 [2 days, 2:34:10<22:39:59] +[titan] 2025-09-09 20:10:11,764 - root - INFO - step: 27625 loss: 3.2329 memory: 122.04GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 1.0521 global_avg_top_loss: 2.1808 +[titan] 2025-09-09 20:10:11,765 - root - INFO - lr: 5.9997e-06 gnorm: 0.37 [2 days, 2:34:42<22:39:26] +[titan] 2025-09-09 20:10:43,627 - root - INFO - step: 27630 loss: 2.7781 memory: 122.04GiB(87.57%) tps: 10,284 tflops: 490.14 mfu: 49.56% global_avg_ntp_loss: 0.7998 global_avg_top_loss: 1.9783 +[titan] 
2025-09-09 20:10:43,628 - root - INFO - lr: 5.9967e-06 gnorm: 0.38 [2 days, 2:35:14<22:38:53] +[titan] 2025-09-09 20:11:15,602 - root - INFO - step: 27635 loss: 2.6257 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7219 global_avg_top_loss: 1.9037 +[titan] 2025-09-09 20:11:15,602 - root - INFO - lr: 5.9937e-06 gnorm: 0.36 [2 days, 2:35:46<22:38:19] +[titan] 2025-09-09 20:11:47,610 - root - INFO - step: 27640 loss: 2.6846 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 20:11:47,610 - root - INFO - lr: 5.9908e-06 gnorm: 0.37 [2 days, 2:36:18<22:37:46] +[titan] 2025-09-09 20:12:19,839 - root - INFO - step: 27645 loss: 2.6790 memory: 122.04GiB(87.57%) tps: 10,168 tflops: 484.58 mfu: 49.00% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9322 +[titan] 2025-09-09 20:12:19,839 - root - INFO - lr: 5.9878e-06 gnorm: 0.37 [2 days, 2:36:51<22:37:13] +[titan] 2025-09-09 20:12:39,278 - root - INFO - Dumping profiler traces at step 27648 +[titan] 2025-09-09 20:12:39,335 - root - INFO - Finished dumping profiler traces in 0.06 seconds +[titan] 2025-09-09 20:12:45,702 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:12:52,061 - root - INFO - step: 27650 loss: 2.6148 memory: 122.04GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.7215 global_avg_top_loss: 1.8933 +[titan] 2025-09-09 20:12:52,061 - root - INFO - lr: 5.9849e-06 gnorm: 0.36 [2 days, 2:37:23<22:36:39] +[titan] 2025-09-09 20:13:24,071 - root - INFO - step: 27655 loss: 2.6972 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7571 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 20:13:24,071 - root - INFO - lr: 5.9819e-06 gnorm: 0.37 [2 days, 2:37:55<22:36:06] +[titan] 2025-09-09 20:13:56,082 - root - INFO - step: 27660 loss: 2.6261 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.7168 global_avg_top_loss: 1.9093 +[titan] 2025-09-09 20:13:56,082 - root - INFO - lr: 5.9789e-06 gnorm: 0.45 [2 days, 2:38:27<22:35:33] +[titan] 2025-09-09 20:14:28,102 - root - INFO - step: 27665 loss: 2.7333 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7723 global_avg_top_loss: 1.9610 +[titan] 2025-09-09 20:14:28,103 - root - INFO - lr: 5.9760e-06 gnorm: 0.38 [2 days, 2:38:59<22:34:59] +[titan] 2025-09-09 20:15:00,192 - root - INFO - step: 27670 loss: 2.6630 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.7456 global_avg_top_loss: 1.9174 +[titan] 2025-09-09 20:15:00,192 - root - INFO - lr: 5.9730e-06 gnorm: 0.37 [2 days, 2:39:31<22:34:26] +[titan] 2025-09-09 20:15:31,995 - root - INFO - step: 27675 loss: 2.6408 memory: 122.04GiB(87.57%) tps: 10,304 tflops: 491.06 mfu: 49.65% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.9095 +[titan] 2025-09-09 20:15:31,996 - root - INFO - lr: 5.9700e-06 gnorm: 0.37 [2 days, 2:40:03<22:33:52] +[titan] 2025-09-09 20:16:04,045 - root - INFO - step: 27680 loss: 2.6419 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.28 mfu: 49.27% global_avg_ntp_loss: 0.7316 global_avg_top_loss: 1.9104 +[titan] 
2025-09-09 20:16:04,046 - root - INFO - lr: 5.9671e-06 gnorm: 0.36 [2 days, 2:40:35<22:33:19] +[titan] 2025-09-09 20:16:36,103 - root - INFO - step: 27685 loss: 2.7632 memory: 122.04GiB(87.57%) tps: 10,222 tflops: 487.16 mfu: 49.26% global_avg_ntp_loss: 0.7870 global_avg_top_loss: 1.9761 +[titan] 2025-09-09 20:16:36,104 - root - INFO - lr: 5.9641e-06 gnorm: 0.38 [2 days, 2:41:07<22:32:46] +[titan] 2025-09-09 20:17:08,384 - root - INFO - step: 27690 loss: 2.7392 memory: 122.04GiB(87.57%) tps: 10,151 tflops: 483.80 mfu: 48.92% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9657 +[titan] 2025-09-09 20:17:08,384 - root - INFO - lr: 5.9612e-06 gnorm: 0.38 [2 days, 2:41:39<22:32:12] +[titan] 2025-09-09 20:17:40,218 - root - INFO - step: 27695 loss: 2.7230 memory: 122.04GiB(87.57%) tps: 10,294 tflops: 490.60 mfu: 49.61% global_avg_ntp_loss: 0.7709 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 20:17:40,218 - root - INFO - lr: 5.9582e-06 gnorm: 0.37 [2 days, 2:42:11<22:31:39] +[titan] 2025-09-09 20:18:05,935 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:18:12,389 - root - INFO - step: 27700 loss: 2.6947 memory: 122.04GiB(87.57%) tps: 10,186 tflops: 485.45 mfu: 49.08% global_avg_ntp_loss: 0.7504 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 20:18:12,389 - root - INFO - lr: 5.9552e-06 gnorm: 0.38 [2 days, 2:42:43<22:31:06] +[titan] 2025-09-09 20:18:44,121 - root - INFO - step: 27705 loss: 3.2882 memory: 122.04GiB(87.57%) tps: 10,327 tflops: 492.16 mfu: 49.76% global_avg_ntp_loss: 1.0857 global_avg_top_loss: 2.2025 +[titan] 2025-09-09 20:18:44,122 - root - INFO - lr: 5.9523e-06 gnorm: 0.36 [2 days, 2:43:15<22:30:32] +[titan] 2025-09-09 20:19:16,187 - root - INFO - step: 27710 loss: 2.6179 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7210 global_avg_top_loss: 1.8969 +[titan] 2025-09-09 20:19:16,187 - root - INFO - lr: 5.9493e-06 gnorm: 0.36 [2 days, 2:43:47<22:29:59] +[titan] 2025-09-09 20:19:48,197 - root - INFO - step: 27715 loss: 2.7285 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9546 +[titan] 2025-09-09 20:19:48,198 - root - INFO - lr: 5.9464e-06 gnorm: 0.38 [2 days, 2:44:19<22:29:25] +[titan] 2025-09-09 20:20:20,146 - root - INFO - step: 27720 loss: 2.6512 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7383 global_avg_top_loss: 1.9129 +[titan] 2025-09-09 20:20:20,147 - root - INFO - lr: 5.9434e-06 gnorm: 0.38 [2 days, 2:44:51<22:28:52] +[titan] 2025-09-09 20:20:52,030 - root - INFO - step: 27725 loss: 2.7467 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9697 +[titan] 2025-09-09 20:20:52,030 - root - INFO - lr: 5.9405e-06 gnorm: 0.38 [2 days, 2:45:23<22:28:19] +[titan] 2025-09-09 20:21:23,985 - root - INFO - step: 27730 loss: 2.6245 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7239 global_avg_top_loss: 1.9006 +[titan] 
2025-09-09 20:21:23,985 - root - INFO - lr: 5.9375e-06 gnorm: 0.40 [2 days, 2:45:55<22:27:45] +[titan] 2025-09-09 20:21:56,412 - root - INFO - step: 27735 loss: 2.7080 memory: 122.04GiB(87.57%) tps: 10,105 tflops: 481.62 mfu: 48.70% global_avg_ntp_loss: 0.7666 global_avg_top_loss: 1.9414 +[titan] 2025-09-09 20:21:56,412 - root - INFO - lr: 5.9346e-06 gnorm: 0.38 [2 days, 2:46:27<22:27:12] +[titan] 2025-09-09 20:22:28,310 - root - INFO - step: 27740 loss: 2.6339 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.60 mfu: 49.50% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9045 +[titan] 2025-09-09 20:22:28,310 - root - INFO - lr: 5.9316e-06 gnorm: 0.39 [2 days, 2:46:59<22:26:39] +[titan] 2025-09-09 20:23:00,336 - root - INFO - step: 27745 loss: 2.7238 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.7675 global_avg_top_loss: 1.9563 +[titan] 2025-09-09 20:23:00,336 - root - INFO - lr: 5.9287e-06 gnorm: 0.39 [2 days, 2:47:31<22:26:05] +[titan] 2025-09-09 20:23:25,737 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:23:32,139 - root - INFO - step: 27750 loss: 3.1013 memory: 122.04GiB(87.57%) tps: 10,304 tflops: 491.07 mfu: 49.65% global_avg_ntp_loss: 0.9949 global_avg_top_loss: 2.1064 +[titan] 2025-09-09 20:23:32,139 - root - INFO - lr: 5.9257e-06 gnorm: 0.37 [2 days, 2:48:03<22:25:32] +[titan] 2025-09-09 20:24:04,277 - root - INFO - step: 27755 loss: 2.7528 memory: 122.04GiB(87.57%) tps: 10,196 tflops: 485.94 mfu: 49.13% global_avg_ntp_loss: 0.7814 global_avg_top_loss: 1.9714 +[titan] 2025-09-09 20:24:04,278 - root - INFO - lr: 5.9228e-06 gnorm: 0.38 [2 days, 2:48:35<22:24:58] +[titan] 2025-09-09 20:24:36,239 - root - INFO - step: 27760 loss: 2.6562 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.64 mfu: 49.41% global_avg_ntp_loss: 0.7350 global_avg_top_loss: 1.9213 +[titan] 2025-09-09 20:24:36,239 - root - INFO - lr: 5.9198e-06 gnorm: 0.42 [2 days, 2:49:07<22:24:25] +[titan] 2025-09-09 20:25:08,110 - root - INFO - step: 27765 loss: 2.6901 memory: 122.04GiB(87.57%) tps: 10,282 tflops: 490.01 mfu: 49.55% global_avg_ntp_loss: 0.7548 global_avg_top_loss: 1.9353 +[titan] 2025-09-09 20:25:08,110 - root - INFO - lr: 5.9169e-06 gnorm: 0.38 [2 days, 2:49:39<22:23:52] +[titan] 2025-09-09 20:25:39,934 - root - INFO - step: 27770 loss: 2.7141 memory: 122.04GiB(87.57%) tps: 10,297 tflops: 490.74 mfu: 49.62% global_avg_ntp_loss: 0.7625 global_avg_top_loss: 1.9516 +[titan] 2025-09-09 20:25:39,935 - root - INFO - lr: 5.9139e-06 gnorm: 0.39 [2 days, 2:50:11<22:23:18] +[titan] 2025-09-09 20:26:11,990 - root - INFO - step: 27775 loss: 2.5712 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.20 mfu: 49.26% global_avg_ntp_loss: 0.7013 global_avg_top_loss: 1.8698 +[titan] 2025-09-09 20:26:11,990 - root - INFO - lr: 5.9110e-06 gnorm: 0.37 [2 days, 2:50:43<22:22:45] +[titan] 2025-09-09 20:26:43,981 - root - INFO - step: 27780 loss: 2.6533 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.7368 global_avg_top_loss: 1.9165 +[titan] 
2025-09-09 20:26:43,982 - root - INFO - lr: 5.9080e-06 gnorm: 0.37 [2 days, 2:51:15<22:22:11] +[titan] 2025-09-09 20:27:15,992 - root - INFO - step: 27785 loss: 3.1062 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.88 mfu: 49.33% global_avg_ntp_loss: 0.9976 global_avg_top_loss: 2.1086 +[titan] 2025-09-09 20:27:15,992 - root - INFO - lr: 5.9051e-06 gnorm: 0.35 [2 days, 2:51:47<22:21:38] +[titan] 2025-09-09 20:27:48,058 - root - INFO - step: 27790 loss: 2.6567 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7385 global_avg_top_loss: 1.9182 +[titan] 2025-09-09 20:27:48,058 - root - INFO - lr: 5.9022e-06 gnorm: 0.37 [2 days, 2:52:19<22:21:05] +[titan] 2025-09-09 20:28:20,124 - root - INFO - step: 27795 loss: 2.7361 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.04 mfu: 49.25% global_avg_ntp_loss: 0.7737 global_avg_top_loss: 1.9624 +[titan] 2025-09-09 20:28:20,124 - root - INFO - lr: 5.8992e-06 gnorm: 0.40 [2 days, 2:52:51<22:20:31] +[titan] 2025-09-09 20:28:45,736 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:28:52,137 - root - INFO - step: 27800 loss: 2.7212 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9533 +[titan] 2025-09-09 20:28:52,138 - root - INFO - lr: 5.8963e-06 gnorm: 0.38 [2 days, 2:53:23<22:19:58] +[titan] 2025-09-09 20:29:24,069 - root - INFO - step: 27805 loss: 2.7507 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9700 +[titan] 2025-09-09 20:29:24,069 - root - INFO - lr: 5.8933e-06 gnorm: 0.41 [2 days, 2:53:55<22:19:25] +[titan] 2025-09-09 20:29:56,045 - root - INFO - step: 27810 loss: 2.6492 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.38% global_avg_ntp_loss: 0.7346 global_avg_top_loss: 1.9146 +[titan] 2025-09-09 20:29:56,045 - root - INFO - lr: 5.8904e-06 gnorm: 0.39 [2 days, 2:54:27<22:18:51] +[titan] 2025-09-09 20:30:27,920 - root - INFO - step: 27815 loss: 2.7739 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7886 global_avg_top_loss: 1.9853 +[titan] 2025-09-09 20:30:27,920 - root - INFO - lr: 5.8875e-06 gnorm: 0.43 [2 days, 2:54:59<22:18:18] +[titan] 2025-09-09 20:30:59,941 - root - INFO - step: 27820 loss: 2.7760 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7907 global_avg_top_loss: 1.9852 +[titan] 2025-09-09 20:30:59,941 - root - INFO - lr: 5.8845e-06 gnorm: 0.47 [2 days, 2:55:31<22:17:45] +[titan] 2025-09-09 20:31:31,956 - root - INFO - step: 27825 loss: 2.6948 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.7546 global_avg_top_loss: 1.9402 +[titan] 2025-09-09 20:31:31,956 - root - INFO - lr: 5.8816e-06 gnorm: 0.40 [2 days, 2:56:03<22:17:11] +[titan] 2025-09-09 20:32:04,192 - root - INFO - step: 27830 loss: 2.7136 memory: 122.04GiB(87.57%) tps: 10,165 tflops: 484.48 mfu: 48.99% global_avg_ntp_loss: 0.7647 global_avg_top_loss: 1.9489 +[titan] 
2025-09-09 20:32:04,192 - root - INFO - lr: 5.8787e-06 gnorm: 0.38 [2 days, 2:56:35<22:16:38] +[titan] 2025-09-09 20:32:36,176 - root - INFO - step: 27835 loss: 2.6755 memory: 122.04GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.7463 global_avg_top_loss: 1.9292 +[titan] 2025-09-09 20:32:36,177 - root - INFO - lr: 5.8757e-06 gnorm: 0.37 [2 days, 2:57:07<22:16:05] +[titan] 2025-09-09 20:33:08,157 - root - INFO - step: 27840 loss: 2.7392 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.34 mfu: 49.38% global_avg_ntp_loss: 0.7754 global_avg_top_loss: 1.9638 +[titan] 2025-09-09 20:33:08,158 - root - INFO - lr: 5.8728e-06 gnorm: 0.41 [2 days, 2:57:39<22:15:31] +[titan] 2025-09-09 20:33:39,972 - root - INFO - step: 27845 loss: 2.7575 memory: 122.04GiB(87.57%) tps: 10,300 tflops: 490.89 mfu: 49.63% global_avg_ntp_loss: 0.7808 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 20:33:39,972 - root - INFO - lr: 5.8698e-06 gnorm: 0.41 [2 days, 2:58:11<22:14:58] +[titan] 2025-09-09 20:34:05,443 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:34:11,851 - root - INFO - step: 27850 loss: 2.6612 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.89 mfu: 49.53% global_avg_ntp_loss: 0.7407 global_avg_top_loss: 1.9204 +[titan] 2025-09-09 20:34:11,852 - root - INFO - lr: 5.8669e-06 gnorm: 0.38 [2 days, 2:58:43<22:14:24] +[titan] 2025-09-09 20:34:43,833 - root - INFO - step: 27855 loss: 2.7084 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7628 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 20:34:43,833 - root - INFO - lr: 5.8640e-06 gnorm: 0.37 [2 days, 2:59:15<22:13:51] +[titan] 2025-09-09 20:35:15,730 - root - INFO - step: 27860 loss: 2.6756 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.62 mfu: 49.51% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9307 +[titan] 2025-09-09 20:35:15,730 - root - INFO - lr: 5.8611e-06 gnorm: 0.38 [2 days, 2:59:46<22:13:18] +[titan] 2025-09-09 20:35:47,711 - root - INFO - step: 27865 loss: 2.6445 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 20:35:47,711 - root - INFO - lr: 5.8581e-06 gnorm: 0.36 [2 days, 3:00:18<22:12:44] +[titan] 2025-09-09 20:36:19,554 - root - INFO - step: 27870 loss: 2.6463 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7369 global_avg_top_loss: 1.9094 +[titan] 2025-09-09 20:36:19,554 - root - INFO - lr: 5.8552e-06 gnorm: 0.37 [2 days, 3:00:50<22:12:11] +[titan] 2025-09-09 20:36:51,375 - root - INFO - step: 27875 loss: 2.5530 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.78 mfu: 49.62% global_avg_ntp_loss: 0.6897 global_avg_top_loss: 1.8633 +[titan] 2025-09-09 20:36:51,376 - root - INFO - lr: 5.8523e-06 gnorm: 0.38 [2 days, 3:01:22<22:11:37] +[titan] 2025-09-09 20:37:23,101 - root - INFO - step: 27880 loss: 2.7208 memory: 122.04GiB(87.57%) tps: 10,329 tflops: 492.27 mfu: 49.77% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9535 +[titan] 
2025-09-09 20:37:23,101 - root - INFO - lr: 5.8493e-06 gnorm: 0.38 [2 days, 3:01:54<22:11:04] +[titan] 2025-09-09 20:37:55,021 - root - INFO - step: 27885 loss: 2.7644 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.27 mfu: 49.47% global_avg_ntp_loss: 0.7860 global_avg_top_loss: 1.9784 +[titan] 2025-09-09 20:37:55,021 - root - INFO - lr: 5.8464e-06 gnorm: 0.37 [2 days, 3:02:26<22:10:30] +[titan] 2025-09-09 20:38:27,082 - root - INFO - step: 27890 loss: 2.7070 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.12 mfu: 49.25% global_avg_ntp_loss: 0.7628 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 20:38:27,082 - root - INFO - lr: 5.8435e-06 gnorm: 0.39 [2 days, 3:02:58<22:09:57] +[titan] 2025-09-09 20:38:58,877 - root - INFO - step: 27895 loss: 2.6664 memory: 122.04GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9248 +[titan] 2025-09-09 20:38:58,878 - root - INFO - lr: 5.8406e-06 gnorm: 0.40 [2 days, 3:03:30<22:09:24] +[titan] 2025-09-09 20:39:24,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:39:30,830 - root - INFO - step: 27900 loss: 2.7641 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.77 mfu: 49.42% global_avg_ntp_loss: 0.7852 global_avg_top_loss: 1.9788 +[titan] 2025-09-09 20:39:30,831 - root - INFO - lr: 5.8376e-06 gnorm: 0.40 [2 days, 3:04:01<22:08:50] +[titan] 2025-09-09 20:40:02,573 - root - INFO - step: 27905 loss: 2.6812 memory: 122.04GiB(87.57%) tps: 10,323 tflops: 491.99 mfu: 49.75% global_avg_ntp_loss: 0.7510 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 20:40:02,574 - root - INFO - lr: 5.8347e-06 gnorm: 0.39 [2 days, 3:04:33<22:08:17] +[titan] 2025-09-09 20:40:34,425 - root - INFO - step: 27910 loss: 2.7515 memory: 122.04GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9723 +[titan] 2025-09-09 20:40:34,426 - root - INFO - lr: 5.8318e-06 gnorm: 0.38 [2 days, 3:05:05<22:07:43] +[titan] 2025-09-09 20:41:06,221 - root - INFO - step: 27915 loss: 2.6838 memory: 122.04GiB(87.57%) tps: 10,306 tflops: 491.19 mfu: 49.66% global_avg_ntp_loss: 0.7545 global_avg_top_loss: 1.9292 +[titan] 2025-09-09 20:41:06,221 - root - INFO - lr: 5.8289e-06 gnorm: 0.37 [2 days, 3:05:37<22:07:10] +[titan] 2025-09-09 20:41:38,370 - root - INFO - step: 27920 loss: 2.8344 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.78 mfu: 49.12% global_avg_ntp_loss: 0.8100 global_avg_top_loss: 2.0244 +[titan] 2025-09-09 20:41:38,370 - root - INFO - lr: 5.8259e-06 gnorm: 0.46 [2 days, 3:06:09<22:06:37] +[titan] 2025-09-09 20:42:10,335 - root - INFO - step: 27925 loss: 2.6726 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.57 mfu: 49.40% global_avg_ntp_loss: 0.7456 global_avg_top_loss: 1.9271 +[titan] 2025-09-09 20:42:10,335 - root - INFO - lr: 5.8230e-06 gnorm: 0.38 [2 days, 3:06:41<22:06:03] +[titan] 2025-09-09 20:42:42,202 - root - INFO - step: 27930 loss: 2.7168 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.7630 global_avg_top_loss: 1.9538 +[titan] 
2025-09-09 20:42:42,203 - root - INFO - lr: 5.8201e-06 gnorm: 0.37 [2 days, 3:07:13<22:05:30] +[titan] 2025-09-09 20:43:14,355 - root - INFO - step: 27935 loss: 2.7060 memory: 122.04GiB(87.57%) tps: 10,192 tflops: 485.73 mfu: 49.11% global_avg_ntp_loss: 0.7617 global_avg_top_loss: 1.9442 +[titan] 2025-09-09 20:43:14,355 - root - INFO - lr: 5.8172e-06 gnorm: 0.44 [2 days, 3:07:45<22:04:57] +[titan] 2025-09-09 20:43:46,305 - root - INFO - step: 27940 loss: 2.7714 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.81 mfu: 49.42% global_avg_ntp_loss: 0.7948 global_avg_top_loss: 1.9766 +[titan] 2025-09-09 20:43:46,305 - root - INFO - lr: 5.8143e-06 gnorm: 0.40 [2 days, 3:08:17<22:04:23] +[titan] 2025-09-09 20:44:18,549 - root - INFO - step: 27945 loss: 2.6450 memory: 122.04GiB(87.57%) tps: 10,163 tflops: 484.35 mfu: 48.97% global_avg_ntp_loss: 0.7317 global_avg_top_loss: 1.9133 +[titan] 2025-09-09 20:44:18,549 - root - INFO - lr: 5.8113e-06 gnorm: 0.40 [2 days, 3:08:49<22:03:50] +[titan] 2025-09-09 20:44:44,211 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:44:50,578 - root - INFO - step: 27950 loss: 2.5807 memory: 122.04GiB(87.57%) tps: 10,231 tflops: 487.60 mfu: 49.30% global_avg_ntp_loss: 0.7058 global_avg_top_loss: 1.8749 +[titan] 2025-09-09 20:44:50,578 - root - INFO - lr: 5.8084e-06 gnorm: 0.36 [2 days, 3:09:21<22:03:17] +[titan] 2025-09-09 20:45:22,546 - root - INFO - step: 27955 loss: 2.5778 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.53 mfu: 49.40% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8769 +[titan] 2025-09-09 20:45:22,547 - root - INFO - lr: 5.8055e-06 gnorm: 0.38 [2 days, 3:09:53<22:02:43] +[titan] 2025-09-09 20:45:54,501 - root - INFO - step: 27960 loss: 2.7177 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.74 mfu: 49.42% global_avg_ntp_loss: 0.7674 global_avg_top_loss: 1.9503 +[titan] 2025-09-09 20:45:54,502 - root - INFO - lr: 5.8026e-06 gnorm: 0.39 [2 days, 3:10:25<22:02:10] +[titan] 2025-09-09 20:46:26,433 - root - INFO - step: 27965 loss: 2.7119 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.09 mfu: 49.45% global_avg_ntp_loss: 0.7644 global_avg_top_loss: 1.9475 +[titan] 2025-09-09 20:46:26,433 - root - INFO - lr: 5.7997e-06 gnorm: 0.38 [2 days, 3:10:57<22:01:36] +[titan] 2025-09-09 20:46:58,521 - root - INFO - step: 27970 loss: 2.6835 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.71 mfu: 49.21% global_avg_ntp_loss: 0.7488 global_avg_top_loss: 1.9347 +[titan] 2025-09-09 20:46:58,521 - root - INFO - lr: 5.7968e-06 gnorm: 0.37 [2 days, 3:11:29<22:01:03] +[titan] 2025-09-09 20:47:30,572 - root - INFO - step: 27975 loss: 2.7375 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7756 global_avg_top_loss: 1.9619 +[titan] 2025-09-09 20:47:30,572 - root - INFO - lr: 5.7939e-06 gnorm: 0.37 [2 days, 3:12:01<22:00:30] +[titan] 2025-09-09 20:48:02,632 - root - INFO - step: 27980 loss: 2.6951 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.7562 global_avg_top_loss: 1.9389 +[titan] 
2025-09-09 20:48:02,632 - root - INFO - lr: 5.7909e-06 gnorm: 0.36 [2 days, 3:12:33<21:59:57] +[titan] 2025-09-09 20:48:34,549 - root - INFO - step: 27985 loss: 2.7530 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7831 global_avg_top_loss: 1.9699 +[titan] 2025-09-09 20:48:34,550 - root - INFO - lr: 5.7880e-06 gnorm: 0.39 [2 days, 3:13:05<21:59:23] +[titan] 2025-09-09 20:49:06,345 - root - INFO - step: 27990 loss: 2.7218 memory: 122.04GiB(87.57%) tps: 10,306 tflops: 491.18 mfu: 49.66% global_avg_ntp_loss: 0.7642 global_avg_top_loss: 1.9576 +[titan] 2025-09-09 20:49:06,346 - root - INFO - lr: 5.7851e-06 gnorm: 0.39 [2 days, 3:13:37<21:58:50] +[titan] 2025-09-09 20:49:38,332 - root - INFO - step: 27995 loss: 2.6885 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 20:49:38,333 - root - INFO - lr: 5.7822e-06 gnorm: 0.39 [2 days, 3:14:09<21:58:16] +[titan] 2025-09-09 20:50:03,918 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:50:10,289 - root - INFO - step: 28000 loss: 3.0002 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.71 mfu: 49.41% global_avg_ntp_loss: 0.9230 global_avg_top_loss: 2.0772 +[titan] 2025-09-09 20:50:10,289 - root - INFO - lr: 5.7793e-06 gnorm: 0.37 [2 days, 3:14:41<21:57:43] +[titan] 2025-09-09 20:50:42,195 - root - INFO - step: 28005 loss: 2.7525 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.49 mfu: 49.49% global_avg_ntp_loss: 0.7789 global_avg_top_loss: 1.9736 +[titan] 2025-09-09 20:50:42,195 - root - INFO - lr: 5.7764e-06 gnorm: 0.38 [2 days, 3:15:13<21:57:10] +[titan] 2025-09-09 20:51:14,076 - root - INFO - step: 28010 loss: 2.6766 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7462 global_avg_top_loss: 1.9304 +[titan] 2025-09-09 20:51:14,076 - root - INFO - lr: 5.7735e-06 gnorm: 0.36 [2 days, 3:15:45<21:56:36] +[titan] 2025-09-09 20:51:46,090 - root - INFO - step: 28015 loss: 2.7187 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.82 mfu: 49.32% global_avg_ntp_loss: 0.7629 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 20:51:46,091 - root - INFO - lr: 5.7706e-06 gnorm: 0.37 [2 days, 3:16:17<21:56:03] +[titan] 2025-09-09 20:52:18,012 - root - INFO - step: 28020 loss: 2.7724 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.8058 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 20:52:18,012 - root - INFO - lr: 5.7677e-06 gnorm: 0.41 [2 days, 3:16:49<21:55:29] +[titan] 2025-09-09 20:52:49,997 - root - INFO - step: 28025 loss: 2.6849 memory: 122.04GiB(87.57%) tps: 10,245 tflops: 488.28 mfu: 49.37% global_avg_ntp_loss: 0.7529 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 20:52:49,997 - root - INFO - lr: 5.7648e-06 gnorm: 0.44 [2 days, 3:17:21<21:54:56] +[titan] 2025-09-09 20:53:21,970 - root - INFO - step: 28030 loss: 2.6014 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.45 mfu: 49.39% global_avg_ntp_loss: 0.7126 global_avg_top_loss: 1.8888 +[titan] 
2025-09-09 20:53:21,970 - root - INFO - lr: 5.7619e-06 gnorm: 0.36 [2 days, 3:17:53<21:54:23] +[titan] 2025-09-09 20:53:54,070 - root - INFO - step: 28035 loss: 2.7625 memory: 122.04GiB(87.57%) tps: 10,208 tflops: 486.53 mfu: 49.19% global_avg_ntp_loss: 0.8031 global_avg_top_loss: 1.9594 +[titan] 2025-09-09 20:53:54,070 - root - INFO - lr: 5.7590e-06 gnorm: 0.38 [2 days, 3:18:25<21:53:49] +[titan] 2025-09-09 20:54:26,026 - root - INFO - step: 28040 loss: 2.6993 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7587 global_avg_top_loss: 1.9407 +[titan] 2025-09-09 20:54:26,026 - root - INFO - lr: 5.7561e-06 gnorm: 0.40 [2 days, 3:18:57<21:53:16] +[titan] 2025-09-09 20:54:58,026 - root - INFO - step: 28045 loss: 2.6724 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.04 mfu: 49.35% global_avg_ntp_loss: 0.7497 global_avg_top_loss: 1.9227 +[titan] 2025-09-09 20:54:58,026 - root - INFO - lr: 5.7532e-06 gnorm: 0.38 [2 days, 3:19:29<21:52:43] +[titan] 2025-09-09 20:55:23,613 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 20:55:30,087 - root - INFO - step: 28050 loss: 2.7747 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.11 mfu: 49.25% global_avg_ntp_loss: 0.7913 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 20:55:30,088 - root - INFO - lr: 5.7503e-06 gnorm: 0.37 [2 days, 3:20:01<21:52:09] +[titan] 2025-09-09 20:56:01,944 - root - INFO - step: 28055 loss: 3.0967 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.24 mfu: 49.57% global_avg_ntp_loss: 0.9778 global_avg_top_loss: 2.1190 +[titan] 2025-09-09 20:56:01,945 - root - INFO - lr: 5.7474e-06 gnorm: 0.44 [2 days, 3:20:33<21:51:36] +[titan] 2025-09-09 20:56:33,948 - root - INFO - step: 28060 loss: 2.6766 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7476 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 20:56:33,948 - root - INFO - lr: 5.7445e-06 gnorm: 0.38 [2 days, 3:21:05<21:51:03] +[titan] 2025-09-09 20:57:06,122 - root - INFO - step: 28065 loss: 2.6681 memory: 122.04GiB(87.57%) tps: 10,185 tflops: 485.39 mfu: 49.08% global_avg_ntp_loss: 0.7432 global_avg_top_loss: 1.9250 +[titan] 2025-09-09 20:57:06,123 - root - INFO - lr: 5.7416e-06 gnorm: 0.37 [2 days, 3:21:37<21:50:29] +[titan] 2025-09-09 20:57:38,034 - root - INFO - step: 28070 loss: 2.6305 memory: 122.04GiB(87.57%) tps: 10,269 tflops: 489.40 mfu: 49.48% global_avg_ntp_loss: 0.7281 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 20:57:38,034 - root - INFO - lr: 5.7387e-06 gnorm: 0.36 [2 days, 3:22:09<21:49:56] +[titan] 2025-09-09 20:58:10,127 - root - INFO - step: 28075 loss: 2.7710 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.63 mfu: 49.20% global_avg_ntp_loss: 0.7932 global_avg_top_loss: 1.9779 +[titan] 2025-09-09 20:58:10,128 - root - INFO - lr: 5.7358e-06 gnorm: 0.38 [2 days, 3:22:41<21:49:23] +[titan] 2025-09-09 20:58:42,068 - root - INFO - step: 28080 loss: 2.7469 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7770 global_avg_top_loss: 1.9699 +[titan] 
2025-09-09 20:58:42,069 - root - INFO - lr: 5.7329e-06 gnorm: 0.39 [2 days, 3:23:13<21:48:49] +[titan] 2025-09-09 20:59:13,973 - root - INFO - step: 28085 loss: 2.6655 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.50 mfu: 49.49% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9250 +[titan] 2025-09-09 20:59:13,973 - root - INFO - lr: 5.7300e-06 gnorm: 0.38 [2 days, 3:23:45<21:48:16] +[titan] 2025-09-09 20:59:45,793 - root - INFO - step: 28090 loss: 2.8909 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.81 mfu: 49.63% global_avg_ntp_loss: 0.8568 global_avg_top_loss: 2.0341 +[titan] 2025-09-09 20:59:45,793 - root - INFO - lr: 5.7271e-06 gnorm: 0.37 [2 days, 3:24:16<21:47:43] +[titan] 2025-09-09 21:00:17,735 - root - INFO - step: 28095 loss: 2.7080 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9481 +[titan] 2025-09-09 21:00:17,735 - root - INFO - lr: 5.7242e-06 gnorm: 0.36 [2 days, 3:24:48<21:47:09] +[titan] 2025-09-09 21:00:43,422 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:00:50,019 - root - INFO - step: 28100 loss: 3.1012 memory: 122.04GiB(87.57%) tps: 10,150 tflops: 483.75 mfu: 48.91% global_avg_ntp_loss: 0.9895 global_avg_top_loss: 2.1117 +[titan] 2025-09-09 21:00:50,020 - root - INFO - lr: 5.7213e-06 gnorm: 0.46 [2 days, 3:25:21<21:46:36] +[titan] 2025-09-09 21:01:21,923 - root - INFO - step: 28105 loss: 2.6933 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 0.7546 global_avg_top_loss: 1.9386 +[titan] 2025-09-09 21:01:21,924 - root - INFO - lr: 5.7184e-06 gnorm: 0.38 [2 days, 3:25:53<21:46:03] +[titan] 2025-09-09 21:01:53,925 - root - INFO - step: 28110 loss: 2.6600 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.03 mfu: 49.35% global_avg_ntp_loss: 0.7388 global_avg_top_loss: 1.9212 +[titan] 2025-09-09 21:01:53,925 - root - INFO - lr: 5.7155e-06 gnorm: 0.39 [2 days, 3:26:25<21:45:29] +[titan] 2025-09-09 21:02:25,994 - root - INFO - step: 28115 loss: 2.7463 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 487.00 mfu: 49.24% global_avg_ntp_loss: 0.7790 global_avg_top_loss: 1.9673 +[titan] 2025-09-09 21:02:25,994 - root - INFO - lr: 5.7126e-06 gnorm: 0.38 [2 days, 3:26:57<21:44:56] +[titan] 2025-09-09 21:02:57,893 - root - INFO - step: 28120 loss: 2.7644 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7905 global_avg_top_loss: 1.9739 +[titan] 2025-09-09 21:02:57,893 - root - INFO - lr: 5.7098e-06 gnorm: 0.38 [2 days, 3:27:29<21:44:23] +[titan] 2025-09-09 21:03:29,958 - root - INFO - step: 28125 loss: 2.7378 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9643 +[titan] 2025-09-09 21:03:29,958 - root - INFO - lr: 5.7069e-06 gnorm: 0.38 [2 days, 3:28:01<21:43:49] +[titan] 2025-09-09 21:04:02,044 - root - INFO - step: 28130 loss: 2.7900 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7982 global_avg_top_loss: 1.9918 +[titan] 
2025-09-09 21:04:02,045 - root - INFO - lr: 5.7040e-06 gnorm: 0.38 [2 days, 3:28:33<21:43:16] +[titan] 2025-09-09 21:04:33,945 - root - INFO - step: 28135 loss: 2.6533 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 0.7391 global_avg_top_loss: 1.9142 +[titan] 2025-09-09 21:04:33,946 - root - INFO - lr: 5.7011e-06 gnorm: 0.38 [2 days, 3:29:05<21:42:43] +[titan] 2025-09-09 21:05:05,868 - root - INFO - step: 28140 loss: 2.7795 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.7961 global_avg_top_loss: 1.9834 +[titan] 2025-09-09 21:05:05,869 - root - INFO - lr: 5.6982e-06 gnorm: 0.39 [2 days, 3:29:36<21:42:09] +[titan] 2025-09-09 21:05:37,771 - root - INFO - step: 28145 loss: 2.7587 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.7843 global_avg_top_loss: 1.9745 +[titan] 2025-09-09 21:05:37,771 - root - INFO - lr: 5.6953e-06 gnorm: 0.40 [2 days, 3:30:08<21:41:36] +[titan] 2025-09-09 21:06:03,272 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:06:09,730 - root - INFO - step: 28150 loss: 2.7301 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7694 global_avg_top_loss: 1.9607 +[titan] 2025-09-09 21:06:09,731 - root - INFO - lr: 5.6924e-06 gnorm: 0.42 [2 days, 3:30:40<21:41:03] +[titan] 2025-09-09 21:06:41,503 - root - INFO - step: 28155 loss: 2.6656 memory: 122.04GiB(87.57%) tps: 10,313 tflops: 491.53 mfu: 49.70% global_avg_ntp_loss: 0.7436 global_avg_top_loss: 1.9220 +[titan] 2025-09-09 21:06:41,504 - root - INFO - lr: 5.6896e-06 gnorm: 0.38 [2 days, 3:31:12<21:40:29] +[titan] 2025-09-09 21:07:13,681 - root - INFO - step: 28160 loss: 2.5493 memory: 122.04GiB(87.57%) tps: 10,184 tflops: 485.35 mfu: 49.08% global_avg_ntp_loss: 0.6903 global_avg_top_loss: 1.8590 +[titan] 2025-09-09 21:07:13,681 - root - INFO - lr: 5.6867e-06 gnorm: 0.36 [2 days, 3:31:44<21:39:56] +[titan] 2025-09-09 21:07:13,988 - root - INFO - Dumping profiler traces at step 28160 +[titan] 2025-09-09 21:07:14,059 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 21:07:45,964 - root - INFO - step: 28165 loss: 2.7377 memory: 122.04GiB(87.57%) tps: 10,151 tflops: 483.77 mfu: 48.91% global_avg_ntp_loss: 0.7750 global_avg_top_loss: 1.9627 +[titan] 2025-09-09 21:07:45,964 - root - INFO - lr: 5.6838e-06 gnorm: 0.39 [2 days, 3:32:17<21:39:23] +[titan] 2025-09-09 21:08:17,852 - root - INFO - step: 28170 loss: 2.7077 memory: 122.04GiB(87.57%) tps: 10,276 tflops: 489.75 mfu: 49.52% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9472 +[titan] 2025-09-09 21:08:17,853 - root - INFO - lr: 5.6809e-06 gnorm: 0.37 [2 days, 3:32:48<21:38:49] +[titan] 2025-09-09 21:08:49,947 - root - INFO - step: 28175 loss: 2.6875 memory: 122.04GiB(87.57%) tps: 10,210 tflops: 486.61 mfu: 49.20% global_avg_ntp_loss: 0.7510 global_avg_top_loss: 1.9366 +[titan] 2025-09-09 21:08:49,947 - root - INFO - lr: 5.6780e-06 gnorm: 0.37 [2 days, 3:33:21<21:38:16] +[titan] 2025-09-09 
21:09:21,907 - root - INFO - step: 28180 loss: 2.7553 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7859 global_avg_top_loss: 1.9693 +[titan] 2025-09-09 21:09:21,908 - root - INFO - lr: 5.6752e-06 gnorm: 0.37 [2 days, 3:33:53<21:37:43] +[titan] 2025-09-09 21:09:53,779 - root - INFO - step: 28185 loss: 2.7911 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 490.00 mfu: 49.55% global_avg_ntp_loss: 0.8002 global_avg_top_loss: 1.9909 +[titan] 2025-09-09 21:09:53,780 - root - INFO - lr: 5.6723e-06 gnorm: 0.41 [2 days, 3:34:24<21:37:09] +[titan] 2025-09-09 21:10:25,646 - root - INFO - step: 28190 loss: 2.6504 memory: 122.04GiB(87.57%) tps: 10,283 tflops: 490.08 mfu: 49.55% global_avg_ntp_loss: 0.7356 global_avg_top_loss: 1.9148 +[titan] 2025-09-09 21:10:25,647 - root - INFO - lr: 5.6694e-06 gnorm: 0.37 [2 days, 3:34:56<21:36:36] +[titan] 2025-09-09 21:10:57,553 - root - INFO - step: 28195 loss: 2.6239 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.9017 +[titan] 2025-09-09 21:10:57,553 - root - INFO - lr: 5.6665e-06 gnorm: 0.40 [2 days, 3:35:28<21:36:02] +[titan] 2025-09-09 21:11:23,102 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:11:29,443 - root - INFO - step: 28200 loss: 2.8591 memory: 122.04GiB(87.57%) tps: 10,276 tflops: 489.72 mfu: 49.52% global_avg_ntp_loss: 0.8402 global_avg_top_loss: 2.0190 +[titan] 2025-09-09 21:11:29,443 - root - INFO - lr: 5.6637e-06 gnorm: 0.38 [2 days, 3:36:00<21:35:29] +[titan] 2025-09-09 21:12:01,378 - root - INFO - step: 28205 loss: 2.6971 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9371 +[titan] 2025-09-09 21:12:01,379 - root - INFO - lr: 5.6608e-06 gnorm: 0.39 [2 days, 3:36:32<21:34:56] +[titan] 2025-09-09 21:12:33,383 - root - INFO - step: 28210 loss: 2.6892 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.98 mfu: 49.34% global_avg_ntp_loss: 0.7563 global_avg_top_loss: 1.9330 +[titan] 2025-09-09 21:12:33,383 - root - INFO - lr: 5.6579e-06 gnorm: 0.38 [2 days, 3:37:04<21:34:22] +[titan] 2025-09-09 21:13:05,227 - root - INFO - step: 28215 loss: 2.6069 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7227 global_avg_top_loss: 1.8842 +[titan] 2025-09-09 21:13:05,227 - root - INFO - lr: 5.6550e-06 gnorm: 0.39 [2 days, 3:37:36<21:33:49] +[titan] 2025-09-09 21:13:37,316 - root - INFO - step: 28220 loss: 2.7630 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.7858 global_avg_top_loss: 1.9773 +[titan] 2025-09-09 21:13:37,316 - root - INFO - lr: 5.6522e-06 gnorm: 0.38 [2 days, 3:38:08<21:33:16] +[titan] 2025-09-09 21:14:09,293 - root - INFO - step: 28225 loss: 2.7865 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.38 mfu: 49.38% global_avg_ntp_loss: 0.8006 global_avg_top_loss: 1.9858 +[titan] 2025-09-09 21:14:09,294 - root - INFO - lr: 5.6493e-06 gnorm: 0.38 [2 days, 3:38:40<21:32:42] +[titan] 2025-09-09 21:14:41,201 - root - INFO - step: 28230 loss: 2.6685 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.45 mfu: 49.49% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9270 +[titan] 
2025-09-09 21:14:41,202 - root - INFO - lr: 5.6464e-06 gnorm: 0.37 [2 days, 3:39:12<21:32:09] +[titan] 2025-09-09 21:15:13,120 - root - INFO - step: 28235 loss: 2.7414 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7739 global_avg_top_loss: 1.9676 +[titan] 2025-09-09 21:15:13,120 - root - INFO - lr: 5.6436e-06 gnorm: 0.37 [2 days, 3:39:44<21:31:36] +[titan] 2025-09-09 21:15:45,140 - root - INFO - step: 28240 loss: 2.7126 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.7607 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 21:15:45,140 - root - INFO - lr: 5.6407e-06 gnorm: 0.37 [2 days, 3:40:16<21:31:02] +[titan] 2025-09-09 21:16:17,076 - root - INFO - step: 28245 loss: 2.7356 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.02 mfu: 49.45% global_avg_ntp_loss: 0.7783 global_avg_top_loss: 1.9573 +[titan] 2025-09-09 21:16:17,076 - root - INFO - lr: 5.6378e-06 gnorm: 0.37 [2 days, 3:40:48<21:30:29] +[titan] 2025-09-09 21:16:42,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:16:49,094 - root - INFO - step: 28250 loss: 2.6597 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.77 mfu: 49.32% global_avg_ntp_loss: 0.7406 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 21:16:49,095 - root - INFO - lr: 5.6350e-06 gnorm: 0.38 [2 days, 3:41:20<21:29:56] +[titan] 2025-09-09 21:17:21,154 - root - INFO - step: 28255 loss: 2.8276 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.14 mfu: 49.26% global_avg_ntp_loss: 0.8282 global_avg_top_loss: 1.9994 +[titan] 2025-09-09 21:17:21,154 - root - INFO - lr: 5.6321e-06 gnorm: 0.39 [2 days, 3:41:52<21:29:22] +[titan] 2025-09-09 21:17:52,902 - root - INFO - step: 28260 loss: 2.6914 memory: 122.04GiB(87.57%) tps: 10,322 tflops: 491.92 mfu: 49.74% global_avg_ntp_loss: 0.7553 global_avg_top_loss: 1.9361 +[titan] 2025-09-09 21:17:52,902 - root - INFO - lr: 5.6292e-06 gnorm: 0.38 [2 days, 3:42:23<21:28:49] +[titan] 2025-09-09 21:18:24,838 - root - INFO - step: 28265 loss: 2.8613 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.01 mfu: 49.45% global_avg_ntp_loss: 0.8460 global_avg_top_loss: 2.0153 +[titan] 2025-09-09 21:18:24,839 - root - INFO - lr: 5.6264e-06 gnorm: 0.38 [2 days, 3:42:55<21:28:16] +[titan] 2025-09-09 21:18:56,742 - root - INFO - step: 28270 loss: 2.5910 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7136 global_avg_top_loss: 1.8774 +[titan] 2025-09-09 21:18:56,742 - root - INFO - lr: 5.6235e-06 gnorm: 0.37 [2 days, 3:43:27<21:27:42] +[titan] 2025-09-09 21:19:28,821 - root - INFO - step: 28275 loss: 2.6543 memory: 122.04GiB(87.57%) tps: 10,215 tflops: 486.83 mfu: 49.22% global_avg_ntp_loss: 0.7392 global_avg_top_loss: 1.9151 +[titan] 2025-09-09 21:19:28,822 - root - INFO - lr: 5.6206e-06 gnorm: 0.38 [2 days, 3:43:59<21:27:09] +[titan] 2025-09-09 21:20:00,561 - root - INFO - step: 28280 loss: 3.3227 memory: 122.04GiB(87.57%) tps: 10,324 tflops: 492.04 mfu: 49.75% global_avg_ntp_loss: 1.1079 global_avg_top_loss: 2.2148 +[titan] 
2025-09-09 21:20:00,562 - root - INFO - lr: 5.6178e-06 gnorm: 0.40 [2 days, 3:44:31<21:26:36] +[titan] 2025-09-09 21:20:32,501 - root - INFO - step: 28285 loss: 2.7512 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 488.97 mfu: 49.44% global_avg_ntp_loss: 0.7806 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 21:20:32,501 - root - INFO - lr: 5.6149e-06 gnorm: 0.37 [2 days, 3:45:03<21:26:02] +[titan] 2025-09-09 21:21:04,572 - root - INFO - step: 28290 loss: 2.5216 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 486.96 mfu: 49.24% global_avg_ntp_loss: 0.6776 global_avg_top_loss: 1.8440 +[titan] 2025-09-09 21:21:04,573 - root - INFO - lr: 5.6120e-06 gnorm: 0.36 [2 days, 3:45:35<21:25:29] +[titan] 2025-09-09 21:21:36,332 - root - INFO - step: 28295 loss: 2.6873 memory: 122.04GiB(87.57%) tps: 10,318 tflops: 491.74 mfu: 49.72% global_avg_ntp_loss: 0.7580 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 21:21:36,332 - root - INFO - lr: 5.6092e-06 gnorm: 0.40 [2 days, 3:46:07<21:24:55] +[titan] 2025-09-09 21:22:02,062 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:22:08,496 - root - INFO - step: 28300 loss: 2.6765 memory: 122.04GiB(87.57%) tps: 10,188 tflops: 485.56 mfu: 49.10% global_avg_ntp_loss: 0.7471 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 21:22:08,496 - root - INFO - lr: 5.6063e-06 gnorm: 0.37 [2 days, 3:46:39<21:24:22] +[titan] 2025-09-09 21:22:40,484 - root - INFO - step: 28305 loss: 2.7023 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.22 mfu: 49.37% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9433 +[titan] 2025-09-09 21:22:40,485 - root - INFO - lr: 5.6035e-06 gnorm: 0.38 [2 days, 3:47:11<21:23:49] +[titan] 2025-09-09 21:23:12,475 - root - INFO - step: 28310 loss: 2.8013 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.8024 global_avg_top_loss: 1.9988 +[titan] 2025-09-09 21:23:12,476 - root - INFO - lr: 5.6006e-06 gnorm: 0.39 [2 days, 3:47:43<21:23:16] +[titan] 2025-09-09 21:23:44,550 - root - INFO - step: 28315 loss: 2.7410 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.91 mfu: 49.23% global_avg_ntp_loss: 0.7780 global_avg_top_loss: 1.9630 +[titan] 2025-09-09 21:23:44,550 - root - INFO - lr: 5.5978e-06 gnorm: 0.38 [2 days, 3:48:15<21:22:42] +[titan] 2025-09-09 21:24:16,615 - root - INFO - step: 28320 loss: 2.6325 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7306 global_avg_top_loss: 1.9019 +[titan] 2025-09-09 21:24:16,615 - root - INFO - lr: 5.5949e-06 gnorm: 0.36 [2 days, 3:48:47<21:22:09] +[titan] 2025-09-09 21:24:48,575 - root - INFO - step: 28325 loss: 2.7379 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.65 mfu: 49.41% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9644 +[titan] 2025-09-09 21:24:48,575 - root - INFO - lr: 5.5921e-06 gnorm: 0.38 [2 days, 3:49:19<21:21:36] +[titan] 2025-09-09 21:25:20,513 - root - INFO - step: 28330 loss: 2.7198 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 488.99 mfu: 49.44% global_avg_ntp_loss: 0.7656 global_avg_top_loss: 1.9542 +[titan] 
2025-09-09 21:25:20,514 - root - INFO - lr: 5.5892e-06 gnorm: 0.37 [2 days, 3:49:51<21:21:02] +[titan] 2025-09-09 21:25:52,438 - root - INFO - step: 28335 loss: 2.7184 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.20 mfu: 49.46% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9519 +[titan] 2025-09-09 21:25:52,438 - root - INFO - lr: 5.5863e-06 gnorm: 0.37 [2 days, 3:50:23<21:20:29] +[titan] 2025-09-09 21:26:24,421 - root - INFO - step: 28340 loss: 2.7392 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.30 mfu: 49.37% global_avg_ntp_loss: 0.7835 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 21:26:24,422 - root - INFO - lr: 5.5835e-06 gnorm: 0.39 [2 days, 3:50:55<21:19:56] +[titan] 2025-09-09 21:26:56,483 - root - INFO - step: 28345 loss: 2.7355 memory: 122.04GiB(87.57%) tps: 10,221 tflops: 487.10 mfu: 49.25% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 21:26:56,483 - root - INFO - lr: 5.5806e-06 gnorm: 0.41 [2 days, 3:51:27<21:19:22] +[titan] 2025-09-09 21:27:21,809 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:27:28,258 - root - INFO - step: 28350 loss: 2.6791 memory: 122.04GiB(87.57%) tps: 10,313 tflops: 491.51 mfu: 49.70% global_avg_ntp_loss: 0.7472 global_avg_top_loss: 1.9319 +[titan] 2025-09-09 21:27:28,258 - root - INFO - lr: 5.5778e-06 gnorm: 0.38 [2 days, 3:51:59<21:18:49] +[titan] 2025-09-09 21:28:00,207 - root - INFO - step: 28355 loss: 2.7238 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7695 global_avg_top_loss: 1.9543 +[titan] 2025-09-09 21:28:00,208 - root - INFO - lr: 5.5749e-06 gnorm: 0.38 [2 days, 3:52:31<21:18:16] +[titan] 2025-09-09 21:28:32,171 - root - INFO - step: 28360 loss: 3.3545 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.60 mfu: 49.40% global_avg_ntp_loss: 1.1257 global_avg_top_loss: 2.2288 +[titan] 2025-09-09 21:28:32,171 - root - INFO - lr: 5.5721e-06 gnorm: 0.37 [2 days, 3:53:03<21:17:42] +[titan] 2025-09-09 21:29:04,364 - root - INFO - step: 28365 loss: 2.6643 memory: 122.04GiB(87.57%) tps: 10,179 tflops: 485.11 mfu: 49.05% global_avg_ntp_loss: 0.7408 global_avg_top_loss: 1.9235 +[titan] 2025-09-09 21:29:04,365 - root - INFO - lr: 5.5693e-06 gnorm: 0.37 [2 days, 3:53:35<21:17:09] +[titan] 2025-09-09 21:29:36,264 - root - INFO - step: 28370 loss: 2.7348 memory: 122.04GiB(87.57%) tps: 10,273 tflops: 489.59 mfu: 49.50% global_avg_ntp_loss: 0.7726 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 21:29:36,264 - root - INFO - lr: 5.5664e-06 gnorm: 0.44 [2 days, 3:54:07<21:16:36] +[titan] 2025-09-09 21:30:08,276 - root - INFO - step: 28375 loss: 2.5622 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7009 global_avg_top_loss: 1.8613 +[titan] 2025-09-09 21:30:08,277 - root - INFO - lr: 5.5636e-06 gnorm: 0.44 [2 days, 3:54:39<21:16:02] +[titan] 2025-09-09 21:30:40,408 - root - INFO - step: 28380 loss: 2.7838 memory: 122.04GiB(87.57%) tps: 10,198 tflops: 486.05 mfu: 49.15% global_avg_ntp_loss: 0.8067 global_avg_top_loss: 1.9770 +[titan] 
2025-09-09 21:30:40,408 - root - INFO - lr: 5.5607e-06 gnorm: 0.40 [2 days, 3:55:11<21:15:29] +[titan] 2025-09-09 21:31:12,132 - root - INFO - step: 28385 loss: 2.6302 memory: 122.04GiB(87.57%) tps: 10,329 tflops: 492.28 mfu: 49.78% global_avg_ntp_loss: 0.7225 global_avg_top_loss: 1.9078 +[titan] 2025-09-09 21:31:12,133 - root - INFO - lr: 5.5579e-06 gnorm: 0.42 [2 days, 3:55:43<21:14:56] +[titan] 2025-09-09 21:31:44,056 - root - INFO - step: 28390 loss: 2.8550 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 0.8501 global_avg_top_loss: 2.0049 +[titan] 2025-09-09 21:31:44,056 - root - INFO - lr: 5.5550e-06 gnorm: 0.39 [2 days, 3:56:15<21:14:22] +[titan] 2025-09-09 21:32:15,880 - root - INFO - step: 28395 loss: 2.6919 memory: 122.04GiB(87.57%) tps: 10,297 tflops: 490.75 mfu: 49.62% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9360 +[titan] 2025-09-09 21:32:15,880 - root - INFO - lr: 5.5522e-06 gnorm: 0.43 [2 days, 3:56:46<21:13:49] +[titan] 2025-09-09 21:32:41,421 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:32:47,854 - root - INFO - step: 28400 loss: 2.5807 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.43 mfu: 49.39% global_avg_ntp_loss: 0.7034 global_avg_top_loss: 1.8773 +[titan] 2025-09-09 21:32:47,855 - root - INFO - lr: 5.5493e-06 gnorm: 0.42 [2 days, 3:57:18<21:13:16] +[titan] 2025-09-09 21:33:19,784 - root - INFO - step: 28405 loss: 2.7164 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9526 +[titan] 2025-09-09 21:33:19,785 - root - INFO - lr: 5.5465e-06 gnorm: 0.37 [2 days, 3:57:50<21:12:42] +[titan] 2025-09-09 21:33:51,752 - root - INFO - step: 28410 loss: 3.0325 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.54 mfu: 49.40% global_avg_ntp_loss: 0.9399 global_avg_top_loss: 2.0926 +[titan] 2025-09-09 21:33:51,753 - root - INFO - lr: 5.5437e-06 gnorm: 0.40 [2 days, 3:58:22<21:12:09] +[titan] 2025-09-09 21:34:23,754 - root - INFO - step: 28415 loss: 2.7711 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.34% global_avg_ntp_loss: 0.7887 global_avg_top_loss: 1.9824 +[titan] 2025-09-09 21:34:23,755 - root - INFO - lr: 5.5408e-06 gnorm: 0.38 [2 days, 3:58:54<21:11:36] +[titan] 2025-09-09 21:34:55,955 - root - INFO - step: 28420 loss: 2.7472 memory: 122.04GiB(87.57%) tps: 10,176 tflops: 485.00 mfu: 49.04% global_avg_ntp_loss: 0.7821 global_avg_top_loss: 1.9651 +[titan] 2025-09-09 21:34:55,956 - root - INFO - lr: 5.5380e-06 gnorm: 0.37 [2 days, 3:59:27<21:11:02] +[titan] 2025-09-09 21:35:27,770 - root - INFO - step: 28425 loss: 2.8168 memory: 122.04GiB(87.57%) tps: 10,300 tflops: 490.90 mfu: 49.64% global_avg_ntp_loss: 0.8165 global_avg_top_loss: 2.0003 +[titan] 2025-09-09 21:35:27,770 - root - INFO - lr: 5.5352e-06 gnorm: 0.39 [2 days, 3:59:58<21:10:29] +[titan] 2025-09-09 21:35:59,676 - root - INFO - step: 28430 loss: 2.8166 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.47 mfu: 49.49% global_avg_ntp_loss: 0.8164 global_avg_top_loss: 2.0003 +[titan] 
2025-09-09 21:35:59,677 - root - INFO - lr: 5.5323e-06 gnorm: 0.38 [2 days, 4:00:30<21:09:56] +[titan] 2025-09-09 21:36:31,636 - root - INFO - step: 28435 loss: 2.6888 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.67 mfu: 49.41% global_avg_ntp_loss: 0.7517 global_avg_top_loss: 1.9371 +[titan] 2025-09-09 21:36:31,636 - root - INFO - lr: 5.5295e-06 gnorm: 0.37 [2 days, 4:01:02<21:09:22] +[titan] 2025-09-09 21:37:03,607 - root - INFO - step: 28440 loss: 3.1246 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 1.0049 global_avg_top_loss: 2.1197 +[titan] 2025-09-09 21:37:03,607 - root - INFO - lr: 5.5266e-06 gnorm: 0.42 [2 days, 4:01:34<21:08:49] +[titan] 2025-09-09 21:37:35,461 - root - INFO - step: 28445 loss: 2.7246 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.28 mfu: 49.57% global_avg_ntp_loss: 0.7677 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 21:37:35,461 - root - INFO - lr: 5.5238e-06 gnorm: 0.38 [2 days, 4:02:06<21:08:16] +[titan] 2025-09-09 21:38:01,194 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:38:07,556 - root - INFO - step: 28450 loss: 2.9222 memory: 122.04GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 0.8822 global_avg_top_loss: 2.0400 +[titan] 2025-09-09 21:38:07,556 - root - INFO - lr: 5.5210e-06 gnorm: 0.39 [2 days, 4:02:38<21:07:42] +[titan] 2025-09-09 21:38:39,331 - root - INFO - step: 28455 loss: 2.6796 memory: 122.04GiB(87.57%) tps: 10,313 tflops: 491.51 mfu: 49.70% global_avg_ntp_loss: 0.7506 global_avg_top_loss: 1.9290 +[titan] 2025-09-09 21:38:39,331 - root - INFO - lr: 5.5182e-06 gnorm: 0.43 [2 days, 4:03:10<21:07:09] +[titan] 2025-09-09 21:39:11,176 - root - INFO - step: 28460 loss: 2.7121 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.42 mfu: 49.59% global_avg_ntp_loss: 0.7631 global_avg_top_loss: 1.9491 +[titan] 2025-09-09 21:39:11,176 - root - INFO - lr: 5.5153e-06 gnorm: 0.40 [2 days, 4:03:42<21:06:36] +[titan] 2025-09-09 21:39:43,200 - root - INFO - step: 28465 loss: 2.6685 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.68 mfu: 49.31% global_avg_ntp_loss: 0.7458 global_avg_top_loss: 1.9226 +[titan] 2025-09-09 21:39:43,200 - root - INFO - lr: 5.5125e-06 gnorm: 0.37 [2 days, 4:04:14<21:06:02] +[titan] 2025-09-09 21:40:15,359 - root - INFO - step: 28470 loss: 2.7464 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.63 mfu: 49.10% global_avg_ntp_loss: 0.7767 global_avg_top_loss: 1.9698 +[titan] 2025-09-09 21:40:15,359 - root - INFO - lr: 5.5097e-06 gnorm: 0.38 [2 days, 4:04:46<21:05:29] +[titan] 2025-09-09 21:40:47,551 - root - INFO - step: 28475 loss: 2.6765 memory: 122.04GiB(87.57%) tps: 10,179 tflops: 485.13 mfu: 49.05% global_avg_ntp_loss: 0.7455 global_avg_top_loss: 1.9310 +[titan] 2025-09-09 21:40:47,552 - root - INFO - lr: 5.5068e-06 gnorm: 0.38 [2 days, 4:05:18<21:04:56] +[titan] 2025-09-09 21:41:19,545 - root - INFO - step: 28480 loss: 2.7187 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9511 +[titan] 
2025-09-09 21:41:19,546 - root - INFO - lr: 5.5040e-06 gnorm: 0.38 [2 days, 4:05:50<21:04:23] +[titan] 2025-09-09 21:41:51,389 - root - INFO - step: 28485 loss: 2.6657 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.44 mfu: 49.59% global_avg_ntp_loss: 0.7411 global_avg_top_loss: 1.9245 +[titan] 2025-09-09 21:41:51,389 - root - INFO - lr: 5.5012e-06 gnorm: 0.37 [2 days, 4:06:22<21:03:49] +[titan] 2025-09-09 21:42:23,178 - root - INFO - step: 28490 loss: 3.2187 memory: 122.04GiB(87.57%) tps: 10,308 tflops: 491.28 mfu: 49.67% global_avg_ntp_loss: 1.0470 global_avg_top_loss: 2.1716 +[titan] 2025-09-09 21:42:23,179 - root - INFO - lr: 5.4984e-06 gnorm: 0.38 [2 days, 4:06:54<21:03:16] +[titan] 2025-09-09 21:42:55,118 - root - INFO - step: 28495 loss: 2.6664 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9248 +[titan] 2025-09-09 21:42:55,119 - root - INFO - lr: 5.4955e-06 gnorm: 0.37 [2 days, 4:07:26<21:02:43] +[titan] 2025-09-09 21:43:20,653 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:43:27,078 - root - INFO - step: 28500 loss: 2.7225 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9571 +[titan] 2025-09-09 21:43:27,078 - root - INFO - lr: 5.4927e-06 gnorm: 0.38 [2 days, 4:07:58<21:02:09] +[titan] 2025-09-09 21:43:59,034 - root - INFO - step: 28505 loss: 2.6893 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 21:43:59,034 - root - INFO - lr: 5.4899e-06 gnorm: 0.38 [2 days, 4:08:30<21:01:36] +[titan] 2025-09-09 21:44:30,839 - root - INFO - step: 28510 loss: 2.7577 memory: 122.04GiB(87.57%) tps: 10,303 tflops: 491.03 mfu: 49.65% global_avg_ntp_loss: 0.7871 global_avg_top_loss: 1.9706 +[titan] 2025-09-09 21:44:30,840 - root - INFO - lr: 5.4871e-06 gnorm: 0.38 [2 days, 4:09:01<21:01:03] +[titan] 2025-09-09 21:45:03,153 - root - INFO - step: 28515 loss: 2.6288 memory: 122.04GiB(87.57%) tps: 10,141 tflops: 483.30 mfu: 48.87% global_avg_ntp_loss: 0.7264 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 21:45:03,154 - root - INFO - lr: 5.4842e-06 gnorm: 0.39 [2 days, 4:09:34<21:00:29] +[titan] 2025-09-09 21:45:35,081 - root - INFO - step: 28520 loss: 3.2211 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.15 mfu: 49.46% global_avg_ntp_loss: 1.0502 global_avg_top_loss: 2.1709 +[titan] 2025-09-09 21:45:35,081 - root - INFO - lr: 5.4814e-06 gnorm: 0.39 [2 days, 4:10:06<20:59:56] +[titan] 2025-09-09 21:46:06,973 - root - INFO - step: 28525 loss: 2.8803 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.69 mfu: 49.51% global_avg_ntp_loss: 0.8427 global_avg_top_loss: 2.0376 +[titan] 2025-09-09 21:46:06,974 - root - INFO - lr: 5.4786e-06 gnorm: 0.39 [2 days, 4:10:38<20:59:23] +[titan] 2025-09-09 21:46:38,919 - root - INFO - step: 28530 loss: 2.5020 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.6680 global_avg_top_loss: 1.8340 +[titan] 
2025-09-09 21:46:38,920 - root - INFO - lr: 5.4758e-06 gnorm: 0.36 [2 days, 4:11:09<20:58:49] +[titan] 2025-09-09 21:47:10,824 - root - INFO - step: 28535 loss: 3.1806 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.51 mfu: 49.50% global_avg_ntp_loss: 1.0301 global_avg_top_loss: 2.1505 +[titan] 2025-09-09 21:47:10,824 - root - INFO - lr: 5.4730e-06 gnorm: 0.38 [2 days, 4:11:41<20:58:16] +[titan] 2025-09-09 21:47:42,767 - root - INFO - step: 28540 loss: 2.7551 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.91 mfu: 49.44% global_avg_ntp_loss: 0.7848 global_avg_top_loss: 1.9703 +[titan] 2025-09-09 21:47:42,767 - root - INFO - lr: 5.4701e-06 gnorm: 0.38 [2 days, 4:12:13<20:57:43] +[titan] 2025-09-09 21:48:14,746 - root - INFO - step: 28545 loss: 2.6349 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.37 mfu: 49.38% global_avg_ntp_loss: 0.7264 global_avg_top_loss: 1.9085 +[titan] 2025-09-09 21:48:14,746 - root - INFO - lr: 5.4673e-06 gnorm: 0.38 [2 days, 4:12:45<20:57:09] +[titan] 2025-09-09 21:48:40,188 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:48:46,628 - root - INFO - step: 28550 loss: 2.7167 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.85 mfu: 49.53% global_avg_ntp_loss: 0.7678 global_avg_top_loss: 1.9489 +[titan] 2025-09-09 21:48:46,628 - root - INFO - lr: 5.4645e-06 gnorm: 0.39 [2 days, 4:13:17<20:56:36] +[titan] 2025-09-09 21:49:18,585 - root - INFO - step: 28555 loss: 2.6226 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7200 global_avg_top_loss: 1.9026 +[titan] 2025-09-09 21:49:18,586 - root - INFO - lr: 5.4617e-06 gnorm: 0.39 [2 days, 4:13:49<20:56:03] +[titan] 2025-09-09 21:49:50,577 - root - INFO - step: 28560 loss: 2.8661 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.17 mfu: 49.36% global_avg_ntp_loss: 0.8341 global_avg_top_loss: 2.0321 +[titan] 2025-09-09 21:49:50,578 - root - INFO - lr: 5.4589e-06 gnorm: 0.41 [2 days, 4:14:21<20:55:29] +[titan] 2025-09-09 21:50:22,525 - root - INFO - step: 28565 loss: 2.5915 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.84 mfu: 49.43% global_avg_ntp_loss: 0.7055 global_avg_top_loss: 1.8859 +[titan] 2025-09-09 21:50:22,526 - root - INFO - lr: 5.4561e-06 gnorm: 0.39 [2 days, 4:14:53<20:54:56] +[titan] 2025-09-09 21:50:54,501 - root - INFO - step: 28570 loss: 3.1971 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 1.0341 global_avg_top_loss: 2.1630 +[titan] 2025-09-09 21:50:54,502 - root - INFO - lr: 5.4533e-06 gnorm: 0.51 [2 days, 4:15:25<20:54:23] +[titan] 2025-09-09 21:51:26,697 - root - INFO - step: 28575 loss: 2.6941 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9385 +[titan] 2025-09-09 21:51:26,697 - root - INFO - lr: 5.4504e-06 gnorm: 0.38 [2 days, 4:15:57<20:53:50] +[titan] 2025-09-09 21:51:58,672 - root - INFO - step: 28580 loss: 2.6588 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.42 mfu: 49.39% global_avg_ntp_loss: 0.7415 global_avg_top_loss: 1.9174 +[titan] 
2025-09-09 21:51:58,673 - root - INFO - lr: 5.4476e-06 gnorm: 0.40 [2 days, 4:16:29<20:53:16] +[titan] 2025-09-09 21:52:30,508 - root - INFO - step: 28585 loss: 3.1974 memory: 122.04GiB(87.57%) tps: 10,293 tflops: 490.56 mfu: 49.60% global_avg_ntp_loss: 1.0373 global_avg_top_loss: 2.1600 +[titan] 2025-09-09 21:52:30,509 - root - INFO - lr: 5.4448e-06 gnorm: 0.39 [2 days, 4:17:01<20:52:43] +[titan] 2025-09-09 21:53:02,463 - root - INFO - step: 28590 loss: 2.7603 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7827 global_avg_top_loss: 1.9776 +[titan] 2025-09-09 21:53:02,464 - root - INFO - lr: 5.4420e-06 gnorm: 0.40 [2 days, 4:17:33<20:52:10] +[titan] 2025-09-09 21:53:34,353 - root - INFO - step: 28595 loss: 2.7405 memory: 122.04GiB(87.57%) tps: 10,276 tflops: 489.73 mfu: 49.52% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 21:53:34,354 - root - INFO - lr: 5.4392e-06 gnorm: 0.40 [2 days, 4:18:05<20:51:36] +[titan] 2025-09-09 21:54:00,249 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:54:06,571 - root - INFO - step: 28600 loss: 3.0858 memory: 122.04GiB(87.57%) tps: 10,171 tflops: 484.75 mfu: 49.01% global_avg_ntp_loss: 0.9849 global_avg_top_loss: 2.1009 +[titan] 2025-09-09 21:54:06,571 - root - INFO - lr: 5.4364e-06 gnorm: 0.38 [2 days, 4:18:37<20:51:03] +[titan] 2025-09-09 21:54:38,498 - root - INFO - step: 28605 loss: 2.6908 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.16 mfu: 49.46% global_avg_ntp_loss: 0.7542 global_avg_top_loss: 1.9367 +[titan] 2025-09-09 21:54:38,498 - root - INFO - lr: 5.4336e-06 gnorm: 0.38 [2 days, 4:19:09<20:50:30] +[titan] 2025-09-09 21:55:10,493 - root - INFO - step: 28610 loss: 2.7055 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.7618 global_avg_top_loss: 1.9437 +[titan] 2025-09-09 21:55:10,493 - root - INFO - lr: 5.4308e-06 gnorm: 0.38 [2 days, 4:19:41<20:49:57] +[titan] 2025-09-09 21:55:42,512 - root - INFO - step: 28615 loss: 3.1941 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 1.0331 global_avg_top_loss: 2.1610 +[titan] 2025-09-09 21:55:42,512 - root - INFO - lr: 5.4280e-06 gnorm: 0.40 [2 days, 4:20:13<20:49:23] +[titan] 2025-09-09 21:56:14,457 - root - INFO - step: 28620 loss: 2.7257 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.89 mfu: 49.43% global_avg_ntp_loss: 0.7699 global_avg_top_loss: 1.9557 +[titan] 2025-09-09 21:56:14,457 - root - INFO - lr: 5.4252e-06 gnorm: 0.37 [2 days, 4:20:45<20:48:50] +[titan] 2025-09-09 21:56:46,459 - root - INFO - step: 28625 loss: 2.6829 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 0.7492 global_avg_top_loss: 1.9337 +[titan] 2025-09-09 21:56:46,459 - root - INFO - lr: 5.4224e-06 gnorm: 0.38 [2 days, 4:21:17<20:48:17] +[titan] 2025-09-09 21:57:18,580 - root - INFO - step: 28630 loss: 2.6641 memory: 122.04GiB(87.57%) tps: 10,202 tflops: 486.21 mfu: 49.16% global_avg_ntp_loss: 0.7534 global_avg_top_loss: 1.9106 +[titan] 
2025-09-09 21:57:18,580 - root - INFO - lr: 5.4196e-06 gnorm: 0.41 [2 days, 4:21:49<20:47:43] +[titan] 2025-09-09 21:57:50,515 - root - INFO - step: 28635 loss: 2.6440 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.03 mfu: 49.45% global_avg_ntp_loss: 0.7312 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 21:57:50,516 - root - INFO - lr: 5.4168e-06 gnorm: 0.37 [2 days, 4:22:21<20:47:10] +[titan] 2025-09-09 21:58:22,561 - root - INFO - step: 28640 loss: 2.6476 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.35 mfu: 49.28% global_avg_ntp_loss: 0.7352 global_avg_top_loss: 1.9124 +[titan] 2025-09-09 21:58:22,561 - root - INFO - lr: 5.4140e-06 gnorm: 0.38 [2 days, 4:22:53<20:46:37] +[titan] 2025-09-09 21:58:54,368 - root - INFO - step: 28645 loss: 2.7637 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7900 global_avg_top_loss: 1.9737 +[titan] 2025-09-09 21:58:54,369 - root - INFO - lr: 5.4112e-06 gnorm: 0.42 [2 days, 4:23:25<20:46:03] +[titan] 2025-09-09 21:59:20,073 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 21:59:26,572 - root - INFO - step: 28650 loss: 3.1641 memory: 122.04GiB(87.57%) tps: 10,175 tflops: 484.95 mfu: 49.03% global_avg_ntp_loss: 1.0333 global_avg_top_loss: 2.1307 +[titan] 2025-09-09 21:59:26,573 - root - INFO - lr: 5.4084e-06 gnorm: 0.39 [2 days, 4:23:57<20:45:30] +[titan] 2025-09-09 21:59:58,660 - root - INFO - step: 28655 loss: 2.7249 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.72 mfu: 49.21% global_avg_ntp_loss: 0.7668 global_avg_top_loss: 1.9581 +[titan] 2025-09-09 21:59:58,660 - root - INFO - lr: 5.4056e-06 gnorm: 0.39 [2 days, 4:24:29<20:44:57] +[titan] 2025-09-09 22:00:30,606 - root - INFO - step: 28660 loss: 2.5616 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.87 mfu: 49.43% global_avg_ntp_loss: 0.6933 global_avg_top_loss: 1.8683 +[titan] 2025-09-09 22:00:30,606 - root - INFO - lr: 5.4028e-06 gnorm: 0.39 [2 days, 4:25:01<20:44:24] +[titan] 2025-09-09 22:01:02,610 - root - INFO - step: 28665 loss: 3.1232 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 1.0025 global_avg_top_loss: 2.1206 +[titan] 2025-09-09 22:01:02,610 - root - INFO - lr: 5.4000e-06 gnorm: 0.38 [2 days, 4:25:33<20:43:50] +[titan] 2025-09-09 22:01:34,629 - root - INFO - step: 28670 loss: 2.6952 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9348 +[titan] 2025-09-09 22:01:34,629 - root - INFO - lr: 5.3972e-06 gnorm: 0.39 [2 days, 4:26:05<20:43:17] +[titan] 2025-09-09 22:01:47,793 - root - INFO - Dumping profiler traces at step 28672 +[titan] 2025-09-09 22:01:47,864 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 22:02:06,898 - root - INFO - step: 28675 loss: 2.6586 memory: 122.04GiB(87.57%) tps: 10,155 tflops: 483.98 mfu: 48.94% global_avg_ntp_loss: 0.7408 global_avg_top_loss: 1.9179 +[titan] 2025-09-09 22:02:06,899 - root - INFO - lr: 5.3944e-06 gnorm: 0.38 [2 days, 4:26:37<20:42:44] +[titan] 2025-09-09 
22:02:38,744 - root - INFO - step: 28680 loss: 2.6551 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.7363 global_avg_top_loss: 1.9188 +[titan] 2025-09-09 22:02:38,744 - root - INFO - lr: 5.3916e-06 gnorm: 0.36 [2 days, 4:27:09<20:42:11] +[titan] 2025-09-09 22:03:10,799 - root - INFO - step: 28685 loss: 2.7268 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.21 mfu: 49.26% global_avg_ntp_loss: 0.7700 global_avg_top_loss: 1.9569 +[titan] 2025-09-09 22:03:10,799 - root - INFO - lr: 5.3888e-06 gnorm: 0.40 [2 days, 4:27:41<20:41:37] +[titan] 2025-09-09 22:03:42,679 - root - INFO - step: 28690 loss: 2.6472 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.88 mfu: 49.53% global_avg_ntp_loss: 0.7358 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 22:03:42,679 - root - INFO - lr: 5.3860e-06 gnorm: 0.37 [2 days, 4:28:13<20:41:04] +[titan] 2025-09-09 22:04:14,853 - root - INFO - step: 28695 loss: 3.1900 memory: 122.04GiB(87.57%) tps: 10,185 tflops: 485.40 mfu: 49.08% global_avg_ntp_loss: 1.0345 global_avg_top_loss: 2.1555 +[titan] 2025-09-09 22:04:14,854 - root - INFO - lr: 5.3833e-06 gnorm: 0.37 [2 days, 4:28:45<20:40:31] +[titan] 2025-09-09 22:04:40,544 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:04:46,936 - root - INFO - step: 28700 loss: 2.7025 memory: 122.04GiB(87.57%) tps: 10,214 tflops: 486.78 mfu: 49.22% global_avg_ntp_loss: 0.7573 global_avg_top_loss: 1.9451 +[titan] 2025-09-09 22:04:46,937 - root - INFO - lr: 5.3805e-06 gnorm: 0.38 [2 days, 4:29:17<20:39:58] +[titan] 2025-09-09 22:05:19,119 - root - INFO - step: 28705 loss: 2.6819 memory: 122.04GiB(87.57%) tps: 10,182 tflops: 485.27 mfu: 49.07% global_avg_ntp_loss: 0.7505 global_avg_top_loss: 1.9314 +[titan] 2025-09-09 22:05:19,120 - root - INFO - lr: 5.3777e-06 gnorm: 0.38 [2 days, 4:29:50<20:39:24] +[titan] 2025-09-09 22:05:51,276 - root - INFO - step: 28710 loss: 2.6720 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.67 mfu: 49.11% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9272 +[titan] 2025-09-09 22:05:51,276 - root - INFO - lr: 5.3749e-06 gnorm: 0.40 [2 days, 4:30:22<20:38:51] +[titan] 2025-09-09 22:06:23,209 - root - INFO - step: 28715 loss: 3.1158 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.07 mfu: 49.45% global_avg_ntp_loss: 0.9988 global_avg_top_loss: 2.1171 +[titan] 2025-09-09 22:06:23,209 - root - INFO - lr: 5.3721e-06 gnorm: 0.39 [2 days, 4:30:54<20:38:18] +[titan] 2025-09-09 22:06:55,018 - root - INFO - step: 28720 loss: 2.6877 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 490.97 mfu: 49.64% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 22:06:55,018 - root - INFO - lr: 5.3693e-06 gnorm: 0.40 [2 days, 4:31:26<20:37:44] +[titan] 2025-09-09 22:07:27,132 - root - INFO - step: 28725 loss: 2.6639 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.31 mfu: 49.17% global_avg_ntp_loss: 0.7421 global_avg_top_loss: 1.9218 +[titan] 2025-09-09 22:07:27,133 - root - INFO - lr: 5.3665e-06 gnorm: 0.40 [2 days, 4:31:58<20:37:11] +[titan] 2025-09-09 22:07:58,984 - root - INFO - step: 28730 loss: 3.1616 memory: 122.04GiB(87.57%) tps: 10,288 tflops: 490.32 mfu: 49.58% global_avg_ntp_loss: 1.0217 global_avg_top_loss: 2.1399 +[titan] 
2025-09-09 22:07:58,985 - root - INFO - lr: 5.3637e-06 gnorm: 0.42 [2 days, 4:32:29<20:36:38] +[titan] 2025-09-09 22:08:31,198 - root - INFO - step: 28735 loss: 2.6441 memory: 122.04GiB(87.57%) tps: 10,172 tflops: 484.81 mfu: 49.02% global_avg_ntp_loss: 0.7313 global_avg_top_loss: 1.9128 +[titan] 2025-09-09 22:08:31,198 - root - INFO - lr: 5.3610e-06 gnorm: 0.38 [2 days, 4:33:02<20:36:05] +[titan] 2025-09-09 22:09:03,305 - root - INFO - step: 28740 loss: 2.7205 memory: 122.04GiB(87.57%) tps: 10,206 tflops: 486.42 mfu: 49.18% global_avg_ntp_loss: 0.7682 global_avg_top_loss: 1.9522 +[titan] 2025-09-09 22:09:03,305 - root - INFO - lr: 5.3582e-06 gnorm: 0.41 [2 days, 4:33:34<20:35:31] +[titan] 2025-09-09 22:09:35,325 - root - INFO - step: 28745 loss: 3.0771 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.74 mfu: 49.32% global_avg_ntp_loss: 0.9833 global_avg_top_loss: 2.0938 +[titan] 2025-09-09 22:09:35,325 - root - INFO - lr: 5.3554e-06 gnorm: 0.39 [2 days, 4:34:06<20:34:58] +[titan] 2025-09-09 22:10:01,068 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:10:07,441 - root - INFO - step: 28750 loss: 2.6752 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.28 mfu: 49.17% global_avg_ntp_loss: 0.7440 global_avg_top_loss: 1.9311 +[titan] 2025-09-09 22:10:07,441 - root - INFO - lr: 5.3526e-06 gnorm: 0.38 [2 days, 4:34:38<20:34:25] +[titan] 2025-09-09 22:10:39,495 - root - INFO - step: 28755 loss: 2.6750 memory: 122.04GiB(87.57%) tps: 10,223 tflops: 487.23 mfu: 49.26% global_avg_ntp_loss: 0.7475 global_avg_top_loss: 1.9275 +[titan] 2025-09-09 22:10:39,495 - root - INFO - lr: 5.3498e-06 gnorm: 0.37 [2 days, 4:35:10<20:33:52] +[titan] 2025-09-09 22:11:11,429 - root - INFO - step: 28760 loss: 2.7428 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.05 mfu: 49.45% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9677 +[titan] 2025-09-09 22:11:11,430 - root - INFO - lr: 5.3471e-06 gnorm: 0.39 [2 days, 4:35:42<20:33:18] +[titan] 2025-09-09 22:11:43,494 - root - INFO - step: 28765 loss: 2.6364 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.06 mfu: 49.25% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.9110 +[titan] 2025-09-09 22:11:43,494 - root - INFO - lr: 5.3443e-06 gnorm: 0.38 [2 days, 4:36:14<20:32:45] +[titan] 2025-09-09 22:12:15,584 - root - INFO - step: 28770 loss: 2.6527 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.7365 global_avg_top_loss: 1.9163 +[titan] 2025-09-09 22:12:15,584 - root - INFO - lr: 5.3415e-06 gnorm: 0.38 [2 days, 4:36:46<20:32:12] +[titan] 2025-09-09 22:12:47,720 - root - INFO - step: 28775 loss: 3.1524 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.98 mfu: 49.14% global_avg_ntp_loss: 1.0147 global_avg_top_loss: 2.1377 +[titan] 2025-09-09 22:12:47,720 - root - INFO - lr: 5.3387e-06 gnorm: 0.39 [2 days, 4:37:18<20:31:39] +[titan] 2025-09-09 22:13:19,821 - root - INFO - step: 28780 loss: 2.6481 memory: 122.04GiB(87.57%) tps: 10,208 tflops: 486.50 mfu: 49.19% global_avg_ntp_loss: 0.7351 global_avg_top_loss: 1.9131 +[titan] 
2025-09-09 22:13:19,822 - root - INFO - lr: 5.3360e-06 gnorm: 0.39 [2 days, 4:37:50<20:31:05] +[titan] 2025-09-09 22:13:52,050 - root - INFO - step: 28785 loss: 2.6342 memory: 122.04GiB(87.57%) tps: 10,168 tflops: 484.59 mfu: 49.00% global_avg_ntp_loss: 0.7244 global_avg_top_loss: 1.9098 +[titan] 2025-09-09 22:13:52,050 - root - INFO - lr: 5.3332e-06 gnorm: 0.40 [2 days, 4:38:23<20:30:32] +[titan] 2025-09-09 22:14:24,061 - root - INFO - step: 28790 loss: 2.6920 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.87 mfu: 49.33% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9376 +[titan] 2025-09-09 22:14:24,062 - root - INFO - lr: 5.3304e-06 gnorm: 0.40 [2 days, 4:38:55<20:29:59] +[titan] 2025-09-09 22:14:56,308 - root - INFO - step: 28795 loss: 3.1088 memory: 122.04GiB(87.57%) tps: 10,162 tflops: 484.31 mfu: 48.97% global_avg_ntp_loss: 0.9950 global_avg_top_loss: 2.1138 +[titan] 2025-09-09 22:14:56,308 - root - INFO - lr: 5.3276e-06 gnorm: 0.39 [2 days, 4:39:27<20:29:26] +[titan] 2025-09-09 22:15:21,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:15:28,411 - root - INFO - step: 28800 loss: 2.6261 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.47 mfu: 49.19% global_avg_ntp_loss: 0.7230 global_avg_top_loss: 1.9030 +[titan] 2025-09-09 22:15:28,412 - root - INFO - lr: 5.3249e-06 gnorm: 0.38 [2 days, 4:39:59<20:28:53] +[titan] 2025-09-09 22:16:00,462 - root - INFO - step: 28805 loss: 2.5911 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.27 mfu: 49.27% global_avg_ntp_loss: 0.7104 global_avg_top_loss: 1.8807 +[titan] 2025-09-09 22:16:00,463 - root - INFO - lr: 5.3221e-06 gnorm: 0.37 [2 days, 4:40:31<20:28:19] +[titan] 2025-09-09 22:16:32,557 - root - INFO - step: 28810 loss: 3.1791 memory: 122.04GiB(87.57%) tps: 10,210 tflops: 486.60 mfu: 49.20% global_avg_ntp_loss: 1.0293 global_avg_top_loss: 2.1497 +[titan] 2025-09-09 22:16:32,558 - root - INFO - lr: 5.3193e-06 gnorm: 0.39 [2 days, 4:41:03<20:27:46] +[titan] 2025-09-09 22:17:04,649 - root - INFO - step: 28815 loss: 2.6710 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7457 global_avg_top_loss: 1.9253 +[titan] 2025-09-09 22:17:04,650 - root - INFO - lr: 5.3166e-06 gnorm: 0.44 [2 days, 4:41:35<20:27:13] +[titan] 2025-09-09 22:17:36,857 - root - INFO - step: 28820 loss: 2.7233 memory: 122.04GiB(87.57%) tps: 10,174 tflops: 484.89 mfu: 49.03% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9575 +[titan] 2025-09-09 22:17:36,858 - root - INFO - lr: 5.3138e-06 gnorm: 0.38 [2 days, 4:42:07<20:26:40] +[titan] 2025-09-09 22:18:08,733 - root - INFO - step: 28825 loss: 2.8850 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.8689 global_avg_top_loss: 2.0160 +[titan] 2025-09-09 22:18:08,733 - root - INFO - lr: 5.3110e-06 gnorm: 0.38 [2 days, 4:42:39<20:26:06] +[titan] 2025-09-09 22:18:40,756 - root - INFO - step: 28830 loss: 2.6594 memory: 122.04GiB(87.57%) tps: 10,233 tflops: 487.69 mfu: 49.31% global_avg_ntp_loss: 0.7398 global_avg_top_loss: 1.9196 +[titan] 
2025-09-09 22:18:40,756 - root - INFO - lr: 5.3083e-06 gnorm: 0.39 [2 days, 4:43:11<20:25:33] +[titan] 2025-09-09 22:19:12,753 - root - INFO - step: 28835 loss: 2.6925 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7521 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 22:19:12,754 - root - INFO - lr: 5.3055e-06 gnorm: 0.43 [2 days, 4:43:43<20:25:00] +[titan] 2025-09-09 22:19:44,721 - root - INFO - step: 28840 loss: 2.6651 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.7416 global_avg_top_loss: 1.9235 +[titan] 2025-09-09 22:19:44,721 - root - INFO - lr: 5.3027e-06 gnorm: 0.39 [2 days, 4:44:15<20:24:27] +[titan] 2025-09-09 22:20:16,646 - root - INFO - step: 28845 loss: 2.7803 memory: 122.04GiB(87.57%) tps: 10,264 tflops: 489.19 mfu: 49.46% global_avg_ntp_loss: 0.7930 global_avg_top_loss: 1.9873 +[titan] 2025-09-09 22:20:16,646 - root - INFO - lr: 5.3000e-06 gnorm: 0.40 [2 days, 4:44:47<20:23:53] +[titan] 2025-09-09 22:20:42,120 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:20:48,597 - root - INFO - step: 28850 loss: 2.8029 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.79 mfu: 49.42% global_avg_ntp_loss: 0.8124 global_avg_top_loss: 1.9906 +[titan] 2025-09-09 22:20:48,598 - root - INFO - lr: 5.2972e-06 gnorm: 0.41 [2 days, 4:45:19<20:23:20] +[titan] 2025-09-09 22:21:20,777 - root - INFO - step: 28855 loss: 2.6903 memory: 122.04GiB(87.57%) tps: 10,183 tflops: 485.33 mfu: 49.07% global_avg_ntp_loss: 0.7583 global_avg_top_loss: 1.9320 +[titan] 2025-09-09 22:21:20,777 - root - INFO - lr: 5.2944e-06 gnorm: 0.41 [2 days, 4:45:51<20:22:47] +[titan] 2025-09-09 22:21:52,887 - root - INFO - step: 28860 loss: 2.7436 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.7766 global_avg_top_loss: 1.9670 +[titan] 2025-09-09 22:21:52,888 - root - INFO - lr: 5.2917e-06 gnorm: 0.38 [2 days, 4:46:23<20:22:14] +[titan] 2025-09-09 22:22:24,858 - root - INFO - step: 28865 loss: 2.7135 memory: 122.04GiB(87.57%) tps: 10,250 tflops: 488.49 mfu: 49.39% global_avg_ntp_loss: 0.7639 global_avg_top_loss: 1.9496 +[titan] 2025-09-09 22:22:24,858 - root - INFO - lr: 5.2889e-06 gnorm: 0.41 [2 days, 4:46:55<20:21:40] +[titan] 2025-09-09 22:22:56,995 - root - INFO - step: 28870 loss: 2.7356 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.7734 global_avg_top_loss: 1.9622 +[titan] 2025-09-09 22:22:56,995 - root - INFO - lr: 5.2862e-06 gnorm: 0.38 [2 days, 4:47:27<20:21:07] +[titan] 2025-09-09 22:23:28,997 - root - INFO - step: 28875 loss: 3.6074 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 1.2744 global_avg_top_loss: 2.3330 +[titan] 2025-09-09 22:23:28,997 - root - INFO - lr: 5.2834e-06 gnorm: 0.39 [2 days, 4:47:59<20:20:34] +[titan] 2025-09-09 22:24:01,395 - root - INFO - step: 28880 loss: 2.6378 memory: 122.04GiB(87.57%) tps: 10,115 tflops: 482.05 mfu: 48.74% global_avg_ntp_loss: 0.7282 global_avg_top_loss: 1.9096 +[titan] 
2025-09-09 22:24:01,395 - root - INFO - lr: 5.2807e-06 gnorm: 0.38 [2 days, 4:48:32<20:20:01] +[titan] 2025-09-09 22:24:33,525 - root - INFO - step: 28885 loss: 2.6623 memory: 122.04GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7431 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 22:24:33,525 - root - INFO - lr: 5.2779e-06 gnorm: 0.38 [2 days, 4:49:04<20:19:27] +[titan] 2025-09-09 22:25:05,463 - root - INFO - step: 28890 loss: 3.1742 memory: 122.04GiB(87.57%) tps: 10,260 tflops: 489.00 mfu: 49.44% global_avg_ntp_loss: 1.0287 global_avg_top_loss: 2.1455 +[titan] 2025-09-09 22:25:05,463 - root - INFO - lr: 5.2751e-06 gnorm: 0.38 [2 days, 4:49:36<20:18:54] +[titan] 2025-09-09 22:25:37,496 - root - INFO - step: 28895 loss: 2.6747 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.53 mfu: 49.30% global_avg_ntp_loss: 0.7448 global_avg_top_loss: 1.9299 +[titan] 2025-09-09 22:25:37,497 - root - INFO - lr: 5.2724e-06 gnorm: 0.40 [2 days, 4:50:08<20:18:21] +[titan] 2025-09-09 22:26:02,908 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:26:09,316 - root - INFO - step: 28900 loss: 2.6546 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.82 mfu: 49.63% global_avg_ntp_loss: 0.7368 global_avg_top_loss: 1.9178 +[titan] 2025-09-09 22:26:09,316 - root - INFO - lr: 5.2696e-06 gnorm: 0.42 [2 days, 4:50:40<20:17:48] +[titan] 2025-09-09 22:26:41,177 - root - INFO - step: 28905 loss: 2.6410 memory: 122.04GiB(87.57%) tps: 10,285 tflops: 490.17 mfu: 49.56% global_avg_ntp_loss: 0.7279 global_avg_top_loss: 1.9131 +[titan] 2025-09-09 22:26:41,177 - root - INFO - lr: 5.2669e-06 gnorm: 0.38 [2 days, 4:51:12<20:17:14] +[titan] 2025-09-09 22:27:13,290 - root - INFO - step: 28910 loss: 2.6782 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9265 +[titan] 2025-09-09 22:27:13,290 - root - INFO - lr: 5.2641e-06 gnorm: 0.38 [2 days, 4:51:44<20:16:41] +[titan] 2025-09-09 22:27:45,302 - root - INFO - step: 28915 loss: 2.6866 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.85 mfu: 49.33% global_avg_ntp_loss: 0.7632 global_avg_top_loss: 1.9234 +[titan] 2025-09-09 22:27:45,303 - root - INFO - lr: 5.2614e-06 gnorm: 0.37 [2 days, 4:52:16<20:16:08] +[titan] 2025-09-09 22:28:17,219 - root - INFO - step: 28920 loss: 2.7901 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.33 mfu: 49.48% global_avg_ntp_loss: 0.7978 global_avg_top_loss: 1.9922 +[titan] 2025-09-09 22:28:17,219 - root - INFO - lr: 5.2586e-06 gnorm: 0.39 [2 days, 4:52:48<20:15:34] +[titan] 2025-09-09 22:28:49,229 - root - INFO - step: 28925 loss: 2.6673 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.89 mfu: 49.33% global_avg_ntp_loss: 0.7464 global_avg_top_loss: 1.9209 +[titan] 2025-09-09 22:28:49,229 - root - INFO - lr: 5.2559e-06 gnorm: 0.38 [2 days, 4:53:20<20:15:01] +[titan] 2025-09-09 22:29:21,338 - root - INFO - step: 28930 loss: 2.7162 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.38 mfu: 49.18% global_avg_ntp_loss: 0.7631 global_avg_top_loss: 1.9531 +[titan] 
2025-09-09 22:29:21,339 - root - INFO - lr: 5.2531e-06 gnorm: 0.41 [2 days, 4:53:52<20:14:28] +[titan] 2025-09-09 22:29:53,381 - root - INFO - step: 28935 loss: 2.6847 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 0.7489 global_avg_top_loss: 1.9358 +[titan] 2025-09-09 22:29:53,381 - root - INFO - lr: 5.2504e-06 gnorm: 0.40 [2 days, 4:54:24<20:13:55] +[titan] 2025-09-09 22:30:25,603 - root - INFO - step: 28940 loss: 2.7387 memory: 122.04GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.7740 global_avg_top_loss: 1.9647 +[titan] 2025-09-09 22:30:25,603 - root - INFO - lr: 5.2476e-06 gnorm: 0.39 [2 days, 4:54:56<20:13:22] +[titan] 2025-09-09 22:30:57,518 - root - INFO - step: 28945 loss: 2.7068 memory: 122.04GiB(87.57%) tps: 10,268 tflops: 489.35 mfu: 49.48% global_avg_ntp_loss: 0.7588 global_avg_top_loss: 1.9480 +[titan] 2025-09-09 22:30:57,518 - root - INFO - lr: 5.2449e-06 gnorm: 0.41 [2 days, 4:55:28<20:12:48] +[titan] 2025-09-09 22:31:23,140 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:31:29,555 - root - INFO - step: 28950 loss: 2.7610 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.7847 global_avg_top_loss: 1.9763 +[titan] 2025-09-09 22:31:29,556 - root - INFO - lr: 5.2422e-06 gnorm: 0.39 [2 days, 4:56:00<20:12:15] +[titan] 2025-09-09 22:32:01,535 - root - INFO - step: 28955 loss: 3.6030 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 1.2743 global_avg_top_loss: 2.3287 +[titan] 2025-09-09 22:32:01,536 - root - INFO - lr: 5.2394e-06 gnorm: 0.42 [2 days, 4:56:32<20:11:42] +[titan] 2025-09-09 22:32:33,518 - root - INFO - step: 28960 loss: 2.6624 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.31 mfu: 49.37% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 22:32:33,518 - root - INFO - lr: 5.2367e-06 gnorm: 0.38 [2 days, 4:57:04<20:11:08] +[titan] 2025-09-09 22:33:05,436 - root - INFO - step: 28965 loss: 2.6714 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.30 mfu: 49.47% global_avg_ntp_loss: 0.7453 global_avg_top_loss: 1.9261 +[titan] 2025-09-09 22:33:05,436 - root - INFO - lr: 5.2339e-06 gnorm: 0.40 [2 days, 4:57:36<20:10:35] +[titan] 2025-09-09 22:33:37,573 - root - INFO - step: 28970 loss: 2.6994 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.96 mfu: 49.14% global_avg_ntp_loss: 0.7558 global_avg_top_loss: 1.9435 +[titan] 2025-09-09 22:33:37,573 - root - INFO - lr: 5.2312e-06 gnorm: 0.39 [2 days, 4:58:08<20:10:02] +[titan] 2025-09-09 22:34:09,684 - root - INFO - step: 28975 loss: 2.5462 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.6863 global_avg_top_loss: 1.8599 +[titan] 2025-09-09 22:34:09,685 - root - INFO - lr: 5.2284e-06 gnorm: 0.39 [2 days, 4:58:40<20:09:29] +[titan] 2025-09-09 22:34:41,910 - root - INFO - step: 28980 loss: 2.6703 memory: 122.04GiB(87.57%) tps: 10,169 tflops: 484.63 mfu: 49.00% global_avg_ntp_loss: 0.7429 global_avg_top_loss: 1.9274 +[titan] 
2025-09-09 22:34:41,910 - root - INFO - lr: 5.2257e-06 gnorm: 0.39 [2 days, 4:59:12<20:08:56] +[titan] 2025-09-09 22:35:13,710 - root - INFO - step: 28985 loss: 2.6346 memory: 122.04GiB(87.57%) tps: 10,305 tflops: 491.11 mfu: 49.66% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9052 +[titan] 2025-09-09 22:35:13,710 - root - INFO - lr: 5.2230e-06 gnorm: 0.37 [2 days, 4:59:44<20:08:22] +[titan] 2025-09-09 22:35:45,684 - root - INFO - step: 28990 loss: 2.7377 memory: 122.04GiB(87.57%) tps: 10,249 tflops: 488.44 mfu: 49.39% global_avg_ntp_loss: 0.7751 global_avg_top_loss: 1.9626 +[titan] 2025-09-09 22:35:45,685 - root - INFO - lr: 5.2202e-06 gnorm: 0.38 [2 days, 5:00:16<20:07:49] +[titan] 2025-09-09 22:36:17,704 - root - INFO - step: 28995 loss: 2.6655 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.75 mfu: 49.32% global_avg_ntp_loss: 0.7393 global_avg_top_loss: 1.9262 +[titan] 2025-09-09 22:36:17,704 - root - INFO - lr: 5.2175e-06 gnorm: 0.39 [2 days, 5:00:48<20:07:16] +[titan] 2025-09-09 22:36:43,429 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:36:49,790 - root - INFO - step: 29000 loss: 2.6706 memory: 122.04GiB(87.57%) tps: 10,213 tflops: 486.74 mfu: 49.22% global_avg_ntp_loss: 0.7425 global_avg_top_loss: 1.9281 +[titan] 2025-09-09 22:36:49,790 - root - INFO - lr: 5.2148e-06 gnorm: 0.38 [2 days, 5:01:20<20:06:43] +[titan] 2025-09-09 22:37:21,733 - root - INFO - step: 29005 loss: 2.6569 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.91 mfu: 49.44% global_avg_ntp_loss: 0.7413 global_avg_top_loss: 1.9156 +[titan] 2025-09-09 22:37:21,734 - root - INFO - lr: 5.2120e-06 gnorm: 0.38 [2 days, 5:01:52<20:06:09] +[titan] 2025-09-09 22:37:53,908 - root - INFO - step: 29010 loss: 2.7091 memory: 122.04GiB(87.57%) tps: 10,185 tflops: 485.39 mfu: 49.08% global_avg_ntp_loss: 0.7599 global_avg_top_loss: 1.9493 +[titan] 2025-09-09 22:37:53,909 - root - INFO - lr: 5.2093e-06 gnorm: 0.39 [2 days, 5:02:24<20:05:36] +[titan] 2025-09-09 22:38:25,933 - root - INFO - step: 29015 loss: 2.6873 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.67 mfu: 49.31% global_avg_ntp_loss: 0.7524 global_avg_top_loss: 1.9349 +[titan] 2025-09-09 22:38:25,934 - root - INFO - lr: 5.2066e-06 gnorm: 0.39 [2 days, 5:02:56<20:05:03] +[titan] 2025-09-09 22:38:58,108 - root - INFO - step: 29020 loss: 2.6560 memory: 122.04GiB(87.57%) tps: 10,185 tflops: 485.39 mfu: 49.08% global_avg_ntp_loss: 0.7429 global_avg_top_loss: 1.9131 +[titan] 2025-09-09 22:38:58,109 - root - INFO - lr: 5.2038e-06 gnorm: 0.40 [2 days, 5:03:29<20:04:30] +[titan] 2025-09-09 22:39:30,483 - root - INFO - step: 29025 loss: 2.6399 memory: 122.04GiB(87.57%) tps: 10,122 tflops: 482.40 mfu: 48.78% global_avg_ntp_loss: 0.7348 global_avg_top_loss: 1.9052 +[titan] 2025-09-09 22:39:30,483 - root - INFO - lr: 5.2011e-06 gnorm: 0.47 [2 days, 5:04:01<20:03:57] +[titan] 2025-09-09 22:40:02,312 - root - INFO - step: 29030 loss: 2.7287 memory: 122.04GiB(87.57%) tps: 10,295 tflops: 490.67 mfu: 49.61% global_avg_ntp_loss: 0.7708 global_avg_top_loss: 1.9578 +[titan] 
2025-09-09 22:40:02,312 - root - INFO - lr: 5.1984e-06 gnorm: 0.39 [2 days, 5:04:33<20:03:23] +[titan] 2025-09-09 22:40:34,212 - root - INFO - step: 29035 loss: 3.6279 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.57 mfu: 49.50% global_avg_ntp_loss: 1.2845 global_avg_top_loss: 2.3435 +[titan] 2025-09-09 22:40:34,212 - root - INFO - lr: 5.1956e-06 gnorm: 0.38 [2 days, 5:05:05<20:02:50] +[titan] 2025-09-09 22:41:06,176 - root - INFO - step: 29040 loss: 2.6558 memory: 122.04GiB(87.57%) tps: 10,252 tflops: 488.59 mfu: 49.40% global_avg_ntp_loss: 0.7384 global_avg_top_loss: 1.9174 +[titan] 2025-09-09 22:41:06,177 - root - INFO - lr: 5.1929e-06 gnorm: 0.40 [2 days, 5:05:37<20:02:17] +[titan] 2025-09-09 22:41:38,478 - root - INFO - step: 29045 loss: 2.6775 memory: 122.04GiB(87.57%) tps: 10,145 tflops: 483.49 mfu: 48.89% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 22:41:38,478 - root - INFO - lr: 5.1902e-06 gnorm: 0.38 [2 days, 5:06:09<20:01:44] +[titan] 2025-09-09 22:42:03,931 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:42:10,293 - root - INFO - step: 29050 loss: 2.6216 memory: 122.04GiB(87.57%) tps: 10,300 tflops: 490.88 mfu: 49.63% global_avg_ntp_loss: 0.7192 global_avg_top_loss: 1.9024 +[titan] 2025-09-09 22:42:10,293 - root - INFO - lr: 5.1875e-06 gnorm: 0.39 [2 days, 5:06:41<20:01:10] +[titan] 2025-09-09 22:42:42,442 - root - INFO - step: 29055 loss: 2.6030 memory: 122.04GiB(87.57%) tps: 10,193 tflops: 485.78 mfu: 49.12% global_avg_ntp_loss: 0.7124 global_avg_top_loss: 1.8906 +[titan] 2025-09-09 22:42:42,443 - root - INFO - lr: 5.1847e-06 gnorm: 0.40 [2 days, 5:07:13<20:00:37] +[titan] 2025-09-09 22:43:14,432 - root - INFO - step: 29060 loss: 2.6507 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.20 mfu: 49.36% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9152 +[titan] 2025-09-09 22:43:14,432 - root - INFO - lr: 5.1820e-06 gnorm: 0.39 [2 days, 5:07:45<20:00:04] +[titan] 2025-09-09 22:43:46,262 - root - INFO - step: 29065 loss: 2.6855 memory: 122.04GiB(87.57%) tps: 10,295 tflops: 490.65 mfu: 49.61% global_avg_ntp_loss: 0.7500 global_avg_top_loss: 1.9355 +[titan] 2025-09-09 22:43:46,262 - root - INFO - lr: 5.1793e-06 gnorm: 0.39 [2 days, 5:08:17<19:59:30] +[titan] 2025-09-09 22:44:18,373 - root - INFO - step: 29070 loss: 2.9020 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.36 mfu: 49.18% global_avg_ntp_loss: 0.8524 global_avg_top_loss: 2.0496 +[titan] 2025-09-09 22:44:18,374 - root - INFO - lr: 5.1766e-06 gnorm: 0.42 [2 days, 5:08:49<19:58:57] +[titan] 2025-09-09 22:44:50,386 - root - INFO - step: 29075 loss: 2.7733 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.86 mfu: 49.33% global_avg_ntp_loss: 0.7906 global_avg_top_loss: 1.9827 +[titan] 2025-09-09 22:44:50,386 - root - INFO - lr: 5.1738e-06 gnorm: 0.40 [2 days, 5:09:21<19:58:24] +[titan] 2025-09-09 22:45:22,384 - root - INFO - step: 29080 loss: 2.7112 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.08 mfu: 49.35% global_avg_ntp_loss: 0.7622 global_avg_top_loss: 1.9490 +[titan] 
2025-09-09 22:45:22,384 - root - INFO - lr: 5.1711e-06 gnorm: 0.40 [2 days, 5:09:53<19:57:51] +[titan] 2025-09-09 22:45:54,391 - root - INFO - step: 29085 loss: 2.6443 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7329 global_avg_top_loss: 1.9114 +[titan] 2025-09-09 22:45:54,392 - root - INFO - lr: 5.1684e-06 gnorm: 0.37 [2 days, 5:10:25<19:57:17] +[titan] 2025-09-09 22:46:26,547 - root - INFO - step: 29090 loss: 2.6211 memory: 122.04GiB(87.57%) tps: 10,191 tflops: 485.68 mfu: 49.11% global_avg_ntp_loss: 0.7190 global_avg_top_loss: 1.9021 +[titan] 2025-09-09 22:46:26,547 - root - INFO - lr: 5.1657e-06 gnorm: 0.41 [2 days, 5:10:57<19:56:44] +[titan] 2025-09-09 22:46:58,590 - root - INFO - step: 29095 loss: 3.2135 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.39 mfu: 49.28% global_avg_ntp_loss: 1.0503 global_avg_top_loss: 2.1632 +[titan] 2025-09-09 22:46:58,590 - root - INFO - lr: 5.1630e-06 gnorm: 0.38 [2 days, 5:11:29<19:56:11] +[titan] 2025-09-09 22:47:24,379 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:47:30,777 - root - INFO - step: 29100 loss: 2.5813 memory: 122.04GiB(87.57%) tps: 10,181 tflops: 485.20 mfu: 49.06% global_avg_ntp_loss: 0.7079 global_avg_top_loss: 1.8733 +[titan] 2025-09-09 22:47:30,778 - root - INFO - lr: 5.1602e-06 gnorm: 0.36 [2 days, 5:12:01<19:55:38] +[titan] 2025-09-09 22:48:02,654 - root - INFO - step: 29105 loss: 2.6240 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.94 mfu: 49.54% global_avg_ntp_loss: 0.7261 global_avg_top_loss: 1.8979 +[titan] 2025-09-09 22:48:02,654 - root - INFO - lr: 5.1575e-06 gnorm: 0.43 [2 days, 5:12:33<19:55:05] +[titan] 2025-09-09 22:48:34,748 - root - INFO - step: 29110 loss: 2.6238 memory: 122.04GiB(87.57%) tps: 10,210 tflops: 486.61 mfu: 49.20% global_avg_ntp_loss: 0.7265 global_avg_top_loss: 1.8973 +[titan] 2025-09-09 22:48:34,749 - root - INFO - lr: 5.1548e-06 gnorm: 0.39 [2 days, 5:13:05<19:54:31] +[titan] 2025-09-09 22:49:06,774 - root - INFO - step: 29115 loss: 3.0516 memory: 122.04GiB(87.57%) tps: 10,232 tflops: 487.65 mfu: 49.31% global_avg_ntp_loss: 0.9731 global_avg_top_loss: 2.0786 +[titan] 2025-09-09 22:49:06,775 - root - INFO - lr: 5.1521e-06 gnorm: 0.37 [2 days, 5:13:37<19:53:58] +[titan] 2025-09-09 22:49:38,841 - root - INFO - step: 29120 loss: 2.7758 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.03 mfu: 49.24% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9804 +[titan] 2025-09-09 22:49:38,841 - root - INFO - lr: 5.1494e-06 gnorm: 0.41 [2 days, 5:14:09<19:53:25] +[titan] 2025-09-09 22:50:10,762 - root - INFO - step: 29125 loss: 2.6811 memory: 122.04GiB(87.57%) tps: 10,266 tflops: 489.25 mfu: 49.47% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9276 +[titan] 2025-09-09 22:50:10,763 - root - INFO - lr: 5.1467e-06 gnorm: 0.41 [2 days, 5:14:41<19:52:52] +[titan] 2025-09-09 22:50:42,971 - root - INFO - step: 29130 loss: 2.6952 memory: 122.04GiB(87.57%) tps: 10,174 tflops: 484.88 mfu: 49.03% global_avg_ntp_loss: 0.7592 global_avg_top_loss: 1.9360 +[titan] 
2025-09-09 22:50:42,972 - root - INFO - lr: 5.1440e-06 gnorm: 0.38 [2 days, 5:15:13<19:52:18] +[titan] 2025-09-09 22:51:15,063 - root - INFO - step: 29135 loss: 2.7569 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9688 +[titan] 2025-09-09 22:51:15,063 - root - INFO - lr: 5.1413e-06 gnorm: 0.39 [2 days, 5:15:45<19:51:45] +[titan] 2025-09-09 22:51:47,021 - root - INFO - step: 29140 loss: 2.7331 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.68 mfu: 49.41% global_avg_ntp_loss: 0.7839 global_avg_top_loss: 1.9492 +[titan] 2025-09-09 22:51:47,022 - root - INFO - lr: 5.1385e-06 gnorm: 0.39 [2 days, 5:16:17<19:51:12] +[titan] 2025-09-09 22:52:19,111 - root - INFO - step: 29145 loss: 2.7940 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.68 mfu: 49.21% global_avg_ntp_loss: 0.8079 global_avg_top_loss: 1.9861 +[titan] 2025-09-09 22:52:19,112 - root - INFO - lr: 5.1358e-06 gnorm: 0.41 [2 days, 5:16:50<19:50:39] +[titan] 2025-09-09 22:52:44,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:52:51,045 - root - INFO - step: 29150 loss: 2.9851 memory: 122.04GiB(87.57%) tps: 10,262 tflops: 489.06 mfu: 49.45% global_avg_ntp_loss: 0.8898 global_avg_top_loss: 2.0953 +[titan] 2025-09-09 22:52:51,045 - root - INFO - lr: 5.1331e-06 gnorm: 0.40 [2 days, 5:17:21<19:50:06] +[titan] 2025-09-09 22:53:22,988 - root - INFO - step: 29155 loss: 2.6518 memory: 122.04GiB(87.57%) tps: 10,258 tflops: 488.91 mfu: 49.43% global_avg_ntp_loss: 0.7328 global_avg_top_loss: 1.9191 +[titan] 2025-09-09 22:53:22,989 - root - INFO - lr: 5.1304e-06 gnorm: 0.40 [2 days, 5:17:53<19:49:32] +[titan] 2025-09-09 22:53:54,930 - root - INFO - step: 29160 loss: 2.8450 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.8424 global_avg_top_loss: 2.0026 +[titan] 2025-09-09 22:53:54,931 - root - INFO - lr: 5.1277e-06 gnorm: 0.40 [2 days, 5:18:25<19:48:59] +[titan] 2025-09-09 22:54:26,964 - root - INFO - step: 29165 loss: 2.7427 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.53 mfu: 49.29% global_avg_ntp_loss: 0.7772 global_avg_top_loss: 1.9655 +[titan] 2025-09-09 22:54:26,965 - root - INFO - lr: 5.1250e-06 gnorm: 0.42 [2 days, 5:18:57<19:48:26] +[titan] 2025-09-09 22:54:59,096 - root - INFO - step: 29170 loss: 2.7462 memory: 122.04GiB(87.57%) tps: 10,199 tflops: 486.06 mfu: 49.15% global_avg_ntp_loss: 0.7783 global_avg_top_loss: 1.9678 +[titan] 2025-09-09 22:54:59,096 - root - INFO - lr: 5.1223e-06 gnorm: 0.38 [2 days, 5:19:30<19:47:53] +[titan] 2025-09-09 22:55:30,853 - root - INFO - step: 29175 loss: 2.7078 memory: 122.04GiB(87.57%) tps: 10,319 tflops: 491.78 mfu: 49.72% global_avg_ntp_loss: 0.7600 global_avg_top_loss: 1.9478 +[titan] 2025-09-09 22:55:30,853 - root - INFO - lr: 5.1196e-06 gnorm: 0.40 [2 days, 5:20:01<19:47:19] +[titan] 2025-09-09 22:56:02,956 - root - INFO - step: 29180 loss: 2.7188 memory: 122.04GiB(87.57%) tps: 10,208 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7658 global_avg_top_loss: 1.9529 +[titan] 
2025-09-09 22:56:02,956 - root - INFO - lr: 5.1169e-06 gnorm: 0.38 [2 days, 5:20:33<19:46:46] +[titan] 2025-09-09 22:56:28,921 - root - INFO - Dumping profiler traces at step 29184 +[titan] 2025-09-09 22:56:28,992 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 22:56:35,311 - root - INFO - step: 29185 loss: 2.6094 memory: 122.04GiB(87.57%) tps: 10,128 tflops: 482.68 mfu: 48.81% global_avg_ntp_loss: 0.7306 global_avg_top_loss: 1.8788 +[titan] 2025-09-09 22:56:35,311 - root - INFO - lr: 5.1142e-06 gnorm: 0.48 [2 days, 5:21:06<19:46:13] +[titan] 2025-09-09 22:57:07,091 - root - INFO - step: 29190 loss: 2.6938 memory: 122.04GiB(87.57%) tps: 10,311 tflops: 491.43 mfu: 49.69% global_avg_ntp_loss: 0.7530 global_avg_top_loss: 1.9408 +[titan] 2025-09-09 22:57:07,091 - root - INFO - lr: 5.1115e-06 gnorm: 0.38 [2 days, 5:21:37<19:45:40] +[titan] 2025-09-09 22:57:38,985 - root - INFO - step: 29195 loss: 3.1577 memory: 122.04GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 1.0210 global_avg_top_loss: 2.1367 +[titan] 2025-09-09 22:57:38,986 - root - INFO - lr: 5.1088e-06 gnorm: 0.38 [2 days, 5:22:09<19:45:06] +[titan] 2025-09-09 22:58:04,660 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 22:58:11,054 - root - INFO - step: 29200 loss: 2.6686 memory: 122.04GiB(87.57%) tps: 10,218 tflops: 486.99 mfu: 49.24% global_avg_ntp_loss: 0.7422 global_avg_top_loss: 1.9263 +[titan] 2025-09-09 22:58:11,055 - root - INFO - lr: 5.1061e-06 gnorm: 0.39 [2 days, 5:22:41<19:44:33] +[titan] 2025-09-09 22:58:42,842 - root - INFO - step: 29205 loss: 2.6944 memory: 122.04GiB(87.57%) tps: 10,309 tflops: 491.30 mfu: 49.68% global_avg_ntp_loss: 0.7533 global_avg_top_loss: 1.9411 +[titan] 2025-09-09 22:58:42,842 - root - INFO - lr: 5.1034e-06 gnorm: 0.39 [2 days, 5:23:13<19:44:00] +[titan] 2025-09-09 22:59:14,821 - root - INFO - step: 29210 loss: 2.6511 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.36 mfu: 49.38% global_avg_ntp_loss: 0.7349 global_avg_top_loss: 1.9162 +[titan] 2025-09-09 22:59:14,821 - root - INFO - lr: 5.1007e-06 gnorm: 0.39 [2 days, 5:23:45<19:43:27] +[titan] 2025-09-09 22:59:46,629 - root - INFO - step: 29215 loss: 2.7013 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 490.99 mfu: 49.64% global_avg_ntp_loss: 0.7585 global_avg_top_loss: 1.9429 +[titan] 2025-09-09 22:59:46,630 - root - INFO - lr: 5.0980e-06 gnorm: 0.39 [2 days, 5:24:17<19:42:53] +[titan] 2025-09-09 23:00:18,596 - root - INFO - step: 29220 loss: 2.6984 memory: 122.04GiB(87.57%) tps: 10,251 tflops: 488.55 mfu: 49.40% global_avg_ntp_loss: 0.7608 global_avg_top_loss: 1.9376 +[titan] 2025-09-09 23:00:18,597 - root - INFO - lr: 5.0953e-06 gnorm: 0.39 [2 days, 5:24:49<19:42:20] +[titan] 2025-09-09 23:00:50,664 - root - INFO - step: 29225 loss: 2.6469 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.02 mfu: 49.24% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9134 +[titan] 2025-09-09 23:00:50,664 - root - INFO - lr: 5.0926e-06 gnorm: 0.38 [2 days, 5:25:21<19:41:47] +[titan] 2025-09-09 23:01:22,703 - root - INFO - step: 29230 loss: 2.6773 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.7482 global_avg_top_loss: 1.9290 +[titan] 
2025-09-09 23:01:22,703 - root - INFO - lr: 5.0899e-06 gnorm: 0.40 [2 days, 5:25:53<19:41:14] +[titan] 2025-09-09 23:01:54,652 - root - INFO - step: 29235 loss: 2.7078 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7596 global_avg_top_loss: 1.9482 +[titan] 2025-09-09 23:01:54,653 - root - INFO - lr: 5.0872e-06 gnorm: 0.41 [2 days, 5:26:25<19:40:40] +[titan] 2025-09-09 23:02:26,593 - root - INFO - step: 29240 loss: 2.6131 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7177 global_avg_top_loss: 1.8954 +[titan] 2025-09-09 23:02:26,594 - root - INFO - lr: 5.0846e-06 gnorm: 0.39 [2 days, 5:26:57<19:40:07] +[titan] 2025-09-09 23:02:58,597 - root - INFO - step: 29245 loss: 2.7600 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7883 global_avg_top_loss: 1.9716 +[titan] 2025-09-09 23:02:58,598 - root - INFO - lr: 5.0819e-06 gnorm: 0.38 [2 days, 5:27:29<19:39:34] +[titan] 2025-09-09 23:03:24,208 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:03:30,592 - root - INFO - step: 29250 loss: 2.6628 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.36% global_avg_ntp_loss: 0.7411 global_avg_top_loss: 1.9217 +[titan] 2025-09-09 23:03:30,592 - root - INFO - lr: 5.0792e-06 gnorm: 0.38 [2 days, 5:28:01<19:39:01] +[titan] 2025-09-09 23:04:02,496 - root - INFO - step: 29255 loss: 2.7456 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.52 mfu: 49.50% global_avg_ntp_loss: 0.7792 global_avg_top_loss: 1.9664 +[titan] 2025-09-09 23:04:02,496 - root - INFO - lr: 5.0765e-06 gnorm: 0.39 [2 days, 5:28:33<19:38:27] +[titan] 2025-09-09 23:04:34,402 - root - INFO - step: 29260 loss: 2.7025 memory: 122.04GiB(87.57%) tps: 10,270 tflops: 489.48 mfu: 49.49% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9468 +[titan] 2025-09-09 23:04:34,403 - root - INFO - lr: 5.0738e-06 gnorm: 0.39 [2 days, 5:29:05<19:37:54] +[titan] 2025-09-09 23:05:06,379 - root - INFO - step: 29265 loss: 2.6506 memory: 122.04GiB(87.57%) tps: 10,248 tflops: 488.41 mfu: 49.38% global_avg_ntp_loss: 0.7370 global_avg_top_loss: 1.9136 +[titan] 2025-09-09 23:05:06,379 - root - INFO - lr: 5.0711e-06 gnorm: 0.38 [2 days, 5:29:37<19:37:21] +[titan] 2025-09-09 23:05:38,292 - root - INFO - step: 29270 loss: 2.7339 memory: 122.04GiB(87.57%) tps: 10,268 tflops: 489.37 mfu: 49.48% global_avg_ntp_loss: 0.7735 global_avg_top_loss: 1.9604 +[titan] 2025-09-09 23:05:38,293 - root - INFO - lr: 5.0684e-06 gnorm: 0.39 [2 days, 5:30:09<19:36:48] +[titan] 2025-09-09 23:06:10,247 - root - INFO - step: 29275 loss: 2.6792 memory: 122.04GiB(87.57%) tps: 10,255 tflops: 488.73 mfu: 49.42% global_avg_ntp_loss: 0.7468 global_avg_top_loss: 1.9324 +[titan] 2025-09-09 23:06:10,248 - root - INFO - lr: 5.0657e-06 gnorm: 0.41 [2 days, 5:30:41<19:36:14] +[titan] 2025-09-09 23:06:42,444 - root - INFO - step: 29280 loss: 2.7697 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.07 mfu: 49.05% global_avg_ntp_loss: 0.7903 global_avg_top_loss: 1.9795 +[titan] 
2025-09-09 23:06:42,444 - root - INFO - lr: 5.0631e-06 gnorm: 0.38 [2 days, 5:31:13<19:35:41] +[titan] 2025-09-09 23:07:14,579 - root - INFO - step: 29285 loss: 2.7344 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 486.00 mfu: 49.14% global_avg_ntp_loss: 0.7738 global_avg_top_loss: 1.9606 +[titan] 2025-09-09 23:07:14,579 - root - INFO - lr: 5.0604e-06 gnorm: 0.41 [2 days, 5:31:45<19:35:08] +[titan] 2025-09-09 23:07:46,617 - root - INFO - step: 29290 loss: 2.6415 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.45 mfu: 49.29% global_avg_ntp_loss: 0.7295 global_avg_top_loss: 1.9121 +[titan] 2025-09-09 23:07:46,618 - root - INFO - lr: 5.0577e-06 gnorm: 0.38 [2 days, 5:32:17<19:34:35] +[titan] 2025-09-09 23:08:18,621 - root - INFO - step: 29295 loss: 2.6646 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.7405 global_avg_top_loss: 1.9241 +[titan] 2025-09-09 23:08:18,621 - root - INFO - lr: 5.0550e-06 gnorm: 0.46 [2 days, 5:32:49<19:34:01] +[titan] 2025-09-09 23:08:44,164 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:08:50,538 - root - INFO - step: 29300 loss: 2.7990 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.47% global_avg_ntp_loss: 0.7976 global_avg_top_loss: 2.0014 +[titan] 2025-09-09 23:08:50,539 - root - INFO - lr: 5.0523e-06 gnorm: 0.39 [2 days, 5:33:21<19:33:28] +[titan] 2025-09-09 23:09:22,323 - root - INFO - step: 29305 loss: 2.6830 memory: 122.04GiB(87.57%) tps: 10,310 tflops: 491.35 mfu: 49.68% global_avg_ntp_loss: 0.7503 global_avg_top_loss: 1.9327 +[titan] 2025-09-09 23:09:22,323 - root - INFO - lr: 5.0497e-06 gnorm: 0.39 [2 days, 5:33:53<19:32:55] +[titan] 2025-09-09 23:09:54,600 - root - INFO - step: 29310 loss: 2.7757 memory: 122.04GiB(87.57%) tps: 10,152 tflops: 483.85 mfu: 48.92% global_avg_ntp_loss: 0.7954 global_avg_top_loss: 1.9802 +[titan] 2025-09-09 23:09:54,600 - root - INFO - lr: 5.0470e-06 gnorm: 0.40 [2 days, 5:34:25<19:32:22] +[titan] 2025-09-09 23:10:26,550 - root - INFO - step: 29315 loss: 2.7023 memory: 122.04GiB(87.57%) tps: 10,256 tflops: 488.80 mfu: 49.42% global_avg_ntp_loss: 0.7604 global_avg_top_loss: 1.9418 +[titan] 2025-09-09 23:10:26,551 - root - INFO - lr: 5.0443e-06 gnorm: 0.38 [2 days, 5:34:57<19:31:49] +[titan] 2025-09-09 23:10:58,510 - root - INFO - step: 29320 loss: 2.5561 memory: 122.04GiB(87.57%) tps: 10,253 tflops: 488.66 mfu: 49.41% global_avg_ntp_loss: 0.6911 global_avg_top_loss: 1.8650 +[titan] 2025-09-09 23:10:58,510 - root - INFO - lr: 5.0416e-06 gnorm: 0.38 [2 days, 5:35:29<19:31:15] +[titan] 2025-09-09 23:11:30,391 - root - INFO - step: 29325 loss: 2.6740 memory: 122.04GiB(87.57%) tps: 10,279 tflops: 489.87 mfu: 49.53% global_avg_ntp_loss: 0.7473 global_avg_top_loss: 1.9267 +[titan] 2025-09-09 23:11:30,391 - root - INFO - lr: 5.0389e-06 gnorm: 0.39 [2 days, 5:36:01<19:30:42] +[titan] 2025-09-09 23:12:02,294 - root - INFO - step: 29330 loss: 2.7124 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7615 global_avg_top_loss: 1.9509 +[titan] 
2025-09-09 23:12:02,294 - root - INFO - lr: 5.0363e-06 gnorm: 0.38 [2 days, 5:36:33<19:30:09] +[titan] 2025-09-09 23:12:34,251 - root - INFO - step: 29335 loss: 2.6808 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.69 mfu: 49.41% global_avg_ntp_loss: 0.7482 global_avg_top_loss: 1.9326 +[titan] 2025-09-09 23:12:34,252 - root - INFO - lr: 5.0336e-06 gnorm: 0.40 [2 days, 5:37:05<19:29:35] +[titan] 2025-09-09 23:13:06,286 - root - INFO - step: 29340 loss: 2.6380 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7297 global_avg_top_loss: 1.9083 +[titan] 2025-09-09 23:13:06,286 - root - INFO - lr: 5.0309e-06 gnorm: 0.38 [2 days, 5:37:37<19:29:02] +[titan] 2025-09-09 23:13:38,327 - root - INFO - step: 29345 loss: 2.7049 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.42 mfu: 49.28% global_avg_ntp_loss: 0.7606 global_avg_top_loss: 1.9443 +[titan] 2025-09-09 23:13:38,327 - root - INFO - lr: 5.0283e-06 gnorm: 0.38 [2 days, 5:38:09<19:28:29] +[titan] 2025-09-09 23:14:03,863 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:14:10,283 - root - INFO - step: 29350 loss: 2.6685 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7434 global_avg_top_loss: 1.9251 +[titan] 2025-09-09 23:14:10,284 - root - INFO - lr: 5.0256e-06 gnorm: 0.39 [2 days, 5:38:41<19:27:56] +[titan] 2025-09-09 23:14:42,167 - root - INFO - step: 29355 loss: 2.7210 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7705 global_avg_top_loss: 1.9505 +[titan] 2025-09-09 23:14:42,167 - root - INFO - lr: 5.0229e-06 gnorm: 0.39 [2 days, 5:39:13<19:27:23] +[titan] 2025-09-09 23:15:14,043 - root - INFO - step: 29360 loss: 2.6904 memory: 122.04GiB(87.57%) tps: 10,280 tflops: 489.95 mfu: 49.54% global_avg_ntp_loss: 0.7531 global_avg_top_loss: 1.9373 +[titan] 2025-09-09 23:15:14,043 - root - INFO - lr: 5.0203e-06 gnorm: 0.38 [2 days, 5:39:44<19:26:49] +[titan] 2025-09-09 23:15:46,334 - root - INFO - step: 29365 loss: 2.6537 memory: 122.04GiB(87.57%) tps: 10,148 tflops: 483.64 mfu: 48.90% global_avg_ntp_loss: 0.7375 global_avg_top_loss: 1.9161 +[titan] 2025-09-09 23:15:46,334 - root - INFO - lr: 5.0176e-06 gnorm: 0.41 [2 days, 5:40:17<19:26:16] +[titan] 2025-09-09 23:16:18,020 - root - INFO - step: 29370 loss: 2.7244 memory: 122.04GiB(87.57%) tps: 10,342 tflops: 492.89 mfu: 49.84% global_avg_ntp_loss: 0.7680 global_avg_top_loss: 1.9564 +[titan] 2025-09-09 23:16:18,020 - root - INFO - lr: 5.0149e-06 gnorm: 0.40 [2 days, 5:40:48<19:25:43] +[titan] 2025-09-09 23:16:50,166 - root - INFO - step: 29375 loss: 2.7198 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.82 mfu: 49.12% global_avg_ntp_loss: 0.7676 global_avg_top_loss: 1.9521 +[titan] 2025-09-09 23:16:50,167 - root - INFO - lr: 5.0123e-06 gnorm: 0.38 [2 days, 5:41:21<19:25:10] +[titan] 2025-09-09 23:17:22,259 - root - INFO - step: 29380 loss: 2.5785 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7027 global_avg_top_loss: 1.8758 +[titan] 
2025-09-09 23:17:22,259 - root - INFO - lr: 5.0096e-06 gnorm: 0.39 [2 days, 5:41:53<19:24:36] +[titan] 2025-09-09 23:17:54,068 - root - INFO - step: 29385 loss: 2.7280 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 490.96 mfu: 49.64% global_avg_ntp_loss: 0.7727 global_avg_top_loss: 1.9553 +[titan] 2025-09-09 23:17:54,069 - root - INFO - lr: 5.0069e-06 gnorm: 0.39 [2 days, 5:42:24<19:24:03] +[titan] 2025-09-09 23:18:25,942 - root - INFO - step: 29390 loss: 2.6867 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7522 global_avg_top_loss: 1.9344 +[titan] 2025-09-09 23:18:25,943 - root - INFO - lr: 5.0043e-06 gnorm: 0.39 [2 days, 5:42:56<19:23:30] +[titan] 2025-09-09 23:18:58,087 - root - INFO - step: 29395 loss: 2.5905 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.85 mfu: 49.13% global_avg_ntp_loss: 0.7117 global_avg_top_loss: 1.8787 +[titan] 2025-09-09 23:18:58,087 - root - INFO - lr: 5.0016e-06 gnorm: 0.38 [2 days, 5:43:28<19:22:57] +[titan] 2025-09-09 23:19:23,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:19:30,067 - root - INFO - step: 29400 loss: 2.6546 memory: 122.04GiB(87.57%) tps: 10,247 tflops: 488.35 mfu: 49.38% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9192 +[titan] 2025-09-09 23:19:30,067 - root - INFO - lr: 4.9989e-06 gnorm: 0.39 [2 days, 5:44:00<19:22:24] +[titan] 2025-09-09 23:20:02,218 - root - INFO - step: 29405 loss: 2.7080 memory: 122.04GiB(87.57%) tps: 10,192 tflops: 485.75 mfu: 49.12% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9458 +[titan] 2025-09-09 23:20:02,218 - root - INFO - lr: 4.9963e-06 gnorm: 0.41 [2 days, 5:44:33<19:21:50] +[titan] 2025-09-09 23:20:34,210 - root - INFO - step: 29410 loss: 2.7444 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.16 mfu: 49.36% global_avg_ntp_loss: 0.7791 global_avg_top_loss: 1.9653 +[titan] 2025-09-09 23:20:34,210 - root - INFO - lr: 4.9936e-06 gnorm: 0.41 [2 days, 5:45:05<19:21:17] +[titan] 2025-09-09 23:21:06,547 - root - INFO - step: 29415 loss: 2.7118 memory: 122.04GiB(87.57%) tps: 10,134 tflops: 482.96 mfu: 48.83% global_avg_ntp_loss: 0.7672 global_avg_top_loss: 1.9445 +[titan] 2025-09-09 23:21:06,547 - root - INFO - lr: 4.9910e-06 gnorm: 0.40 [2 days, 5:45:37<19:20:44] +[titan] 2025-09-09 23:21:38,636 - root - INFO - step: 29420 loss: 2.7307 memory: 122.04GiB(87.57%) tps: 10,212 tflops: 486.69 mfu: 49.21% global_avg_ntp_loss: 0.7686 global_avg_top_loss: 1.9621 +[titan] 2025-09-09 23:21:38,636 - root - INFO - lr: 4.9883e-06 gnorm: 0.40 [2 days, 5:46:09<19:20:11] +[titan] 2025-09-09 23:22:10,683 - root - INFO - step: 29425 loss: 2.7485 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.7819 global_avg_top_loss: 1.9666 +[titan] 2025-09-09 23:22:10,684 - root - INFO - lr: 4.9856e-06 gnorm: 0.39 [2 days, 5:46:41<19:19:38] +[titan] 2025-09-09 23:22:42,613 - root - INFO - step: 29430 loss: 2.7111 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.12 mfu: 49.46% global_avg_ntp_loss: 0.7616 global_avg_top_loss: 1.9495 +[titan] 
2025-09-09 23:22:42,614 - root - INFO - lr: 4.9830e-06 gnorm: 0.39 [2 days, 5:47:13<19:19:04] +[titan] 2025-09-09 23:23:14,555 - root - INFO - step: 29435 loss: 2.7984 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.8023 global_avg_top_loss: 1.9961 +[titan] 2025-09-09 23:23:14,556 - root - INFO - lr: 4.9803e-06 gnorm: 0.44 [2 days, 5:47:45<19:18:31] +[titan] 2025-09-09 23:23:46,631 - root - INFO - step: 29440 loss: 2.7336 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.89 mfu: 49.23% global_avg_ntp_loss: 0.7707 global_avg_top_loss: 1.9629 +[titan] 2025-09-09 23:23:46,632 - root - INFO - lr: 4.9777e-06 gnorm: 0.39 [2 days, 5:48:17<19:17:58] +[titan] 2025-09-09 23:24:18,631 - root - INFO - step: 29445 loss: 2.6786 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.06 mfu: 49.35% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9302 +[titan] 2025-09-09 23:24:18,631 - root - INFO - lr: 4.9750e-06 gnorm: 0.39 [2 days, 5:48:49<19:17:25] +[titan] 2025-09-09 23:24:44,351 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:24:50,723 - root - INFO - step: 29450 loss: 2.6512 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7335 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:24:50,723 - root - INFO - lr: 4.9724e-06 gnorm: 0.53 [2 days, 5:49:21<19:16:52] +[titan] 2025-09-09 23:25:22,863 - root - INFO - step: 29455 loss: 2.6324 memory: 122.04GiB(87.57%) tps: 10,196 tflops: 485.91 mfu: 49.13% global_avg_ntp_loss: 0.7294 global_avg_top_loss: 1.9030 +[titan] 2025-09-09 23:25:22,864 - root - INFO - lr: 4.9697e-06 gnorm: 0.38 [2 days, 5:49:53<19:16:18] +[titan] 2025-09-09 23:25:54,872 - root - INFO - step: 29460 loss: 2.6107 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7171 global_avg_top_loss: 1.8937 +[titan] 2025-09-09 23:25:54,873 - root - INFO - lr: 4.9671e-06 gnorm: 0.50 [2 days, 5:50:25<19:15:45] +[titan] 2025-09-09 23:26:26,822 - root - INFO - step: 29465 loss: 2.7100 memory: 122.04GiB(87.57%) tps: 10,257 tflops: 488.82 mfu: 49.43% global_avg_ntp_loss: 0.7655 global_avg_top_loss: 1.9444 +[titan] 2025-09-09 23:26:26,822 - root - INFO - lr: 4.9644e-06 gnorm: 0.39 [2 days, 5:50:57<19:15:12] +[titan] 2025-09-09 23:26:59,115 - root - INFO - step: 29470 loss: 2.8733 memory: 122.04GiB(87.57%) tps: 10,147 tflops: 483.61 mfu: 48.90% global_avg_ntp_loss: 0.8526 global_avg_top_loss: 2.0207 +[titan] 2025-09-09 23:26:59,116 - root - INFO - lr: 4.9618e-06 gnorm: 0.40 [2 days, 5:51:29<19:14:39] +[titan] 2025-09-09 23:27:31,112 - root - INFO - step: 29475 loss: 2.6146 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.09 mfu: 49.35% global_avg_ntp_loss: 0.7173 global_avg_top_loss: 1.8973 +[titan] 2025-09-09 23:27:31,113 - root - INFO - lr: 4.9591e-06 gnorm: 0.39 [2 days, 5:52:01<19:14:06] +[titan] 2025-09-09 23:28:03,055 - root - INFO - step: 29480 loss: 2.7077 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.93 mfu: 49.44% global_avg_ntp_loss: 0.7603 global_avg_top_loss: 1.9475 +[titan] 
2025-09-09 23:28:03,055 - root - INFO - lr: 4.9565e-06 gnorm: 0.41 [2 days, 5:52:33<19:13:32] +[titan] 2025-09-09 23:28:34,949 - root - INFO - step: 29485 loss: 2.6600 memory: 122.04GiB(87.57%) tps: 10,274 tflops: 489.66 mfu: 49.51% global_avg_ntp_loss: 0.7423 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:28:34,949 - root - INFO - lr: 4.9538e-06 gnorm: 0.39 [2 days, 5:53:05<19:12:59] +[titan] 2025-09-09 23:29:06,990 - root - INFO - step: 29490 loss: 2.7052 memory: 122.04GiB(87.57%) tps: 10,227 tflops: 487.43 mfu: 49.28% global_avg_ntp_loss: 0.7591 global_avg_top_loss: 1.9461 +[titan] 2025-09-09 23:29:06,990 - root - INFO - lr: 4.9512e-06 gnorm: 0.43 [2 days, 5:53:37<19:12:26] +[titan] 2025-09-09 23:29:39,008 - root - INFO - step: 29495 loss: 2.6494 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.7352 global_avg_top_loss: 1.9142 +[titan] 2025-09-09 23:29:39,009 - root - INFO - lr: 4.9486e-06 gnorm: 0.38 [2 days, 5:54:09<19:11:53] +[titan] 2025-09-09 23:30:04,501 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:30:10,900 - root - INFO - step: 29500 loss: 2.6307 memory: 122.04GiB(87.57%) tps: 10,275 tflops: 489.70 mfu: 49.51% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9014 +[titan] 2025-09-09 23:30:10,901 - root - INFO - lr: 4.9459e-06 gnorm: 0.37 [2 days, 5:54:41<19:11:19] +[titan] 2025-09-09 23:30:42,669 - root - INFO - step: 29505 loss: 2.6899 memory: 122.04GiB(87.57%) tps: 10,315 tflops: 491.60 mfu: 49.71% global_avg_ntp_loss: 0.7533 global_avg_top_loss: 1.9365 +[titan] 2025-09-09 23:30:42,670 - root - INFO - lr: 4.9433e-06 gnorm: 0.40 [2 days, 5:55:13<19:10:46] +[titan] 2025-09-09 23:31:14,714 - root - INFO - step: 29510 loss: 2.7504 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7848 global_avg_top_loss: 1.9656 +[titan] 2025-09-09 23:31:14,714 - root - INFO - lr: 4.9406e-06 gnorm: 0.38 [2 days, 5:55:45<19:10:13] +[titan] 2025-09-09 23:31:46,701 - root - INFO - step: 29515 loss: 2.6921 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7494 global_avg_top_loss: 1.9427 +[titan] 2025-09-09 23:31:46,702 - root - INFO - lr: 4.9380e-06 gnorm: 0.39 [2 days, 5:56:17<19:09:40] +[titan] 2025-09-09 23:32:18,822 - root - INFO - step: 29520 loss: 2.6763 memory: 122.04GiB(87.57%) tps: 10,202 tflops: 486.20 mfu: 49.16% global_avg_ntp_loss: 0.7621 global_avg_top_loss: 1.9143 +[titan] 2025-09-09 23:32:18,823 - root - INFO - lr: 4.9354e-06 gnorm: 0.39 [2 days, 5:56:49<19:09:07] +[titan] 2025-09-09 23:32:50,838 - root - INFO - step: 29525 loss: 3.0576 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.81 mfu: 49.32% global_avg_ntp_loss: 0.9634 global_avg_top_loss: 2.0942 +[titan] 2025-09-09 23:32:50,838 - root - INFO - lr: 4.9327e-06 gnorm: 0.39 [2 days, 5:57:21<19:08:33] +[titan] 2025-09-09 23:33:22,842 - root - INFO - step: 29530 loss: 2.6854 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7522 global_avg_top_loss: 1.9332 +[titan] 
2025-09-09 23:33:22,842 - root - INFO - lr: 4.9301e-06 gnorm: 0.38 [2 days, 5:57:53<19:08:00] +[titan] 2025-09-09 23:33:54,772 - root - INFO - step: 29535 loss: 2.6469 memory: 122.04GiB(87.57%) tps: 10,263 tflops: 489.11 mfu: 49.46% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9177 +[titan] 2025-09-09 23:33:54,772 - root - INFO - lr: 4.9274e-06 gnorm: 0.39 [2 days, 5:58:25<19:07:27] +[titan] 2025-09-09 23:34:26,818 - root - INFO - step: 29540 loss: 2.6060 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.34 mfu: 49.28% global_avg_ntp_loss: 0.7195 global_avg_top_loss: 1.8864 +[titan] 2025-09-09 23:34:26,818 - root - INFO - lr: 4.9248e-06 gnorm: 0.47 [2 days, 5:58:57<19:06:54] +[titan] 2025-09-09 23:34:58,816 - root - INFO - step: 29545 loss: 2.6410 memory: 122.04GiB(87.57%) tps: 10,241 tflops: 488.07 mfu: 49.35% global_avg_ntp_loss: 0.7293 global_avg_top_loss: 1.9117 +[titan] 2025-09-09 23:34:58,817 - root - INFO - lr: 4.9222e-06 gnorm: 0.38 [2 days, 5:59:29<19:06:20] +[titan] 2025-09-09 23:35:24,538 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:35:30,941 - root - INFO - step: 29550 loss: 2.7671 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7879 global_avg_top_loss: 1.9792 +[titan] 2025-09-09 23:35:30,941 - root - INFO - lr: 4.9195e-06 gnorm: 0.45 [2 days, 6:00:01<19:05:47] +[titan] 2025-09-09 23:36:03,162 - root - INFO - step: 29555 loss: 2.6948 memory: 122.04GiB(87.57%) tps: 10,170 tflops: 484.68 mfu: 49.01% global_avg_ntp_loss: 0.7547 global_avg_top_loss: 1.9401 +[titan] 2025-09-09 23:36:03,163 - root - INFO - lr: 4.9169e-06 gnorm: 0.40 [2 days, 6:00:33<19:05:14] +[titan] 2025-09-09 23:36:35,000 - root - INFO - step: 29560 loss: 2.7432 memory: 122.04GiB(87.57%) tps: 10,293 tflops: 490.54 mfu: 49.60% global_avg_ntp_loss: 0.7774 global_avg_top_loss: 1.9657 +[titan] 2025-09-09 23:36:35,000 - root - INFO - lr: 4.9143e-06 gnorm: 0.39 [2 days, 6:01:05<19:04:41] +[titan] 2025-09-09 23:37:06,856 - root - INFO - step: 29565 loss: 2.7192 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.7649 global_avg_top_loss: 1.9543 +[titan] 2025-09-09 23:37:06,856 - root - INFO - lr: 4.9117e-06 gnorm: 0.39 [2 days, 6:01:37<19:04:08] +[titan] 2025-09-09 23:37:38,968 - root - INFO - step: 29570 loss: 2.6684 memory: 122.04GiB(87.57%) tps: 10,205 tflops: 486.34 mfu: 49.18% global_avg_ntp_loss: 0.7420 global_avg_top_loss: 1.9264 +[titan] 2025-09-09 23:37:38,968 - root - INFO - lr: 4.9090e-06 gnorm: 0.38 [2 days, 6:02:09<19:03:35] +[titan] 2025-09-09 23:38:10,969 - root - INFO - step: 29575 loss: 2.6341 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.02 mfu: 49.35% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9081 +[titan] 2025-09-09 23:38:10,970 - root - INFO - lr: 4.9064e-06 gnorm: 0.38 [2 days, 6:02:41<19:03:01] +[titan] 2025-09-09 23:38:43,179 - root - INFO - step: 29580 loss: 2.5590 memory: 122.04GiB(87.57%) tps: 10,174 tflops: 484.86 mfu: 49.03% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8656 +[titan] 
2025-09-09 23:38:43,180 - root - INFO - lr: 4.9038e-06 gnorm: 0.40 [2 days, 6:03:13<19:02:28] +[titan] 2025-09-09 23:39:15,183 - root - INFO - step: 29585 loss: 2.7235 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.99 mfu: 49.34% global_avg_ntp_loss: 0.7690 global_avg_top_loss: 1.9545 +[titan] 2025-09-09 23:39:15,184 - root - INFO - lr: 4.9011e-06 gnorm: 0.39 [2 days, 6:03:46<19:01:55] +[titan] 2025-09-09 23:39:47,246 - root - INFO - step: 29590 loss: 2.8727 memory: 122.04GiB(87.57%) tps: 10,220 tflops: 487.09 mfu: 49.25% global_avg_ntp_loss: 0.8667 global_avg_top_loss: 2.0060 +[titan] 2025-09-09 23:39:47,246 - root - INFO - lr: 4.8985e-06 gnorm: 0.39 [2 days, 6:04:18<19:01:22] +[titan] 2025-09-09 23:40:19,391 - root - INFO - step: 29595 loss: 2.6904 memory: 122.04GiB(87.57%) tps: 10,194 tflops: 485.84 mfu: 49.12% global_avg_ntp_loss: 0.7493 global_avg_top_loss: 1.9411 +[titan] 2025-09-09 23:40:19,391 - root - INFO - lr: 4.8959e-06 gnorm: 0.39 [2 days, 6:04:50<19:00:49] +[titan] 2025-09-09 23:40:44,958 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:40:51,412 - root - INFO - step: 29600 loss: 2.6792 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.73 mfu: 49.32% global_avg_ntp_loss: 0.7478 global_avg_top_loss: 1.9314 +[titan] 2025-09-09 23:40:51,412 - root - INFO - lr: 4.8933e-06 gnorm: 0.39 [2 days, 6:05:22<19:00:15] +[titan] 2025-09-09 23:41:23,486 - root - INFO - step: 29605 loss: 2.6825 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.92 mfu: 49.23% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9341 +[titan] 2025-09-09 23:41:23,486 - root - INFO - lr: 4.8907e-06 gnorm: 0.40 [2 days, 6:05:54<18:59:42] +[titan] 2025-09-09 23:41:55,559 - root - INFO - step: 29610 loss: 2.6834 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.94 mfu: 49.24% global_avg_ntp_loss: 0.7484 global_avg_top_loss: 1.9349 +[titan] 2025-09-09 23:41:55,559 - root - INFO - lr: 4.8880e-06 gnorm: 0.41 [2 days, 6:06:26<18:59:09] +[titan] 2025-09-09 23:42:27,592 - root - INFO - step: 29615 loss: 2.7094 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.53 mfu: 49.30% global_avg_ntp_loss: 0.7609 global_avg_top_loss: 1.9485 +[titan] 2025-09-09 23:42:27,593 - root - INFO - lr: 4.8854e-06 gnorm: 0.39 [2 days, 6:06:58<18:58:36] +[titan] 2025-09-09 23:42:59,640 - root - INFO - step: 29620 loss: 3.0617 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.32 mfu: 49.27% global_avg_ntp_loss: 0.9728 global_avg_top_loss: 2.0889 +[titan] 2025-09-09 23:42:59,641 - root - INFO - lr: 4.8828e-06 gnorm: 0.39 [2 days, 6:07:30<18:58:03] +[titan] 2025-09-09 23:43:31,486 - root - INFO - step: 29625 loss: 2.8192 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.41 mfu: 49.59% global_avg_ntp_loss: 0.8222 global_avg_top_loss: 1.9970 +[titan] 2025-09-09 23:43:31,487 - root - INFO - lr: 4.8802e-06 gnorm: 0.39 [2 days, 6:08:02<18:57:29] +[titan] 2025-09-09 23:44:03,370 - root - INFO - step: 29630 loss: 2.6848 memory: 122.04GiB(87.57%) tps: 10,277 tflops: 489.82 mfu: 49.53% global_avg_ntp_loss: 0.7519 global_avg_top_loss: 1.9330 +[titan] 
2025-09-09 23:44:03,371 - root - INFO - lr: 4.8776e-06 gnorm: 0.38 [2 days, 6:08:34<18:56:56] +[titan] 2025-09-09 23:44:35,405 - root - INFO - step: 29635 loss: 2.6599 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.52 mfu: 49.29% global_avg_ntp_loss: 0.7397 global_avg_top_loss: 1.9202 +[titan] 2025-09-09 23:44:35,406 - root - INFO - lr: 4.8749e-06 gnorm: 0.38 [2 days, 6:09:06<18:56:23] +[titan] 2025-09-09 23:45:07,457 - root - INFO - step: 29640 loss: 2.7675 memory: 122.04GiB(87.57%) tps: 10,224 tflops: 487.26 mfu: 49.27% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9794 +[titan] 2025-09-09 23:45:07,457 - root - INFO - lr: 4.8723e-06 gnorm: 0.39 [2 days, 6:09:38<18:55:50] +[titan] 2025-09-09 23:45:39,587 - root - INFO - step: 29645 loss: 2.6829 memory: 122.04GiB(87.57%) tps: 10,199 tflops: 486.07 mfu: 49.15% global_avg_ntp_loss: 0.7513 global_avg_top_loss: 1.9316 +[titan] 2025-09-09 23:45:39,588 - root - INFO - lr: 4.8697e-06 gnorm: 0.38 [2 days, 6:10:10<18:55:17] +[titan] 2025-09-09 23:46:05,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:46:11,475 - root - INFO - step: 29650 loss: 2.6912 memory: 122.04GiB(87.57%) tps: 10,276 tflops: 489.76 mfu: 49.52% global_avg_ntp_loss: 0.7516 global_avg_top_loss: 1.9395 +[titan] 2025-09-09 23:46:11,476 - root - INFO - lr: 4.8671e-06 gnorm: 0.64 [2 days, 6:10:42<18:54:43] +[titan] 2025-09-09 23:46:43,410 - root - INFO - step: 29655 loss: 2.6293 memory: 122.04GiB(87.57%) tps: 10,261 tflops: 489.04 mfu: 49.45% global_avg_ntp_loss: 0.7226 global_avg_top_loss: 1.9067 +[titan] 2025-09-09 23:46:43,411 - root - INFO - lr: 4.8645e-06 gnorm: 0.38 [2 days, 6:11:14<18:54:10] +[titan] 2025-09-09 23:47:15,405 - root - INFO - step: 29660 loss: 2.7703 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.12 mfu: 49.35% global_avg_ntp_loss: 0.7866 global_avg_top_loss: 1.9838 +[titan] 2025-09-09 23:47:15,406 - root - INFO - lr: 4.8619e-06 gnorm: 0.39 [2 days, 6:11:46<18:53:37] +[titan] 2025-09-09 23:47:47,232 - root - INFO - step: 29665 loss: 2.7646 memory: 122.04GiB(87.57%) tps: 10,296 tflops: 490.70 mfu: 49.62% global_avg_ntp_loss: 0.7845 global_avg_top_loss: 1.9800 +[titan] 2025-09-09 23:47:47,233 - root - INFO - lr: 4.8593e-06 gnorm: 0.38 [2 days, 6:12:18<18:53:04] +[titan] 2025-09-09 23:48:19,241 - root - INFO - step: 29670 loss: 2.6916 memory: 122.04GiB(87.57%) tps: 10,237 tflops: 487.91 mfu: 49.33% global_avg_ntp_loss: 0.7538 global_avg_top_loss: 1.9378 +[titan] 2025-09-09 23:48:19,242 - root - INFO - lr: 4.8567e-06 gnorm: 0.38 [2 days, 6:12:50<18:52:31] +[titan] 2025-09-09 23:48:51,378 - root - INFO - step: 29675 loss: 2.7000 memory: 122.04GiB(87.57%) tps: 10,197 tflops: 485.97 mfu: 49.14% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9456 +[titan] 2025-09-09 23:48:51,378 - root - INFO - lr: 4.8541e-06 gnorm: 0.38 [2 days, 6:13:22<18:51:57] +[titan] 2025-09-09 23:49:23,199 - root - INFO - step: 29680 loss: 2.7927 memory: 122.04GiB(87.57%) tps: 10,298 tflops: 490.79 mfu: 49.62% global_avg_ntp_loss: 0.7960 global_avg_top_loss: 1.9967 +[titan] 
2025-09-09 23:49:23,199 - root - INFO - lr: 4.8514e-06 gnorm: 0.40 [2 days, 6:13:53<18:51:24] +[titan] 2025-09-09 23:49:55,237 - root - INFO - step: 29685 loss: 2.7399 memory: 122.04GiB(87.57%) tps: 10,228 tflops: 487.47 mfu: 49.29% global_avg_ntp_loss: 0.7752 global_avg_top_loss: 1.9647 +[titan] 2025-09-09 23:49:55,237 - root - INFO - lr: 4.8488e-06 gnorm: 0.39 [2 days, 6:14:26<18:50:51] +[titan] 2025-09-09 23:50:27,228 - root - INFO - step: 29690 loss: 2.7406 memory: 122.04GiB(87.57%) tps: 10,243 tflops: 488.18 mfu: 49.36% global_avg_ntp_loss: 0.7764 global_avg_top_loss: 1.9642 +[titan] 2025-09-09 23:50:27,228 - root - INFO - lr: 4.8462e-06 gnorm: 0.39 [2 days, 6:14:58<18:50:18] +[titan] 2025-09-09 23:50:59,184 - root - INFO - step: 29695 loss: 2.6932 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7544 global_avg_top_loss: 1.9388 +[titan] 2025-09-09 23:50:59,185 - root - INFO - lr: 4.8436e-06 gnorm: 0.40 [2 days, 6:15:29<18:49:44] +[titan] 2025-09-09 23:51:05,815 - root - INFO - Dumping profiler traces at step 29696 +[titan] 2025-09-09 23:51:05,887 - root - INFO - Finished dumping profiler traces in 0.07 seconds +[titan] 2025-09-09 23:51:24,941 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:51:31,380 - root - INFO - step: 29700 loss: 2.7486 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7782 global_avg_top_loss: 1.9704 +[titan] 2025-09-09 23:51:31,381 - root - INFO - lr: 4.8410e-06 gnorm: 0.39 [2 days, 6:16:02<18:49:11] +[titan] 2025-09-09 23:52:03,289 - root - INFO - step: 29705 loss: 2.6199 memory: 122.04GiB(87.57%) tps: 10,269 tflops: 489.44 mfu: 49.49% global_avg_ntp_loss: 0.7254 global_avg_top_loss: 1.8945 +[titan] 2025-09-09 23:52:03,290 - root - INFO - lr: 4.8384e-06 gnorm: 0.39 [2 days, 6:16:34<18:48:38] +[titan] 2025-09-09 23:52:35,048 - root - INFO - step: 29710 loss: 2.6929 memory: 122.04GiB(87.57%) tps: 10,318 tflops: 491.75 mfu: 49.72% global_avg_ntp_loss: 0.7557 global_avg_top_loss: 1.9372 +[titan] 2025-09-09 23:52:35,048 - root - INFO - lr: 4.8358e-06 gnorm: 0.40 [2 days, 6:17:05<18:48:05] +[titan] 2025-09-09 23:53:06,855 - root - INFO - step: 29715 loss: 2.6783 memory: 122.04GiB(87.57%) tps: 10,302 tflops: 491.00 mfu: 49.65% global_avg_ntp_loss: 0.7491 global_avg_top_loss: 1.9293 +[titan] 2025-09-09 23:53:06,856 - root - INFO - lr: 4.8332e-06 gnorm: 0.38 [2 days, 6:17:37<18:47:32] +[titan] 2025-09-09 23:53:38,701 - root - INFO - step: 29720 loss: 2.7316 memory: 122.04GiB(87.57%) tps: 10,290 tflops: 490.40 mfu: 49.59% global_avg_ntp_loss: 0.7712 global_avg_top_loss: 1.9604 +[titan] 2025-09-09 23:53:38,702 - root - INFO - lr: 4.8306e-06 gnorm: 0.39 [2 days, 6:18:09<18:46:58] +[titan] 2025-09-09 23:54:10,529 - root - INFO - step: 29725 loss: 2.6654 memory: 122.04GiB(87.57%) tps: 10,296 tflops: 490.69 mfu: 49.61% global_avg_ntp_loss: 0.7409 global_avg_top_loss: 1.9245 +[titan] 2025-09-09 23:54:10,529 - root - INFO - lr: 4.8280e-06 gnorm: 0.39 [2 days, 6:18:41<18:46:25] +[titan] 2025-09-09 23:54:42,620 - root - INFO - step: 29730 loss: 2.6882 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.66 mfu: 49.21% global_avg_ntp_loss: 0.7527 global_avg_top_loss: 1.9356 +[titan] 
2025-09-09 23:54:42,620 - root - INFO - lr: 4.8254e-06 gnorm: 0.38 [2 days, 6:19:13<18:45:52] +[titan] 2025-09-09 23:55:14,622 - root - INFO - step: 29735 loss: 3.1121 memory: 122.04GiB(87.57%) tps: 10,240 tflops: 488.01 mfu: 49.34% global_avg_ntp_loss: 1.0012 global_avg_top_loss: 2.1108 +[titan] 2025-09-09 23:55:14,623 - root - INFO - lr: 4.8228e-06 gnorm: 0.41 [2 days, 6:19:45<18:45:19] +[titan] 2025-09-09 23:55:46,563 - root - INFO - step: 29740 loss: 2.7354 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.95 mfu: 49.44% global_avg_ntp_loss: 0.7733 global_avg_top_loss: 1.9620 +[titan] 2025-09-09 23:55:46,563 - root - INFO - lr: 4.8202e-06 gnorm: 0.39 [2 days, 6:20:17<18:44:46] +[titan] 2025-09-09 23:56:18,419 - root - INFO - step: 29745 loss: 2.8026 memory: 122.04GiB(87.57%) tps: 10,287 tflops: 490.25 mfu: 49.57% global_avg_ntp_loss: 0.8278 global_avg_top_loss: 1.9747 +[titan] 2025-09-09 23:56:18,419 - root - INFO - lr: 4.8176e-06 gnorm: 0.38 [2 days, 6:20:49<18:44:12] +[titan] 2025-09-09 23:56:44,039 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-09 23:56:50,341 - root - INFO - step: 29750 loss: 2.5971 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.24 mfu: 49.47% global_avg_ntp_loss: 0.7216 global_avg_top_loss: 1.8755 +[titan] 2025-09-09 23:56:50,341 - root - INFO - lr: 4.8150e-06 gnorm: 1.13 [2 days, 6:21:21<18:43:39] +[titan] 2025-09-09 23:57:22,521 - root - INFO - step: 29755 loss: 3.0763 memory: 122.04GiB(87.57%) tps: 10,183 tflops: 485.31 mfu: 49.07% global_avg_ntp_loss: 0.9800 global_avg_top_loss: 2.0963 +[titan] 2025-09-09 23:57:22,521 - root - INFO - lr: 4.8124e-06 gnorm: 0.37 [2 days, 6:21:53<18:43:06] +[titan] 2025-09-09 23:57:54,378 - root - INFO - step: 29760 loss: 2.6953 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.23 mfu: 49.57% global_avg_ntp_loss: 0.7549 global_avg_top_loss: 1.9404 +[titan] 2025-09-09 23:57:54,378 - root - INFO - lr: 4.8099e-06 gnorm: 0.43 [2 days, 6:22:25<18:42:33] +[titan] 2025-09-09 23:58:26,318 - root - INFO - step: 29765 loss: 2.6468 memory: 122.04GiB(87.57%) tps: 10,259 tflops: 488.96 mfu: 49.44% global_avg_ntp_loss: 0.7329 global_avg_top_loss: 1.9139 +[titan] 2025-09-09 23:58:26,318 - root - INFO - lr: 4.8073e-06 gnorm: 0.43 [2 days, 6:22:57<18:41:59] +[titan] 2025-09-09 23:58:58,421 - root - INFO - step: 29770 loss: 2.6920 memory: 122.04GiB(87.57%) tps: 10,207 tflops: 486.48 mfu: 49.19% global_avg_ntp_loss: 0.7535 global_avg_top_loss: 1.9385 +[titan] 2025-09-09 23:58:58,422 - root - INFO - lr: 4.8047e-06 gnorm: 0.40 [2 days, 6:23:29<18:41:26] +[titan] 2025-09-09 23:59:30,710 - root - INFO - step: 29775 loss: 2.6059 memory: 122.04GiB(87.57%) tps: 10,149 tflops: 483.69 mfu: 48.91% global_avg_ntp_loss: 0.7144 global_avg_top_loss: 1.8915 +[titan] 2025-09-09 23:59:30,710 - root - INFO - lr: 4.8021e-06 gnorm: 0.37 [2 days, 6:24:01<18:40:53] +[titan] 2025-09-10 00:00:02,498 - root - INFO - step: 29780 loss: 2.6640 memory: 122.04GiB(87.57%) tps: 10,309 tflops: 491.30 mfu: 49.68% global_avg_ntp_loss: 0.7417 global_avg_top_loss: 1.9223 +[titan] 
2025-09-10 00:00:02,498 - root - INFO - lr: 4.7995e-06 gnorm: 0.40 [2 days, 6:24:33<18:40:20] +[titan] 2025-09-10 00:00:34,400 - root - INFO - step: 29785 loss: 2.5482 memory: 122.04GiB(87.57%) tps: 10,272 tflops: 489.54 mfu: 49.50% global_avg_ntp_loss: 0.6895 global_avg_top_loss: 1.8587 +[titan] 2025-09-10 00:00:34,401 - root - INFO - lr: 4.7969e-06 gnorm: 0.37 [2 days, 6:25:05<18:39:47] +[titan] 2025-09-10 00:01:06,418 - root - INFO - step: 29790 loss: 2.7028 memory: 122.04GiB(87.57%) tps: 10,235 tflops: 487.78 mfu: 49.32% global_avg_ntp_loss: 0.7597 global_avg_top_loss: 1.9431 +[titan] 2025-09-10 00:01:06,418 - root - INFO - lr: 4.7943e-06 gnorm: 0.39 [2 days, 6:25:37<18:39:14] +[titan] 2025-09-10 00:01:38,374 - root - INFO - step: 29795 loss: 2.6177 memory: 122.04GiB(87.57%) tps: 10,254 tflops: 488.72 mfu: 49.42% global_avg_ntp_loss: 0.7188 global_avg_top_loss: 1.8989 +[titan] 2025-09-10 00:01:38,374 - root - INFO - lr: 4.7917e-06 gnorm: 0.39 [2 days, 6:26:09<18:38:40] +[titan] 2025-09-10 00:02:03,862 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:02:10,277 - root - INFO - step: 29800 loss: 2.6450 memory: 122.04GiB(87.57%) tps: 10,271 tflops: 489.53 mfu: 49.50% global_avg_ntp_loss: 0.7344 global_avg_top_loss: 1.9106 +[titan] 2025-09-10 00:02:10,277 - root - INFO - lr: 4.7892e-06 gnorm: 0.39 [2 days, 6:26:41<18:38:07] +[titan] 2025-09-10 00:02:42,328 - root - INFO - step: 29805 loss: 2.8560 memory: 122.04GiB(87.57%) tps: 10,225 tflops: 487.33 mfu: 49.28% global_avg_ntp_loss: 0.8431 global_avg_top_loss: 2.0129 +[titan] 2025-09-10 00:02:42,329 - root - INFO - lr: 4.7866e-06 gnorm: 0.38 [2 days, 6:27:13<18:37:34] +[titan] 2025-09-10 00:03:14,336 - root - INFO - step: 29810 loss: 2.6454 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7318 global_avg_top_loss: 1.9136 +[titan] 2025-09-10 00:03:14,336 - root - INFO - lr: 4.7840e-06 gnorm: 0.38 [2 days, 6:27:45<18:37:01] +[titan] 2025-09-10 00:03:46,354 - root - INFO - step: 29815 loss: 2.5502 memory: 122.04GiB(87.57%) tps: 10,234 tflops: 487.76 mfu: 49.32% global_avg_ntp_loss: 0.6868 global_avg_top_loss: 1.8634 +[titan] 2025-09-10 00:03:46,355 - root - INFO - lr: 4.7814e-06 gnorm: 0.38 [2 days, 6:28:17<18:36:28] +[titan] 2025-09-10 00:04:18,238 - root - INFO - step: 29820 loss: 2.6542 memory: 122.04GiB(87.57%) tps: 10,278 tflops: 489.83 mfu: 49.53% global_avg_ntp_loss: 0.7341 global_avg_top_loss: 1.9202 +[titan] 2025-09-10 00:04:18,238 - root - INFO - lr: 4.7788e-06 gnorm: 0.41 [2 days, 6:28:49<18:35:54] +[titan] 2025-09-10 00:04:50,377 - root - INFO - step: 29825 loss: 2.5905 memory: 122.04GiB(87.57%) tps: 10,196 tflops: 485.92 mfu: 49.13% global_avg_ntp_loss: 0.7089 global_avg_top_loss: 1.8816 +[titan] 2025-09-10 00:04:50,378 - root - INFO - lr: 4.7762e-06 gnorm: 0.37 [2 days, 6:29:21<18:35:21] +[titan] 2025-09-10 00:05:22,383 - root - INFO - step: 29830 loss: 2.5534 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 487.96 mfu: 49.34% global_avg_ntp_loss: 0.6934 global_avg_top_loss: 1.8600 +[titan] 
2025-09-10 00:05:22,383 - root - INFO - lr: 4.7737e-06 gnorm: 0.39 [2 days, 6:29:53<18:34:48] +[titan] 2025-09-10 00:05:54,578 - root - INFO - step: 29835 loss: 2.6753 memory: 122.04GiB(87.57%) tps: 10,178 tflops: 485.08 mfu: 49.05% global_avg_ntp_loss: 0.7460 global_avg_top_loss: 1.9292 +[titan] 2025-09-10 00:05:54,579 - root - INFO - lr: 4.7711e-06 gnorm: 0.38 [2 days, 6:30:25<18:34:15] +[titan] 2025-09-10 00:06:26,566 - root - INFO - step: 29840 loss: 2.7302 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.24 mfu: 49.37% global_avg_ntp_loss: 0.7746 global_avg_top_loss: 1.9556 +[titan] 2025-09-10 00:06:26,566 - root - INFO - lr: 4.7685e-06 gnorm: 0.38 [2 days, 6:30:57<18:33:42] +[titan] 2025-09-10 00:06:58,803 - root - INFO - step: 29845 loss: 2.6537 memory: 122.04GiB(87.57%) tps: 10,165 tflops: 484.45 mfu: 48.98% global_avg_ntp_loss: 0.7356 global_avg_top_loss: 1.9181 +[titan] 2025-09-10 00:06:58,803 - root - INFO - lr: 4.7659e-06 gnorm: 0.38 [2 days, 6:31:29<18:33:09] +[titan] 2025-09-10 00:07:24,447 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:07:30,806 - root - INFO - step: 29850 loss: 2.7362 memory: 122.04GiB(87.57%) tps: 10,239 tflops: 488.00 mfu: 49.34% global_avg_ntp_loss: 0.7745 global_avg_top_loss: 1.9617 +[titan] 2025-09-10 00:07:30,806 - root - INFO - lr: 4.7634e-06 gnorm: 0.38 [2 days, 6:32:01<18:32:35] +[titan] 2025-09-10 00:08:02,841 - root - INFO - step: 29855 loss: 2.7087 memory: 122.04GiB(87.57%) tps: 10,229 tflops: 487.51 mfu: 49.29% global_avg_ntp_loss: 0.7605 global_avg_top_loss: 1.9483 +[titan] 2025-09-10 00:08:02,841 - root - INFO - lr: 4.7608e-06 gnorm: 0.39 [2 days, 6:32:33<18:32:02] +[titan] 2025-09-10 00:08:34,684 - root - INFO - step: 29860 loss: 2.6531 memory: 122.04GiB(87.57%) tps: 10,291 tflops: 490.46 mfu: 49.59% global_avg_ntp_loss: 0.7355 global_avg_top_loss: 1.9176 +[titan] 2025-09-10 00:08:34,684 - root - INFO - lr: 4.7582e-06 gnorm: 0.39 [2 days, 6:33:05<18:31:29] +[titan] 2025-09-10 00:09:06,601 - root - INFO - step: 29865 loss: 2.7168 memory: 122.04GiB(87.57%) tps: 10,267 tflops: 489.31 mfu: 49.48% global_avg_ntp_loss: 0.7646 global_avg_top_loss: 1.9522 +[titan] 2025-09-10 00:09:06,602 - root - INFO - lr: 4.7557e-06 gnorm: 0.39 [2 days, 6:33:37<18:30:56] +[titan] 2025-09-10 00:09:38,451 - root - INFO - step: 29870 loss: 2.7116 memory: 122.04GiB(87.57%) tps: 10,289 tflops: 490.35 mfu: 49.58% global_avg_ntp_loss: 0.7654 global_avg_top_loss: 1.9462 +[titan] 2025-09-10 00:09:38,452 - root - INFO - lr: 4.7531e-06 gnorm: 0.39 [2 days, 6:34:09<18:30:23] +[titan] 2025-09-10 00:10:10,458 - root - INFO - step: 29875 loss: 2.7238 memory: 122.04GiB(87.57%) tps: 10,238 tflops: 487.93 mfu: 49.34% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9559 +[titan] 2025-09-10 00:10:10,459 - root - INFO - lr: 4.7505e-06 gnorm: 0.40 [2 days, 6:34:41<18:29:49] +[titan] 2025-09-10 00:10:42,572 - root - INFO - step: 29880 loss: 2.6896 memory: 122.04GiB(87.57%) tps: 10,204 tflops: 486.33 mfu: 49.17% global_avg_ntp_loss: 0.7511 global_avg_top_loss: 1.9384 +[titan] 
2025-09-10 00:10:42,572 - root - INFO - lr: 4.7479e-06 gnorm: 0.40 [2 days, 6:35:13<18:29:16] +[titan] 2025-09-10 00:11:14,690 - root - INFO - step: 29885 loss: 3.1702 memory: 122.04GiB(87.57%) tps: 10,202 tflops: 486.24 mfu: 49.16% global_avg_ntp_loss: 1.0232 global_avg_top_loss: 2.1470 +[titan] 2025-09-10 00:11:14,691 - root - INFO - lr: 4.7454e-06 gnorm: 0.39 [2 days, 6:35:45<18:28:43] +[titan] 2025-09-10 00:11:46,939 - root - INFO - step: 29890 loss: 2.6405 memory: 122.04GiB(87.57%) tps: 10,161 tflops: 484.28 mfu: 48.97% global_avg_ntp_loss: 0.7311 global_avg_top_loss: 1.9094 +[titan] 2025-09-10 00:11:46,940 - root - INFO - lr: 4.7428e-06 gnorm: 0.38 [2 days, 6:36:17<18:28:10] +[titan] 2025-09-10 00:12:19,176 - root - INFO - step: 29895 loss: 2.7750 memory: 122.04GiB(87.57%) tps: 10,165 tflops: 484.46 mfu: 48.99% global_avg_ntp_loss: 0.7865 global_avg_top_loss: 1.9884 +[titan] 2025-09-10 00:12:19,177 - root - INFO - lr: 4.7402e-06 gnorm: 0.60 [2 days, 6:36:49<18:27:37] +[titan] 2025-09-10 00:12:44,693 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:12:51,251 - root - INFO - step: 29900 loss: 2.6360 memory: 122.04GiB(87.57%) tps: 10,216 tflops: 486.90 mfu: 49.23% global_avg_ntp_loss: 0.7259 global_avg_top_loss: 1.9101 +[titan] 2025-09-10 00:12:51,252 - root - INFO - lr: 4.7377e-06 gnorm: 0.40 [2 days, 6:37:22<18:27:04] +[titan] 2025-09-10 00:13:23,319 - root - INFO - step: 29905 loss: 2.7614 memory: 122.04GiB(87.57%) tps: 10,219 tflops: 487.01 mfu: 49.24% global_avg_ntp_loss: 0.7881 global_avg_top_loss: 1.9733 +[titan] 2025-09-10 00:13:23,320 - root - INFO - lr: 4.7351e-06 gnorm: 0.40 [2 days, 6:37:54<18:26:31] +[titan] 2025-09-10 00:13:55,301 - root - INFO - step: 29910 loss: 2.5872 memory: 122.04GiB(87.57%) tps: 10,246 tflops: 488.33 mfu: 49.38% global_avg_ntp_loss: 0.7106 global_avg_top_loss: 1.8765 +[titan] 2025-09-10 00:13:55,302 - root - INFO - lr: 4.7326e-06 gnorm: 0.38 [2 days, 6:38:26<18:25:57] +[titan] 2025-09-10 00:14:27,418 - root - INFO - step: 29915 loss: 2.9859 memory: 122.04GiB(87.57%) tps: 10,203 tflops: 486.26 mfu: 49.17% global_avg_ntp_loss: 0.9417 global_avg_top_loss: 2.0442 +[titan] 2025-09-10 00:14:27,419 - root - INFO - lr: 4.7300e-06 gnorm: 0.39 [2 days, 6:38:58<18:25:24] +[titan] 2025-09-10 00:14:59,292 - root - INFO - step: 29920 loss: 2.7048 memory: 122.04GiB(87.57%) tps: 10,281 tflops: 489.98 mfu: 49.54% global_avg_ntp_loss: 0.7620 global_avg_top_loss: 1.9428 +[titan] 2025-09-10 00:14:59,292 - root - INFO - lr: 4.7274e-06 gnorm: 0.38 [2 days, 6:39:30<18:24:51] +[titan] 2025-09-10 00:15:31,364 - root - INFO - step: 29925 loss: 2.6873 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.95 mfu: 49.24% global_avg_ntp_loss: 0.7512 global_avg_top_loss: 1.9362 +[titan] 2025-09-10 00:15:31,364 - root - INFO - lr: 4.7249e-06 gnorm: 0.42 [2 days, 6:40:02<18:24:18] +[titan] 2025-09-10 00:16:03,584 - root - INFO - step: 29930 loss: 2.6387 memory: 122.04GiB(87.57%) tps: 10,170 tflops: 484.71 mfu: 49.01% global_avg_ntp_loss: 0.7322 global_avg_top_loss: 1.9065 +[titan] 
2025-09-10 00:16:03,584 - root - INFO - lr: 4.7223e-06 gnorm: 0.39 [2 days, 6:40:34<18:23:45] +[titan] 2025-09-10 00:16:35,629 - root - INFO - step: 29935 loss: 2.6520 memory: 122.04GiB(87.57%) tps: 10,226 tflops: 487.36 mfu: 49.28% global_avg_ntp_loss: 0.7334 global_avg_top_loss: 1.9186 +[titan] 2025-09-10 00:16:35,629 - root - INFO - lr: 4.7198e-06 gnorm: 0.40 [2 days, 6:41:06<18:23:12] +[titan] 2025-09-10 00:17:07,617 - root - INFO - step: 29940 loss: 2.7165 memory: 122.04GiB(87.57%) tps: 10,244 tflops: 488.23 mfu: 49.37% global_avg_ntp_loss: 0.7679 global_avg_top_loss: 1.9486 +[titan] 2025-09-10 00:17:07,617 - root - INFO - lr: 4.7172e-06 gnorm: 0.39 [2 days, 6:41:38<18:22:38] +[titan] 2025-09-10 00:17:39,816 - root - INFO - step: 29945 loss: 2.7217 memory: 122.04GiB(87.57%) tps: 10,177 tflops: 485.03 mfu: 49.04% global_avg_ntp_loss: 0.7665 global_avg_top_loss: 1.9552 +[titan] 2025-09-10 00:17:39,816 - root - INFO - lr: 4.7146e-06 gnorm: 0.39 [2 days, 6:42:10<18:22:05] +[titan] 2025-09-10 00:18:05,464 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-09-10 00:18:11,908 - root - INFO - step: 29950 loss: 2.6150 memory: 122.04GiB(87.57%) tps: 10,211 tflops: 486.64 mfu: 49.21% global_avg_ntp_loss: 0.7222 global_avg_top_loss: 1.8928 +[titan] 2025-09-10 00:18:11,909 - root - INFO - lr: 4.7121e-06 gnorm: 0.38 [2 days, 6:42:42<18:21:32] +[titan] 2025-09-10 00:18:44,033 - root - INFO - step: 29955 loss: 2.7258 memory: 122.04GiB(87.57%) tps: 10,201 tflops: 486.16 mfu: 49.16% global_avg_ntp_loss: 0.7661 global_avg_top_loss: 1.9597 +[titan] 2025-09-10 00:18:44,033 - root - INFO - lr: 4.7095e-06 gnorm: 0.40 [2 days, 6:43:14<18:20:59] +[titan] 2025-09-10 00:19:16,106 - root - INFO - step: 29960 loss: 2.5972 memory: 122.04GiB(87.57%) tps: 10,217 tflops: 486.93 mfu: 49.24% global_avg_ntp_loss: 0.7099 global_avg_top_loss: 1.8873 +[titan] 2025-09-10 00:19:16,106 - root - INFO - lr: 4.7070e-06 gnorm: 0.39 [2 days, 6:43:46<18:20:26] +[titan] 2025-09-10 00:19:48,029 - root - INFO - step: 29965 loss: 3.8520 memory: 122.04GiB(87.57%) tps: 10,265 tflops: 489.22 mfu: 49.47% global_avg_ntp_loss: 1.3764 global_avg_top_loss: 2.4756 +[titan] 2025-09-10 00:19:48,029 - root - INFO - lr: 4.7044e-06 gnorm: 0.41 [2 days, 6:44:18<18:19:53] +[titan] 2025-09-10 00:20:20,187 - root - INFO - step: 29970 loss: 2.6966 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.64 mfu: 49.10% global_avg_ntp_loss: 0.7559 global_avg_top_loss: 1.9408 +[titan] 2025-09-10 00:20:20,187 - root - INFO - lr: 4.7019e-06 gnorm: 0.41 [2 days, 6:44:50<18:19:20] +[titan] 2025-09-10 00:20:52,285 - root - INFO - step: 29975 loss: 3.1491 memory: 122.04GiB(87.57%) tps: 10,209 tflops: 486.56 mfu: 49.20% global_avg_ntp_loss: 1.0164 global_avg_top_loss: 2.1327 +[titan] 2025-09-10 00:20:52,285 - root - INFO - lr: 4.6993e-06 gnorm: 0.38 [2 days, 6:45:23<18:18:46] +[titan] 2025-09-10 00:21:24,143 - root - INFO - step: 29980 loss: 2.7489 memory: 122.04GiB(87.57%) tps: 10,286 tflops: 490.22 mfu: 49.57% global_avg_ntp_loss: 0.7802 global_avg_top_loss: 1.9688 +[titan] 
2025-09-10 00:21:24,143 - root - INFO - lr: 4.6968e-06 gnorm: 0.38 [2 days, 6:45:54<18:18:13] +[titan] 2025-09-10 00:21:56,300 - root - INFO - step: 29985 loss: 2.6586 memory: 122.04GiB(87.57%) tps: 10,190 tflops: 485.66 mfu: 49.11% global_avg_ntp_loss: 0.7358 global_avg_top_loss: 1.9227 +[titan] 2025-09-10 00:21:56,301 - root - INFO - lr: 4.6942e-06 gnorm: 0.39 [2 days, 6:46:27<18:17:40] +[titan] 2025-09-10 00:22:28,294 - root - INFO - step: 29990 loss: 2.5988 memory: 122.04GiB(87.57%) tps: 10,242 tflops: 488.14 mfu: 49.36% global_avg_ntp_loss: 0.7105 global_avg_top_loss: 1.8884 +[titan] 2025-09-10 00:22:28,295 - root - INFO - lr: 4.6917e-06 gnorm: 0.39 [2 days, 6:46:59<18:17:07] +[titan] 2025-09-10 00:23:00,308 - root - INFO - step: 29995 loss: 2.5806 memory: 122.04GiB(87.57%) tps: 10,236 tflops: 487.84 mfu: 49.33% global_avg_ntp_loss: 0.7012 global_avg_top_loss: 1.8794 +[titan] 2025-09-10 00:23:00,308 - root - INFO - lr: 4.6891e-06 gnorm: 0.38 [2 days, 6:47:31<18:16:34] +[titan] 2025-09-10 00:23:25,917 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds. +[titan] 2025-09-10 00:23:32,341 - root - INFO - step: 30000 loss: 2.5194 memory: 122.04GiB(87.57%) tps: 10,230 tflops: 487.54 mfu: 49.30% global_avg_ntp_loss: 0.6778 global_avg_top_loss: 1.8416 +[titan] 2025-09-10 00:23:32,341 - root - INFO - lr: 4.6866e-06 gnorm: 0.39 [2 days, 6:48:03<18:16:01] +[titan] 2025-09-10 00:23:32,341 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-09-10 00:24:07,145 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-09-10 00:24:07,145 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 34.80 seconds. +[titan] 2025-09-10 00:24:07,145 - root - INFO - Ensuring repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 exists... 
+[titan] 2025-09-10 00:24:07,296 - root - INFO - Repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 ensured. +[titan] 2025-09-10 00:24:07,296 - root - INFO - Uploading exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint/step-30000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757/step-30000 on Hugging Face Hub... +Processing Files (8 / 9) : 100%|█████████▉| 83.2GB / 83.3GB, 114MB/s +New Data Upload : 100%|█████████▉| 83.2GB / 83.3GB, 114MB/s + ...ine/checkpoint/step-30000/.metadata: 100%|██████████| 2.47MB / 2.47MB + .../checkpoint/step-30000/__1_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__5_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__3_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__7_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__0_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__4_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__2_0.distcp: 100%|██████████| 10.4GB / 10.4GB + .../checkpoint/step-30000/__6_0.distcp: 99%|█████████▉| 10.4GB / 10.4GB diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf79d25031b268c8bf0ae12f6799c04bf1cbbba1 --- /dev/null +++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt @@ -0,0 +1,207 @@ +flame==0.1.0 +pluggy==1.6.0 +triton==3.2.0 +sympy==1.13.1 +wcwidth==0.2.13 
+nvidia-cusolver-cu12==11.6.1.9 +peft==0.17.0 +smart_open==7.3.0.post1 +cymem==2.0.11 +spacy-legacy==3.0.12 +h11==0.16.0 +pytablewriter==1.2.1 +idna==3.10 +regex==2025.7.34 +antlr4-python3-runtime==4.13.2 +wandb==0.21.0 +nvidia-cuda-cupti-cu12==12.4.127 +sentencepiece==0.2.1 +zstandard==0.23.0 +pybind11==3.0.0 +inquirerpy==0.3.4 +contourpy==1.3.3 +Pygments==2.19.2 +sniffio==1.3.1 +Jinja2==3.1.6 +packaging==25.0 +Markdown==3.8.2 +astunparse==1.6.3 +spacy==3.8.7 +pyparsing==3.2.3 +networkx==3.5 +ninja==1.11.1.4 +tf-slim==1.1.0 +PyYAML==6.0.2 +smmap==5.0.2 +tiktoken==0.9.0 +flatbuffers==25.2.10 +tensorflow==2.20.0 +langcodes==3.5.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +numexpr==2.11.0 +charset-normalizer==3.4.3 +frozenlist==1.7.0 +setuptools==80.9.0 +cycler==0.12.1 +weasel==0.4.1 +tzdata==2025.2 +sacrebleu==2.5.1 +rouge_score==0.1.2 +requests==2.32.5 +nvidia-nvjitlink-cu12==12.4.127 +grpcio==1.74.0 +nvidia-cusparse-cu12==12.3.1.170 +mdurl==0.1.2 +pandas==2.3.1 +preshed==3.0.10 +attrs==25.3.0 +tensorboard-data-server==0.7.2 +aiohappyeyeballs==2.6.1 +keras==3.11.2 +wrapt==1.17.3 +aiosignal==1.4.0 +tcolorpy==0.1.7 +platformdirs==4.3.8 +tqdm-multiprocess==0.0.11 +python-dotenv==1.1.1 +wasabi==1.1.3 +google-pasta==0.2.0 +optree==0.17.0 +MarkupSafe==3.0.2 +colorlog==6.9.0 +nvidia-cufft-cu12==11.2.1.3 +lm_eval==0.4.9.1 +lxml==6.0.0 +protobuf==6.32.0 +radgraph==0.1.18 +scipy==1.16.1 +click==8.2.1 +wheel==0.45.1 +marisa-trie==1.3.0 +pathvalidate==3.3.1 +nvidia-nccl-cu12==2.21.5 +evaluate==0.4.5 +nvidia-cuda-runtime-cu12==12.4.127 +transformers==4.51.3 +aenum==3.1.15 +typing-inspection==0.4.1 +gitdb==4.0.12 +iniconfig==2.1.0 +multidict==6.6.3 +huggingface-hub==0.34.4 +tokenizers==0.21.4 +tabledata==1.3.4 +mbstrdecoder==1.1.4 +Werkzeug==3.1.3 +accelerate==1.10.0 +hf-xet==1.1.8 +tensorboard==2.20.0 +ml_dtypes==0.5.3 +pytest==8.4.1 +namex==0.1.0 +pillow==11.3.0 +datasets==3.6.0 +tqdm==4.67.1 +murmurhash==1.0.13 +fonttools==4.59.1 +absl-py==2.3.1 +multiprocess==0.70.16 
+fsspec==2025.3.0 +transformers==4.51.3 +dill==0.3.8 +propcache==0.3.2 +jsonpickle==4.1.1 +BLEURT==0.0.2 +yarl==1.20.1 +portalocker==3.2.0 +httpx==0.27.2 +numpy==2.3.2 +mpmath==1.3.0 +pyarrow==21.0.0 +matplotlib==3.10.5 +typepy==1.3.4 +pycountry==24.6.1 +word2number==1.1 +psutil==7.0.0 +catalogue==2.0.10 +latex2sympy2_extended==1.0.6 +pydantic_core==2.33.2 +threadpoolctl==3.6.0 +spacy-loggers==1.0.5 +certifi==2025.8.3 +confection==0.1.5 +flame==0.1.0 +pfzy==0.3.4 +safetensors==0.6.2 +pip==25.1 +DataProperty==1.1.0 +lighteval==0.10.1.dev0 +jsonlines==4.0.0 +scikit-learn==1.7.1 +torch==2.6.0 +pytz==2025.2 +python-dateutil==2.9.0.post0 +nltk==3.9.1 +sqlitedict==2.1.0 +gast==0.6.0 +nvidia-curand-cu12==10.3.5.147 +rich==14.1.0 +sentry-sdk==2.33.2 +nvidia-cusparselt-cu12==0.6.2 +kiwisolver==1.4.9 +appdirs==1.4.4 +bert-score==0.3.13 +blis==1.3.0 +GitPython==3.1.45 +chardet==5.2.0 +more-itertools==10.7.0 +filelock==3.19.1 +transformers==4.51.3 +httpcore==1.0.9 +termcolor==3.1.0 +typer==0.16.1 +einops==0.8.1 +torchdata==0.11.0 +six==1.17.0 +colorama==0.4.6 +aiohttp==3.12.14 +srsly==2.5.1 +urllib3==2.5.0 +nvidia-cublas-cu12==12.4.5.8 +cloudpathlib==0.21.1 +h5py==3.14.0 +thinc==8.3.6 +markdown-it-py==4.0.0 +flash-attn==2.7.3 +prompt_toolkit==3.0.52 +nvidia-nvtx-cu12==12.4.127 +en_core_web_sm==3.8.0 +xxhash==3.5.0 +anyio==4.10.0 +joblib==1.5.1 +pydantic==2.11.7 +opt_einsum==3.4.0 +dotmap==1.3.30 +language_data==1.3.0 +shellingham==1.5.4 +nvidia-cudnn-cu12==9.1.0.70 +typing_extensions==4.14.1 +libclang==18.1.1 +tabulate==0.9.0 +annotated-types==0.7.0 +jaraco.context==5.3.0 +autocommand==2.2.2 +more-itertools==10.3.0 +tomli==2.0.1 +jaraco.functools==4.0.1 +zipp==3.19.2 +backports.tarfile==1.2.0 +wheel==0.45.1 +platformdirs==4.2.2 +inflect==7.3.1 +typing_extensions==4.12.2 +jaraco.text==3.12.1 +typeguard==4.3.0 +importlib_metadata==8.0.0 +packaging==24.2 +jaraco.collections==5.1.0 diff --git 
a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..050118e4b9f12bc5327706ed04d6d2d1cbd6ce67 --- /dev/null +++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2025-09-09T06:19:20.029854482Z","level":"INFO","msg":"stream: starting","core version":"0.21.0"} +{"time":"2025-09-09T06:19:20.338868384Z","level":"INFO","msg":"stream: created new stream","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"} +{"time":"2025-09-09T06:19:20.338942945Z","level":"INFO","msg":"stream: started","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"} +{"time":"2025-09-09T06:19:20.338955936Z","level":"INFO","msg":"handler: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"} +{"time":"2025-09-09T06:19:20.33900181Z","level":"INFO","msg":"writer: Do: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"} +{"time":"2025-09-09T06:19:20.339014387Z","level":"INFO","msg":"sender: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"} +{"time":"2025-09-09T16:55:51.461783187Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: 
request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-09-09T17:52:23.968650788Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2025-09-09T22:51:18.011409168Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"} +{"time":"2025-09-09T22:58:20.165767227Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"} diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b1fe6180502b5e0c9a480097879c081c907d671c --- /dev/null +++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log @@ -0,0 +1,21 @@ +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0 +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439 +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-09-09 06:19:19,818 INFO MainThread:795439 
[wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend +2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request +2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected +2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry +2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams. +2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed. 
+2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process diff --git a/torchtitan/components/__pycache__/dataloader.cpython-312.pyc b/torchtitan/components/__pycache__/dataloader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7c2ae5f7ce6a985bba4e4780597cf9971f42316 Binary files /dev/null and b/torchtitan/components/__pycache__/dataloader.cpython-312.pyc differ diff --git a/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc b/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86532b3bc4da661183c020f21ccd5f27ea079a04 Binary files /dev/null and b/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc differ diff --git a/torchtitan/components/__pycache__/metrics.cpython-312.pyc b/torchtitan/components/__pycache__/metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00c9b41b8f5e4e1e3d80eac861f354891c01494a Binary files /dev/null and b/torchtitan/components/__pycache__/metrics.cpython-312.pyc differ diff --git a/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc b/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e6ef655c5f8cc3fe62d17d3f86b66a7ea3c4ab5 Binary files /dev/null and b/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc differ diff --git a/torchtitan/components/metrics.py b/torchtitan/components/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..4c90ed54a4af3d644abb552615675a7af5f15910 --- /dev/null +++ b/torchtitan/components/metrics.py @@ -0,0 +1,435 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import time +from collections import namedtuple +from datetime import datetime +from typing import Any + +import torch +from torch.utils.tensorboard import SummaryWriter +from torchtitan.components.lr_scheduler import LRSchedulersContainer +from torchtitan.components.optimizer import OptimizersContainer +from torchtitan.config_manager import JobConfig +from torchtitan.distributed import ParallelDims +from torchtitan.tools import utils +from torchtitan.tools.logging import logger +from torchtitan.tools.utils import Color, device_module, device_type + +# named tuple for passing device memory stats for logging +DeviceMemStats = namedtuple( + "DeviceMemStats", + [ + "max_active_gib", + "max_active_pct", + "max_reserved_gib", + "max_reserved_pct", + "num_alloc_retries", + "num_ooms", + ], +) + + +class DeviceMemoryMonitor: + def __init__(self, device: str = f"{device_type}:0"): + self.device = torch.device(device) # device object + self.device_name = device_module.get_device_name(self.device) + self.device_index = device_module.current_device() + self.device_capacity = device_module.get_device_properties( + self.device + ).total_memory + self.device_capacity_gib = self._to_gib(self.device_capacity) + + device_module.reset_peak_memory_stats() + device_module.empty_cache() + + def _to_gib(self, memory_in_bytes): + # NOTE: GiB (gibibyte) is 1024, vs GB is 1000 + _gib_in_bytes = 1024 * 1024 * 1024 + memory_in_gib = memory_in_bytes / _gib_in_bytes + return memory_in_gib + + def _to_pct(self, memory): + return 100 * memory / self.device_capacity + + def get_peak_stats(self): + device_info = device_module.memory_stats(self.device) + + max_active = device_info.get("active_bytes.all.peak", -1) + max_active_gib = self._to_gib(max_active) + max_active_pct = self._to_pct(max_active) + + max_reserved = device_info.get("reserved_bytes.all.peak", -1) + max_reserved_gib = self._to_gib(max_reserved) + max_reserved_pct = self._to_pct(max_reserved) + + num_retries = 
device_info.get("num_alloc_retries", -1) + num_ooms = device_info.get("num_ooms", -1) + + if num_retries > 0: + logger.warning( + f"{num_retries} {device_type.upper()} memory allocation retries." + ) + if num_ooms > 0: + logger.warning(f"{num_ooms} {device_type.upper()} OOM errors thrown.") + + return DeviceMemStats( + max_active_gib, + max_active_pct, + max_reserved_gib, + max_reserved_pct, + num_retries, + num_ooms, + ) + + def reset_peak_stats(self): + device_module.reset_peak_memory_stats() + + +def build_device_memory_monitor(): + device_memory_monitor = DeviceMemoryMonitor(device_type) + logger.info( + f"{device_type.upper()} capacity: {device_memory_monitor.device_name} " + f"with {device_memory_monitor.device_capacity_gib:.2f}GiB memory" + ) + return device_memory_monitor + + +class BaseLogger: + """Logger that does nothing, used when logging is disabled.""" + + def log(self, metrics: dict[str, Any], step: int) -> None: + pass + + def close(self) -> None: + pass + + +class TensorBoardLogger(BaseLogger): + """Logger implementation for TensorBoard.""" + + def __init__(self, log_dir: str, tag: str | None = None): + self.tag = tag + self.writer = SummaryWriter(log_dir, max_queue=1000) + logger.info(f"TensorBoard logging enabled. 
Logs will be saved at {log_dir}") + + def log(self, metrics: dict[str, Any], step: int) -> None: + for k, v in metrics.items(): + tag = k if self.tag is None else f"{self.tag}/{k}" + self.writer.add_scalar(tag, v, step) + + def close(self) -> None: + self.writer.close() + + +class WandBLogger(BaseLogger): + """Logger implementation for Weights & Biases.""" + + def __init__(self, log_dir: str, tag: str | None = None): + # Import wandb here to avoid startup import + import wandb + + self.wandb = wandb + self.tag = tag + + # Create logging directory + os.makedirs(log_dir, exist_ok=True) + + self.wandb.init( + project=os.getenv("WANDB_PROJECT", "torchtitan"), + dir=log_dir, + ) + logger.info("WandB logging enabled") + + def log(self, metrics: dict[str, Any], step: int) -> None: + wandb_metrics = { + (k if self.tag is None else f"{self.tag}/{k}"): v + for k, v in metrics.items() + } + self.wandb.log(wandb_metrics, step=step) + + def close(self) -> None: + if self.wandb.run is not None: + self.wandb.finish() + + +def ensure_pp_loss_visible( + parallel_dims: ParallelDims, job_config: JobConfig, color: Color +) -> None: + """ + Ensures that the loss is visible on the console for pipeline-parallel training. + + For pipeline-parallel training, the loss is only visible on the last pipeline stage. + This function checks if the appropriate rank is included in the LOG_RANK environment + variable and warns if it's not. 
def _get_metrics_rank(
    parallel_dims: ParallelDims,
    job_config: JobConfig,
) -> int:
    """Return the global rank that is responsible for logging metrics.

    Args:
        parallel_dims: Parallel layout; ``pp_enabled``, ``world_size`` and
            ``pp`` are read.
        job_config: Job configuration; only
            ``parallelism.pipeline_parallel_schedule`` is read.

    Returns:
        ``0`` when pipeline parallelism is disabled or the schedule is
        ``"ZBVZeroBubble"``; otherwise ``(world_size // pp) * (pp - 1)``,
        i.e. the first rank of the last pipeline stage.
    """
    if not parallel_dims.pp_enabled:
        return 0

    # V Block Schedules return loss on rank 0
    if job_config.parallelism.pipeline_parallel_schedule == "ZBVZeroBubble":
        return 0

    ranks_per_stage = parallel_dims.world_size // parallel_dims.pp
    return ranks_per_stage * (parallel_dims.pp - 1)


def _build_metric_logger(
    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
) -> BaseLogger:
    """Build the metric logger this rank should use.

    A no-op ``BaseLogger`` is returned when no backend is enabled or when this
    rank is not the one selected by ``_get_metrics_rank`` (unless
    ``save_for_all_ranks`` is set). Otherwise WandB is tried first and
    TensorBoard second.

    Args:
        job_config: Job configuration; ``metrics`` and ``job.dump_folder`` are read.
        parallel_dims: Parallel layout, forwarded to ``_get_metrics_rank``.
        tag: Optional tag forwarded to the backend logger.

    Returns:
        The selected logger instance.
    """
    metrics_config = job_config.metrics

    # Log initial config state
    logger.debug(
        f"Building logger with config: wandb={metrics_config.enable_wandb}, "
        f"tensorboard={metrics_config.enable_tensorboard}"
    )

    # Check if any logging backend is enabled
    has_logging_enabled = (
        metrics_config.enable_tensorboard or metrics_config.enable_wandb
    )

    # Determine if this rank should log
    should_log = has_logging_enabled
    if (not metrics_config.save_for_all_ranks) and should_log:
        metrics_rank = _get_metrics_rank(parallel_dims, job_config)
        should_log = torch.distributed.get_rank() == metrics_rank

    logger.debug(
        f"Logging decision: has_logging_enabled={has_logging_enabled}, should_log={should_log}"
    )

    if not should_log:
        logger.debug("Returning BaseLogger due to should_log=False")
        return BaseLogger()

    # Setup logging directory: <dump_folder>/<save_tb_folder>/<timestamp>[/rank_N]
    dump_dir = job_config.job.dump_folder
    base_log_dir = os.path.join(
        dump_dir, metrics_config.save_tb_folder, datetime.now().strftime("%Y%m%d-%H%M")
    )

    if metrics_config.save_for_all_ranks:
        base_log_dir = os.path.join(
            base_log_dir, f"rank_{torch.distributed.get_rank()}"
        )

    # Create loggers in priority order
    if metrics_config.enable_wandb:
        logger.debug("Attempting to create WandB logger")
        try:
            return WandBLogger(base_log_dir, tag)
        except Exception as e:
            # Best effort: report the failure and fall through to TensorBoard.
            if "No module named 'wandb'" in str(e):
                logger.error(
                    "Failed to create WandB logger: No module named 'wandb'. Please install it using 'pip install wandb'."
                )
            else:
                logger.error(f"Failed to create WandB logger: {e}")

    if metrics_config.enable_tensorboard:
        logger.debug("Creating TensorBoard logger")
        return TensorBoardLogger(base_log_dir, tag)

    logger.debug("No loggers enabled, returning BaseLogger")
    return BaseLogger()


class MetricsProcessor:
    """Metrics processor that computes and logs training metrics.

    The current MetricsProcessor logs some metrics to STDOUT and some metrics to
    TensorBoard or WandB.

    Args:
        job_config (JobConfig): Job configuration.
        parallel_dims (ParallelDims): Parallel dimensions.
        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.
    """

    logger: BaseLogger
    parallel_dims: ParallelDims
    job_config: JobConfig
    device_memory_monitor: DeviceMemoryMonitor
    color: utils.NoColor | utils.Color

    gpu_peak_flops: int
    ntokens_since_last_log: int
    data_loading_times: list[float]
    time_last_log: float

    num_flops_per_token: int
    optimizers: OptimizersContainer | None
    lr_schedulers: LRSchedulersContainer | None

    def __init__(
        self,
        job_config: JobConfig,
        parallel_dims: ParallelDims,
        tag: str | None = None,
    ):
        self.logger = _build_metric_logger(job_config, parallel_dims, tag)
        self.parallel_dims = parallel_dims
        self.job_config = job_config
        self.device_memory_monitor = build_device_memory_monitor()
        # used for colorful printing
        self.color = (
            utils.NoColor()
            if job_config.metrics.disable_color_printing
            else utils.Color()
        )

        self.gpu_peak_flops = utils.get_peak_flops(
            self.device_memory_monitor.device_name
        )
        self.ntokens_since_last_log = 0
        self.data_loading_times = []
        self.time_last_log = time.perf_counter()
        self.device_memory_monitor.reset_peak_stats()

        # These variables have to be set later as they depend on other components or model.
        self.num_flops_per_token = -1
        self.optimizers = None
        self.lr_schedulers = None

    @staticmethod
    def _display_name(metric_key: str) -> str:
        """Return ``metric_key`` without a leading ``"loss_metrics/"`` prefix.

        ``str.lstrip`` must not be used here: it strips a *set of characters*,
        so e.g. ``"loss_metrics/lm_loss"`` would collapse to an empty string.
        """
        return metric_key.removeprefix("loss_metrics/")

    def should_log(self, step: int) -> bool:
        """Return True on step 1 and on every multiple of ``metrics.log_freq``."""
        return step == 1 or step % self.job_config.metrics.log_freq == 0

    def log(
        self,
        step: int,
        global_avg_loss: float,
        global_max_loss: float,
        extra_metrics: dict[str, Any] | None = None,
    ):
        """Compute throughput/time/memory metrics, log them, and reset counters.

        Args:
            step: Current training step.
            global_avg_loss: Average loss to report.
            global_max_loss: Max loss to report.
            extra_metrics: Optional extra metrics merged into the logged dict;
                entries whose key contains ``"loss"`` are also printed.

        Raises:
            AssertionError: If ``num_flops_per_token`` has not been set.
        """
        assert self.num_flops_per_token > 0, "num_flops_per_token must be set"

        time_delta = time.perf_counter() - self.time_last_log

        # tokens per second per device, abbreviated as tps
        tps = self.ntokens_since_last_log / (
            time_delta * self.parallel_dims.non_data_parallel_size
        )
        # model FLOPS utilization
        # For its definition and calculation, please refer to the PaLM paper:
        # https://arxiv.org/abs/2204.02311
        mfu = 100 * self.num_flops_per_token * tps / self.gpu_peak_flops
        tflops = self.num_flops_per_token * tps / 1e12

        time_end_to_end = time_delta / self.job_config.metrics.log_freq
        total_data_loading = sum(self.data_loading_times)
        num_loads = len(self.data_loading_times)
        # Guard against logging before any data-loading time was recorded.
        time_data_loading = total_data_loading / num_loads if num_loads else 0.0
        time_data_loading_pct = 100 * total_data_loading / time_delta

        device_mem_stats = self.device_memory_monitor.get_peak_stats()

        metrics = {
            "loss_metrics/global_avg_loss": global_avg_loss,
            "loss_metrics/global_max_loss": global_max_loss,
            "throughput(tps)": tps,
            "tflops": tflops,
            "mfu(%)": mfu,
            "time_metrics/end_to_end(s)": time_end_to_end,
            "time_metrics/data_loading(s)": time_data_loading,
            "time_metrics/data_loading(%)": time_data_loading_pct,
            "memory/max_active(GiB)": device_mem_stats.max_active_gib,
            "memory/max_active(%)": device_mem_stats.max_active_pct,
            "memory/max_reserved(GiB)": device_mem_stats.max_reserved_gib,
            "memory/max_reserved(%)": device_mem_stats.max_reserved_pct,
            "memory/num_alloc_retries": device_mem_stats.num_alloc_retries,
            "memory/num_ooms": device_mem_stats.num_ooms,
        }

        if extra_metrics:
            metrics.update(extra_metrics)

        self.logger.log(metrics, step)

        color = self.color
        construct_string = (
            f"{color.red}step: {step:2} "
            f"{color.green}loss: {global_avg_loss:7.4f} "
            f"{color.yellow}memory: {device_mem_stats.max_reserved_gib:5.2f}GiB"
            f"({device_mem_stats.max_reserved_pct:.2f}%) "
            f"{color.blue}tps: {round(tps):,} "
            f"{color.cyan}tflops: {tflops:,.2f} "
            f"{color.magenta}mfu: {mfu:.2f}%{color.reset}"
        )

        if extra_metrics:
            for k, v in extra_metrics.items():
                if "loss" in k:
                    construct_string += (
                        f" {color.white}{self._display_name(k)}: {v:7.4f}"
                    )
        logger.info(construct_string)

        # Start a fresh measurement window.
        self.ntokens_since_last_log = 0
        self.data_loading_times.clear()
        self.time_last_log = time.perf_counter()
        self.device_memory_monitor.reset_peak_stats()

    def close(self):
        """Close the underlying metric logger."""
        self.logger.close()


def build_metrics_processor(
    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
) -> MetricsProcessor:
    """Create a metrics processor.

    Args:
        job_config (JobConfig): Job configuration.
        parallel_dims (ParallelDims): Parallel dimensions.
        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.

    Returns:
        MetricsProcessor: A metrics processor.
    """
    return MetricsProcessor(job_config, parallel_dims, tag)
+ +## Inference + +### Prerequisites: + +You will need to download a DeepSeek model's weights if you want to run a +pre-trained checkpoint. We provided a script to download the weights from +HuggingFace Model Hub: +```bash +python download.py [vX] +``` +where `vX` can be v2 or v3, both are supported. You may be required to create a +HuggingFace account and log in first. + +### Running inference: + +The inference script is in `generate.py`. You can run it with the following +command: +```bash +torchrun --standalone --nproc-per-node 4 generate.py +``` +This will run inference on the `DeepSeek-V2-Lite-Chat` model using 4 GPUs by +default. + +Alternatively, you can run inference by using `bash inference.sh`, optionally +followed by your prompt. + +## Training + +The training script is in `train.py`. You can run it by the following command: +```bash +torchrun --standalone --nproc-per-node 8 train.py +``` + +This will run training on the `DeepSeek-V2-Lite-Chat` model using 8 GPUs by +default, with pipeline parallel, expert parallel, and data parallel enabled. diff --git a/torchtitan/experiments/deepseek_v3/checkpoint.py b/torchtitan/experiments/deepseek_v3/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..535ac7fe069a88555841181dddc1e870c2d30934 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/checkpoint.py @@ -0,0 +1,154 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
logger = logging.getLogger(__name__)

_DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json"


def read_weights_from_json(file_path: str) -> Optional[Dict[str, str]]:
    """Read the ``weight_map`` dictionary from a safetensors index JSON file.

    Args:
        file_path: Path of the JSON index file.

    Returns:
        The ``weight_map`` dict, or ``None`` when the file cannot be read or
        parsed, or does not contain a dict under ``"weight_map"``. Failures
        are logged, not raised.
    """
    try:
        with open(file_path, "r") as file:
            data = json.load(file)

        if "weight_map" in data and isinstance(data["weight_map"], dict):
            return data["weight_map"]

        logger.info("No 'weight_map' dictionary found in the JSON file.")
        return None
    except Exception as e:  # deliberate best effort: any failure yields None
        logger.info(f"An error occurred while reading the JSON file: {str(e)}")
        return None


def get_hf_weight_map_and_path(
    model_id: str,
) -> Tuple[Dict[str, str], str]:
    """Get the weight map for a given HF model id and also the cache path for loading the weights.

    Args:
        model_id: Hugging Face model id to look up via ``cached_file``.

    Returns:
        ``(weight_map, weight_path)`` where ``weight_path`` is the directory
        containing the index file.

    Raises:
        Exception: Whatever ``cached_file`` raises when the index file cannot
            be resolved (re-raised after logging).
        ValueError: If the index file holds no usable ``weight_map``.
    """
    try:
        index_file = cached_file(model_id, _DEFAULT_SAFETENSOR_FILE_NAME)
    except Exception:
        # download.py only accepts a raw model id through its "custom" key.
        logger.error(
            f"Model `{model_id}` not found in HF cache. "
            f"You can download the model using `python download.py custom {model_id}`"
        )
        raise

    weight_map = read_weights_from_json(index_file)
    if weight_map is None:
        # Fail here with a clear message instead of an AttributeError downstream.
        raise ValueError(f"No usable 'weight_map' found in index file: {index_file}")

    weight_path = os.path.dirname(index_file)
    logger.info(f"Loading weights from: {weight_path}")
    return weight_map, weight_path


def get_needed_files(
    state_dict: Dict[str, torch.Tensor], weight_map: Dict[str, str]
) -> Set[str]:
    """Return the set of weight files that hold the parameters in ``state_dict``.

    Raises:
        ValueError: If a key ending in ``"weight"`` is absent from
            ``weight_map``. Other absent keys are ignored.
    """
    needed_files = set()
    for param in state_dict.keys():
        file = weight_map.get(param)
        if file:
            needed_files.add(file)
        elif param.endswith("weight"):
            raise ValueError(
                f"Parameter {param} not found in weight map, please check..."
            )
    logger.info(f"Needed files: {needed_files}")
    return needed_files


def load_safetensor_file(
    full_path: str, device: torch.device
) -> Dict[str, torch.Tensor]:
    """Load every tensor stored in the safetensors file at ``full_path`` onto ``device``."""
    tensors = {}
    with safe_open(full_path, framework="pt", device=device) as f:
        for k in f.keys():
            tensors[k] = f.get_tensor(k)
    logger.info(f"Loaded {len(tensors)} tensors from {full_path}")
    return tensors


def load_safetensor_weights(
    model: torch.nn.Module,
    weight_map: Dict[str, str],
    file_location: str,
    device: torch.device,
):
    """
    Load safetensor weights into a `nn.Module`.

    Args:
        model (Module): The PyTorch module to load weights into. It may be a
            model chunk or a full model.
        weight_map (Dict[str, str]): Mapping of model parameters to file names.
        file_location (str): Directory containing the weight files.
        device (torch.device): The device to load tensors onto.

    Raises:
        ValueError: If a checkpoint tensor's shape differs from the model's.
        RuntimeError: If some model state-dict keys were not found in any
            loaded file (including files that failed to load).
    """
    model_state_dict = model.state_dict()
    needed_files = get_needed_files(model_state_dict, weight_map)
    updated_states: Set[str] = set()

    for file in needed_files:
        full_path = os.path.join(file_location, file)
        try:
            checkpoint = load_safetensor_file(full_path, "cpu")
        except FileNotFoundError:
            logger.error(f"File not found: {full_path}")
            # Skip this file; its keys are reported by the missing-keys check
            # below instead of crashing on (or reusing) a stale `checkpoint`.
            continue
        except Exception as e:
            logger.error(f"Error during checkpoint processing of {full_path}: {str(e)}")
            continue

        matched_keys = set(checkpoint.keys()) & set(model_state_dict.keys())
        for key in matched_keys:
            # Check shape
            if model_state_dict[key].shape != checkpoint[key].shape:
                raise ValueError(
                    f"Shape mismatch for {key}: "
                    f"model needs {model_state_dict[key].shape}, but "
                    f"checkpoint has {checkpoint[key].shape}"
                )
            model_state_dict[key] = checkpoint[key].to(device)

        updated_states.update(matched_keys)

    missing_keys = set(model_state_dict.keys()) - updated_states
    if missing_keys:
        raise RuntimeError(
            f"Partially updated state dict. Missing parameters: {missing_keys}"
        )

    model.load_state_dict(model_state_dict, strict=False, assign=True)
    logger.info(f"Successfully loaded {len(updated_states)} weights into model")


def load_weights_from_hf(
    model: torch.nn.Module,
    distribution: str,
    device: torch.device,
):
    """
    Load the weights from Hugging Face format (index file + multiple safetensor
    files), and fill into `model`.

    Args:
        model: Module to fill.
        distribution: HF model id passed to `get_hf_weight_map_and_path`.
        device: Device the loaded tensors are moved to.
    """
    weight_map, weight_path = get_hf_weight_map_and_path(distribution)

    load_safetensor_weights(
        model,
        weight_map,
        weight_path,
        device,
    )
MODELS = {
    "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
    "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
    "v2": "deepseek-ai/DeepSeek-V2",
    "v3": "deepseek-ai/deepseek-v3",
    "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
    "custom": None,  # For custom (any) models
}


def print_usage():
    """Print the command-line help for this script, then exit with status 1."""
    # The "custom" entry is only a placeholder, so it is not listed as a model.
    predefined = [
        f" {key}: {model}" for key, model in MODELS.items() if key != "custom"
    ]
    help_lines = [
        "Usage:",
        " python download.py [model_version]",
        " python download.py custom [custom_model_path]",
        "\nAvailable predefined models:",
        *predefined,
        "\nFor custom models:",
        " custom: Specify your own model path",
        ' Example: python download.py custom "organization/model-name"',
    ]
    for help_line in help_lines:
        print(help_line)
    sys.exit(1)


# Process command line arguments: the first argument must be a key of MODELS.
if len(sys.argv) < 2 or sys.argv[1] not in MODELS:
    print_usage()

if sys.argv[1] != "custom":
    model_id = MODELS[sys.argv[1]]
else:
    # "custom" needs exactly one extra argument: the model path.
    if len(sys.argv) != 3:
        print("Error: Custom model requires a model path")
        print_usage()
    model_id = sys.argv[2]
    print(f"Using custom model: {model_id}")
print(f"Downloading model: {model_id}")

# Loading the model through transformers populates the HF cache.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
)
0000000000000000000000000000000000000000..0669df9528b3db0de3325db36f010312b5b3eac7 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/model.py @@ -0,0 +1,1325 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This code is based on model definition of `deepseek-ai/DeepSeek-V3-Base` on +# Hugging Face Model Hub. Url: +# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/modeling_deepseek.py +# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/resolve/main/configuration_deepseek.py +# +# It has been modified from its original forms to accommodate naming convention +# and usage patterns of the TorchTitan project. + +# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Get model parallel subgroup by name:
# e.g. "pp", "ep", None
def get_group(dim_name: Optional[str] = None) -> dist.ProcessGroup:
    """Return the process group of ``dim_name`` from the current device mesh."""
    glob = torch.distributed.device_mesh._mesh_resources.get_current_mesh()
    return glob.get_group(dim_name)


class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned scale."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        # Normalize in float32, then cast back to the input dtype.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class RotaryEmbedding(nn.Module):
    """Rotary position embedding with cached cos/sin tables.

    Args:
        dim: Rotary dimension; the tables have ``dim`` columns.
        max_position_embeddings: Number of positions cached at construction.
        base: Base of the inverse-frequency geometric progression.
        device: Device on which ``inv_freq`` is created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: Tensor of shape [bs, num_attention_heads, seq_len, head_size];
                only its dtype, device and (when ``seq_len`` is None) its
                second-to-last dimension are used.
            seq_len: Number of positions; defaults to ``x.shape[-2]``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The default used to crash on `None > int`; infer it from x instead.
            seq_len = x.shape[-2]
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__, which builds the cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        # Align devices the same way the base class does.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__, which builds the cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Rescale the base once the sequence exceeds the configured maximum.
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        # Align devices the same way the base class does.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Return the (fractional) dimension index for ``num_rotations`` rotations."""
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return ``(low, high)`` integer dimension bounds clamped to ``[0, dim - 1]``."""
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    """Return ``1.0`` for ``scale <= 1``, else ``0.1 * mscale * ln(scale) + 1.0``."""
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    """Return a length-``dim`` float32 ramp rising linearly from 0 at ``min`` to 1 at ``max``."""
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


class YarnRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding variant that blends scaled and unscaled frequencies (YaRN)."""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        # All of these are read by _set_cos_sin_cache, which super().__init__ calls.
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        dim = self.dim

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        # Per-dimension blend between scaled (inter) and unscaled (extra) frequencies.
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    # De-interleave the last dim: [x0, x1, x2, x3, ...] -> [x0, x2, ..., x1, x3, ...]
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class MLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    Args:
        config: Object providing ``hidden_size`` and ``intermediate_size``.
        hidden_size: Overrides ``config.hidden_size`` when given.
        intermediate_size: Overrides ``config.intermediate_size`` when given.
    """

    act_fn = nn.SiLU()

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class MoEGate(nn.Module):
    """Router that picks the top-k experts and their weights for every token.

    ``forward`` takes ``hidden_states`` of shape [bsz, seq_len, hidden] and
    returns ``(topk_idx, topk_weight)``, each of shape [bsz * seq_len, top_k].
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                # Changed from torch.empty to torch.rand to avoid non-even
                # distribution for runs without actual weights
                torch.rand((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        # compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        elif self.scoring_func == "softmax":
            scores = logits.softmax(dim=-1, dtype=torch.float32)
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        # select top-k experts
        if self.topk_method == "noaux_tc":
            scores_for_choice = scores.view(
                bsz * seq_len, -1
            ) + self.e_score_correction_bias.unsqueeze(0)
            # Score each expert group by the sum of its two best experts.
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1)
                .topk(2, dim=-1)[0]
                .sum(dim=-1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores_for_choice.masked_fill(
                ~score_mask.bool(), 0.0
            )  # [n, e]
            _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
            # Weights come from the uncorrected scores; the bias only affects selection.
            topk_weight = scores.gather(1, topk_idx)
        elif self.topk_method == "greedy":
            topk_weight, topk_idx = torch.topk(
                scores, k=self.top_k, dim=-1, sorted=False
            )
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        # norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = (
            topk_weight * self.routed_scaling_factor
        )  # must multiply the scaling factor

        return topk_idx, topk_weight
+ self.experts = nn.ModuleDict() + for i in range(self.experts_per_rank): + abs_expert_id = self.ep_rank * self.experts_per_rank + i + self.experts[str(abs_expert_id)] = MLP( + config, intermediate_size=config.moe_intermediate_size + ) + self.gate = MoEGate(config) + if config.n_shared_experts is not None: + intermediate_size = config.moe_intermediate_size * config.n_shared_experts + self.shared_experts = MLP( + config=config, intermediate_size=intermediate_size + ) + + def combine_experts(self, submod_name): + all_weights = [] + for expert in self.experts.values(): + lin = expert.get_submodule(submod_name) + all_weights.append(lin.weight) + lin.weight = None + + concat_weight = torch.cat(all_weights) + self.register_parameter(f"{submod_name}_weight", nn.Parameter(concat_weight)) + + # This function is used to create a symm mem buffer for MoE's. It is for + # shuffling tokens fully "on-device", as compared to traditional torch + # all_to_all APIs which requrie a GPU-to-CPU sync of the splits. If a user + # calls this function, the `shuffle_method` would switch from + # `torch_all_to_all` to `symm_mem`. 
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device): + # Switch shuffle method + self.shuffle_method = "symm_mem" + + # Combine expert weights + print("Combining expert weights for Group GEMM") + self.combine_experts("gate_proj") + self.combine_experts("up_proj") + self.combine_experts("down_proj") + + # Assuming worst case, 2x tokens are routed to one EP rank + overflow = 2 + OnDeviceAllToAllV.max_output_len = ( + self.config.max_seq_len * self.num_experts_per_tok * overflow + ) + + # Symmetric memory buffers are shared by all MoE instances across + # layers, we only need to initialize them once + if MoE.token_send_buf is not None: + return + + # Input buffer for DP-to-EP shuffle + MoE.token_send_buf = symm_mem.empty( + self.config.max_seq_len + * self.num_experts_per_tok, # seq len * top k (flattened) + self.config.hidden_size, # hidden dim + dtype=dtype, + device=device, + ) + # Input buffer for EP-to-DP shuffle + MoE.token_gather_buf = symm_mem.empty( + self.config.max_seq_len + * self.num_experts_per_tok # seq len * top k (flattened) + * overflow, + self.config.hidden_size, # hidden dim + dtype=dtype, + device=device, + ) + print(f"EP rank [{self.ep_rank}]: Created Symmetric Memory for MoE") + + def get_send_buf(self): + # [Why detach?] During a first forward-backward step, the buffer would + # be included in a computational graph. In a second step, autograd will + # return an error saying "Trying to backward through the graph a second + # time (or directly access saved tensors more than once)". This is + # because the buffer is still in the graph, and autograd is trying to + # backward through the graph a second time. To avoid this, we detach the + # buffer from the graph. `detach()` returns a new tensor, which shares + # the same storage with the original one. + self.token_send_buf.grad = None + return self.token_send_buf.detach() + + def get_gather_buf(self): + # See [Why detach?] 
in `get_send_buf` + self.token_gather_buf.grad = None + return self.token_gather_buf.detach() + + def forward(self, hidden_states): + identity = hidden_states + orig_shape = hidden_states.shape + # for each token, select top-k experts, and compute the weight for each expert + topk_idx, topk_weight = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + if self.shuffle_method == "symm_mem": + y = self.moe_on_device(hidden_states, topk_idx, topk_weight) + else: # "torch_all_to_all" + y = self.moe_forward(hidden_states, topk_idx, topk_weight) + + y = y.view(*orig_shape) + if self.config.n_shared_experts is not None: + y = y + self.shared_experts(identity) + return y + + def moe_forward(self, x, topk_ids, topk_weight): + # This part sorts the token indices so that tokens routed to the same expert reside consecutively. + # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive. + # Since this is an "aritificial" index creation (final outcome being + # `idxs`), we don't need gradients here. + with torch.no_grad(): + # [seq_len, n_routed_experts] + cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts)) + # Fill 1 to the selected experts + cnts.scatter_(1, topk_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + # Token indices for each expert + idxs = topk_ids.view(-1).argsort() + sorted_tokens_shape = idxs.shape + x.shape[1:] + + sorted_tokens = x[idxs // topk_ids.shape[1]] + assert sorted_tokens.shape == sorted_tokens_shape + + # This part exchange the information about the number of tokens send and + # received by each expert. We can understand this information as "side + # band", which is not part of the actual data. Thus no gradient is + # needed. 
+ with torch.no_grad(): + # Sum the tokens over local experts, then we get tokens per EP rank, + # which is the input splits + tokens_per_expert_group = tokens_per_expert.new_empty( + tokens_per_expert.shape[0] + ) + dist.all_to_all_single( + tokens_per_expert_group, tokens_per_expert, group=self.ep_group + ) + input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1) + + # DP to EP token shuffle. This part needs gradient. + if self.shuffle_method == "symm_mem": + # Move input to the `token_send_buf` symm mem + token_send_buf = self.get_send_buf() + token_send_buf[: idxs.shape[0]].copy_(sorted_tokens) + # Note: `out=` avoids copy, but it is not differentiable + # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]]) + token_gather_buf, output_splits = OnDeviceAllToAllV.apply( + token_send_buf, + input_splits, + self.ep_group, + ) + with torch.no_grad(): + # Received tokens from all other ranks. TODO: use mask instead + received = output_splits.sum() + # TODO: don't use `received` + gathered_tokens = token_gather_buf[:received] + else: # "torch_all_to_all" + # Prepare input ans output splits + with torch.no_grad(): + output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum( + dim=1 + ) + gathered_tokens = all_to_all_single_autograd( + sorted_tokens, + output_splits.tolist(), + input_splits.tolist(), + self.ep_group, + ) + + # This part prepares a 1D tensor with the same length as + # `gathered_tokens`. The 1D tensor is filled with local expert IDs which + # the tokens in `gathered_tokens` are headed for. This part doesn't need + # gradient. 
+ with torch.no_grad(): + gatherd_idxs = ( + torch.arange( + tokens_per_expert_group.numel(), + device=tokens_per_expert_group.device, + ) + % self.experts_per_rank + ) + gatherd_idxs = gatherd_idxs.repeat_interleave(tokens_per_expert_group) + + # Prepare buffer for tokens processed by experts + if self.shuffle_method == "symm_mem": + # Take necessary space from `token_gather_buf` symm mem because we are + # going to send them out after expert processing + processed_tokens = self.get_gather_buf()[: gathered_tokens.shape[0]] + else: # "torch_all_to_all" + processed_tokens = torch.empty_like(gathered_tokens) + + # This part processes the tokens routed to the local experts. + # TODO: can we use group GEMM here? + for i, expert in enumerate(self.experts.values()): + processed_tokens[gatherd_idxs == i] = expert( + gathered_tokens[gatherd_idxs == i] + ) + + # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle. + # The input/output splits are just a reverse of the previous shuffle. + if self.shuffle_method == "symm_mem": + token_return_buf, _ = OnDeviceAllToAllV.apply( + processed_tokens, + output_splits, + self.ep_group, + ) + returned_tokens = token_return_buf[: sorted_tokens_shape[0]] + else: # "torch_all_to_all" + returned_tokens = all_to_all_single_autograd( + processed_tokens, + input_splits.tolist(), + output_splits.tolist(), + self.ep_group, + ) + + output_tokens = torch.empty_like(returned_tokens) + output_tokens[idxs] = returned_tokens + final_out = ( + output_tokens.view(*topk_ids.shape, -1) + .type(topk_weight.dtype) + .mul_(topk_weight.unsqueeze(dim=-1)) + .sum(dim=1) + .type(returned_tokens.dtype) + ) + return final_out + + def moe_on_device(self, x, topk_ids, topk_weight): + # This part sorts the token indices so that tokens routed to the same expert reside consecutively. + # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive. 
+ # Since this is an "aritificial" index creation (final outcome being + # `idxs`), we don't need gradients here. + with torch.no_grad(): + # [seq_len, n_routed_experts] + cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts)) + # Fill 1 to the selected experts + cnts.scatter_(1, topk_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + # Token indices for each expert + idxs = topk_ids.view(-1).argsort() + sorted_tokens_shape = idxs.shape + x.shape[1:] + + sorted_tokens = x[idxs // topk_ids.shape[1]] + assert sorted_tokens.shape == sorted_tokens_shape + + # This part exchange the information about the number of tokens send and + # received by each expert. We can understand this information as "side + # band", which is not part of the actual data. Thus no gradient is + # needed. + with torch.no_grad(): + # Sum the tokens over local experts, then we get tokens per EP rank, + # which is the input splits + tokens_per_expert_group = tokens_per_expert.new_empty( + tokens_per_expert.shape[0] + ) + dist.all_to_all_single( + tokens_per_expert_group, tokens_per_expert, group=self.ep_group + ) + input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1) + + # Move input to the `token_send_buf` symm mem + token_send_buf = self.get_send_buf() + token_send_buf[: idxs.shape[0]].copy_(sorted_tokens) + # Note: `out=` avoids copy, but it is not differentiable + # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]]) + token_gather_buf, output_splits = OnDeviceAllToAllV.apply( + token_send_buf, + input_splits, + self.ep_group, + ) + + # We need to permute the received tokens so that tokens for the same expert are contiguous. + # This part prepares a 1D tensor `permuted_indices` for such permutation. + # This part doesn't need gradient. 
+ with torch.no_grad(): + permuted_indices, m_sizes = generate_permute_indices( + tokens_per_expert_group, + self.experts_per_rank, + self.ep_size, + token_gather_buf.shape[0], + ALIGN_SIZE_M, + ) + + # Permute the received tokens so that tokens for the same expert are contiguous. + contig_tokens = token_gather_buf[permuted_indices] + + # Run the first grouped GEMM + w1 = self.get_parameter("gate_proj_weight") + gate_proj = grouped_gemm_forward(contig_tokens, w1, m_sizes) + + # Run the second grouped GEMM + w3 = self.get_parameter("up_proj_weight") + up_proj = grouped_gemm_forward(contig_tokens, w3, m_sizes) + + # Apply activation + hidden_outputs = MLP.act_fn(gate_proj) * up_proj + + # Run the third grouped GEMM + w2 = self.get_parameter("down_proj_weight") + hidden_outputs = grouped_gemm_forward(hidden_outputs, w2, m_sizes) + + # Prepare buffer for tokens processed by experts + # Take necessary space from `token_gather_buf` symm mem because we are + # going to send them out after expert processing + processed_tokens = self.get_gather_buf() + + # Move into Symmetric Memory for the return shuffle + processed_tokens[permuted_indices] = hidden_outputs + + # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle. + # The input/output splits are just a reverse of the previous shuffle. 
+ token_return_buf, _ = OnDeviceAllToAllV.apply( + processed_tokens, + output_splits, + self.ep_group, + ) + returned_tokens = token_return_buf[: sorted_tokens_shape[0]] + + output_tokens = torch.empty_like(returned_tokens) + output_tokens[idxs] = returned_tokens + final_out = ( + output_tokens.view(*topk_ids.shape, -1) + .type(topk_weight.dtype) + .mul_(topk_weight.unsqueeze(dim=-1)) + .sum(dim=1) + .type(returned_tokens.dtype) + ) + return final_out + + +class Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: ModelArgs, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim + + self.is_causal = True + + if self.q_lora_rank is None: + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.q_head_dim, bias=False + ) + else: + self.q_a_proj = nn.Linear( + self.hidden_size, config.q_lora_rank, bias=config.attention_bias + ) + self.q_a_layernorm = RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear( + config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False + ) + + self.kv_a_proj_with_mqa = nn.Linear( + self.hidden_size, + config.kv_lora_rank + config.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = RMSNorm(config.kv_lora_rank) + self.kv_b_proj = nn.Linear( + config.kv_lora_rank, + self.num_heads + * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim), + bias=False, 
+ ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=config.attention_bias, + ) + self._init_rope() + + self.softmax_scale = self.q_head_dim ** (-0.5) + if self.config.rope_scaling is not None: + mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_scaling["factor"] + if mscale_all_dim: + mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) + self.softmax_scale = self.softmax_scale * mscale * mscale + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = RotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LinearScalingRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = DynamicNTKScalingRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "yarn": + kwargs = { + key: self.config.rope_scaling[key] + for key in [ + "original_max_position_embeddings", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", + ] + if key in self.config.rope_scaling + } + self.rotary_emb = YarnRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + **kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], 
Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split( + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + ) + kv_seq_len = value_states.shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + + if attention_mask is not None: + # Attention mask was made 4D because the `attn_weights` above is 4D. + # We probably can make this mask smarter if we want to pack sequences + # together, instead of using padding. This optimization can be used in + # inference. For training, if we want to pack sequences, data loader + # will pass in a mask containing such info. 
+ attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, # None, or user provided mask in 2D + (bsz, q_len), + hidden_states, + 0, # past_key_values_length, 0 when training + ) + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query=query_states, + key=key_states, + value=value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout, + is_causal=attention_mask is None, + scale=self.softmax_scale, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + attn_output = self.o_proj(attn_output) + + return attn_output + + +class DecoderLayer(nn.Module): + def __init__(self, config: ModelArgs, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = Attention(config=config, layer_idx=layer_idx) + + self.mlp = ( + MoE(config) + if ( + config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0 + ) + else MLP(config) + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. 
+ """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +Deepseek_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class DeepseekModel(torch.nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`] + + Args: + config: ModelArgs + """ + + def __init__(self, config: ModelArgs): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # Creating model parts related to my stage + assert ( + config.stage_idx < config.num_stages + ), f"Stage {config.stage_idx} is not in the model" + print(f"Creating model stage {config.stage_idx} of {config.num_stages}") + + self.embed_tokens = ( + nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + if config.stage_idx == 0 + else None + ) + + self.layers = torch.nn.ModuleDict() + division = config.num_hidden_layers // config.num_stages + residual = config.num_hidden_layers % config.num_stages + # Some earlier stages may have 1 more layer than latter stages because + # the division may have residual; this is more even than giving the + # entire residual to the last stage. 
+ layers_per_stage = [ + division + 1 if stage < residual else division + for stage in range(config.num_stages) + ] + assert sum(layers_per_stage) == config.num_hidden_layers + layer_id_start = sum(layers_per_stage[: config.stage_idx]) + layer_id_end = layer_id_start + layers_per_stage[config.stage_idx] + for layer_id in range(layer_id_start, layer_id_end): + self.layers[str(layer_id)] = DecoderLayer(config, layer_id) + + self.norm = ( + RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.stage_idx == config.num_stages - 1 + else None + ) + + # Initialize weights and apply final processing + self.apply(self._init_weights) + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def forward( + self, + tokens: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + # Embedding + hidden_states = ( + self.embed_tokens(tokens) if self.embed_tokens is not None else tokens + ) + + # decoder layers + for decoder_layer in self.layers.values(): + hidden_states = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + hidden_states = ( + self.norm(hidden_states) if self.norm is not None else hidden_states + ) + return hidden_states + + +class DeepseekForCausalLM(torch.nn.Module): + def __init__(self, config): + super().__init__() + self.model = DeepseekModel(config) + self.lm_head = ( + nn.Linear(config.hidden_size, config.vocab_size, bias=False) + if config.stage_idx == config.num_stages - 1 + else None + ) + + # Initialize weights and apply final processing + # self.post_init() + + def forward( + self, + 
tokens: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, DeepseekForCausalLM + + >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + hidden_states = self.model( + tokens, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + logits = ( + self.lm_head(hidden_states) if self.lm_head is not None else hidden_states + ) + return logits + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + **kwargs, + ): + if past_key_values is not None: + # Assuming isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. 
+ elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + # Setup Symmetric Memory for MoE token shuffle. + # Supports inference currently. 
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device): + for layer in self.model.layers.values(): + if not isinstance(layer.mlp, MoE): + continue + layer.mlp.setup_symm_mem(dtype, device) diff --git a/torchtitan/experiments/deepseek_v3/requirements.txt b/torchtitan/experiments/deepseek_v3/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b66a52d87be39b1c4fb36e822c24958d40dfa81 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/requirements.txt @@ -0,0 +1,5 @@ +transformers +accelerate +torchdata >= 0.8.0 +datasets >= 2.21.0 +tomli >= 1.1.0 ; python_version < "3.11" diff --git a/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py b/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed00317084d85abd10e13cc4f18437d6e9337a75 --- /dev/null +++ b/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import triton +import triton.language as tl + + +@triton.jit +def get_tid(): + return tl.inline_asm_elementwise( + """ + mov.u32 $0, %tid.x; + mov.u32 $1, %tid.y; + mov.u32 $2, %tid.z; + """, + "=r,=r,=r", + [], + dtype=(tl.uint32, tl.uint32, tl.uint32), + is_pure=True, + pack=1, + ) + + +@triton.jit +def get_ntid(): + return tl.inline_asm_elementwise( + """ + mov.u32 $0, %ntid.x; + mov.u32 $1, %ntid.y; + mov.u32 $2, %ntid.z; + """, + "=r,=r,=r", + [], + dtype=(tl.uint32, tl.uint32, tl.uint32), + is_pure=True, + pack=1, + ) + + +@triton.jit +def get_flat_tid(): + tid_x, tid_y, tid_z = get_tid() + ntid_x, ntid_y, _ = get_ntid() + return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x + + +@triton.jit +def get_flat_bid(): + return ( + tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0) + + tl.program_id(1) * tl.num_programs(0) + + tl.program_id(0) + ) + + +@triton.jit +def sync_threads(): + tl.inline_asm_elementwise( + "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1 + ) diff --git a/torchtitan/experiments/flux/README.md b/torchtitan/experiments/flux/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2e56939b6eea7769d5130703cd3acb58f7eb5f5a --- /dev/null +++ b/torchtitan/experiments/flux/README.md @@ -0,0 +1,23 @@ +# FLUX model in torchtitan + +## Overview + +## Usage +First, download the autoencoder model from HuggingFace with your own access token: +```bash +python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token +``` +This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file. 
+ +Run the following command to train the model on a single GPU: +```bash +PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml +``` + +## TODO +- [ ] Support for multiple GPUs is coming soon (FSDP, etc.) +- [ ] Implement test cases in CI for the FLUX model, and add more unit tests (e.g., unit tests for the preprocessor) +- [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.) +- [ ] Support for distributed checkpointing and loading +- [ ] Implement init_weights() function to initialize the model weights +- [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function diff --git a/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc b/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f71ead984739bd6ad7c0808e5bb122786a517b4 Binary files /dev/null and b/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc differ diff --git a/torchtitan/experiments/flux/dataset/flux_dataset.py b/torchtitan/experiments/flux/dataset/flux_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..995f0af3b4152052bcfb21b4331e8dcff8ddd7da --- /dev/null +++ b/torchtitan/experiments/flux/dataset/flux_dataset.py @@ -0,0 +1,267 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
def _process_cc12m_image(
    img: Image.Image,
    output_size: int = 256,
) -> Optional[torch.Tensor]:
    """Resize and randomly crop a CC12M image to a square float tensor.

    The shorter side is scaled to ``output_size`` and a random
    ``output_size`` x ``output_size`` window is cropped along the longer side.

    Args:
        img: Source image; ``size``, ``mode``, ``resize`` and ``crop`` are used.
        output_size: Edge length, in pixels, of the square output.

    Returns:
        A channels-first float tensor scaled to ``[0, 1]``, or ``None`` when
        the image is smaller than ``output_size`` on either side or is
        single-channel grayscale (mode ``"L"``).
    """
    width, height = img.size
    # Skip low resolution images
    if width < output_size or height < output_size:
        return None

    # Skip grayscale images. Resizing and cropping do not change the mode, so
    # checking up front avoids doing that work on an image that is discarded.
    if img.mode == "L":
        return None

    if width >= height:
        # Landscape/square: scale height to output_size, crop horizontally.
        new_width, new_height = math.ceil(output_size / height * width), output_size
        img = img.resize((new_width, new_height))
        left = random.randint(0, new_width - output_size)
        resized_img = img.crop((left, 0, left + output_size, output_size))
    else:
        # Portrait: scale width to output_size, crop vertically.
        new_width, new_height = (
            output_size,
            math.ceil(output_size / width * height),
        )
        img = img.resize((new_width, new_height))
        # The slack is along the height here. Using new_width (== output_size)
        # would always give offset 0 and silently disable the random crop.
        top = random.randint(0, new_height - output_size)
        resized_img = img.crop((0, top, output_size, top + output_size))

    assert resized_img.size[0] == resized_img.size[1] == output_size

    # (H, W, C) uint8 -> (C, H, W) float in [0, 1]
    np_img = np.array(resized_img).transpose((2, 0, 1))
    tensor_img = torch.tensor(np_img).float() / 255.0

    return tensor_img
def __iter__(self):
    """Yield ``(inputs, image)`` pairs from the underlying dataset.

    ``inputs`` is the processed sample dict with the ``"image"`` entry
    removed; ``image`` is that removed entry. Samples for which the data
    processor returns ``image=None`` are skipped with a warning. When
    ``self.infinite`` is true the dataset is restarted from the beginning
    after it is exhausted; otherwise iteration stops.
    """
    while True:
        for sample in self._get_data_iter():
            # Count every raw sample consumed, including the ones skipped
            # below: _get_data_iter() fast-forwards by _sample_idx *raw*
            # samples on resume, so counting only yielded samples would
            # replay data after a checkpoint restore.
            self._sample_idx += 1

            # Use the dataset-specific preprocessor
            sample_dict = self._data_processor(
                sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
            )

            # skip low quality image or image with color channel = 1
            if sample_dict["image"] is None:
                sample_key = sample.get("__key__", "unknown")
                logger.warning(
                    f"Low quality image {sample_key} is skipped in Flux Dataloader"
                )
                continue

            # Nothing is appended to self._all_samples: extending a list
            # with a dict only added its key strings, which bloated the
            # checkpointed state on every sample without being read back.

            labels = sample_dict.pop("image")
            yield sample_dict, labels

        if not self.infinite:
            logger.warning(f"Dataset {self.dataset_name} has run out of data")
            break
        else:
            # Reset offset for the next iteration
            self._sample_idx = 0
            logger.warning(f"Dataset {self.dataset_name} is being re-looped")
def build_flux_dataloader(
    dp_world_size: int,
    dp_rank: int,
    job_config: JobConfig,
    # This parameter is not used, keep it for compatibility
    tokenizer: FluxTokenizer | None,
    infinite: bool = True,
) -> ParallelAwareDataloader:
    """Create the FLUX text-to-image dataloader described by ``job_config``.

    Dataset name, path and batch size come from ``job_config.training``; the
    T5/CLIP tokenizer names and the T5 maximum length come from
    ``job_config.encoder``. The CLIP tokenizer length is fixed at 77.

    Args:
        dp_world_size: Number of data-parallel ranks.
        dp_rank: This process's data-parallel rank.
        job_config: Job configuration providing the settings above.
        tokenizer: Ignored; accepted only for signature compatibility.
        infinite: Whether the dataset restarts after it is exhausted.

    Returns:
        A ``ParallelAwareDataloader`` wrapping a ``FluxDataset``.
    """
    training_cfg = job_config.training
    encoder_cfg = job_config.encoder

    t5_tokenizer = FluxTokenizer(
        encoder_cfg.t5_encoder, max_length=encoder_cfg.max_t5_encoding_len
    )
    # fix max_length for CLIP
    clip_tokenizer = FluxTokenizer(encoder_cfg.clip_encoder, max_length=77)

    flux_dataset = FluxDataset(
        dataset_name=training_cfg.dataset,
        dataset_path=training_cfg.dataset_path,
        t5_tokenizer=t5_tokenizer,
        clip_tokenizer=clip_tokenizer,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        infinite=infinite,
    )

    return ParallelAwareDataloader(
        dataset=flux_dataset,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        batch_size=training_cfg.batch_size,
    )
def encode(
    self,
    s: str,
) -> List[int]:
    """Tokenize ``s`` and return the tokenizer's ``"input_ids"`` entry.

    The text is truncated and padded to ``self._max_length``.

    NOTE(review): the tokenizer is called with ``return_tensors="pt"``, so
    the value returned is presumably a PyTorch tensor rather than the
    ``List[int]`` the annotation advertises; confirm against callers.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=self._max_length,
        return_length=False,
        return_overflowing_tokens=False,
        padding="max_length",
        return_tensors="pt",  # return pytorch tensors, default return List[int]
    )
    encoded = self._tokenizer(s, **tokenizer_options)
    return encoded["input_ids"]
class FluxEmbedder(nn.Module):
    """Frozen HuggingFace text encoder used to embed token ids.

    A ``version`` starting with ``"openai"`` loads a ``CLIPTextModel`` and
    exposes its ``pooler_output``; any other version loads a
    ``T5EncoderModel`` and exposes its ``last_hidden_state``. The loaded
    module is put in eval mode with gradients disabled.
    """

    def __init__(self, version: str, **hf_kwargs):
        super().__init__()
        self.is_clip = version.startswith("openai")

        if self.is_clip:
            self.output_key = "pooler_output"
            encoder_cls = CLIPTextModel
        else:
            self.output_key = "last_hidden_state"
            encoder_cls = T5EncoderModel

        # Freeze the encoder: inference mode, no gradients.
        pretrained = encoder_cls.from_pretrained(version, **hf_kwargs)
        self.hf_module = pretrained.eval().requires_grad_(False)

    def forward(self, batch_tokens: Tensor) -> Tensor:
        """Encode a batch of token ids.

        Args:
            batch_tokens: Token ids; moved to the encoder's device before the
                call. Presumably shaped ``[bsz, seq_len]`` -- TODO confirm.

        Returns:
            The encoder output selected by ``self.output_key``.
        """
        token_ids = batch_tokens.to(self.hf_module.device)
        encoder_out = self.hf_module(
            input_ids=token_ids,
            attention_mask=None,
            output_hidden_states=False,
        )
        return encoder_out[self.output_key]
@dataclass
class FluxModelArgs(BaseModelArgs):
    """Hyper-parameters consumed by ``FluxModel``."""

    in_channels: int = 64
    out_channels: int = 64
    vec_in_dim: int = 768
    context_in_dim: int = 512
    hidden_size: int = 3072
    mlp_ratio: float = 4.0
    num_heads: int = 24
    depth: int = 19
    depth_single_blocks: int = 38
    axes_dim: tuple = (16, 56, 56)
    theta: int = 10_000
    qkv_bias: bool = True
    guidance_embed: bool = True
    autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)

    def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
        """Override ``context_in_dim`` from the job's encoder settings.

        NOTE(review): the value copied is ``max_t5_encoding_len`` (a maximum
        token count), while ``context_in_dim`` is described as the T5
        embedding dimension -- confirm the two are meant to coincide.
        """
        encoder_cfg = job_config.encoder
        self.context_in_dim = encoder_cfg.max_t5_encoding_len

    def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
        """Return ``(parameter_count, 1)``; the FLOP figure is a placeholder."""
        # TODO(jianiw): Add the number of flops for the autoencoder
        param_count = sum(map(torch.Tensor.numel, model.parameters()))
        logger.warning("FLUX model haven't implement get_nparams_and_flops() function")
        return param_count, 1
def __init__(self, model_args: FluxModelArgs):
    """Build the FLUX transformer from ``model_args``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``, or
            if ``sum(axes_dim)`` differs from the per-head dimension.
    """
    super().__init__()

    self.model_args = model_args
    self.in_channels = model_args.in_channels
    self.out_channels = model_args.out_channels

    hidden, heads = model_args.hidden_size, model_args.num_heads
    if hidden % heads != 0:
        raise ValueError(
            f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
        )
    # The positional embedding spans one attention head.
    pe_dim = hidden // heads
    if sum(model_args.axes_dim) != pe_dim:
        raise ValueError(
            f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
        )
    self.hidden_size = hidden
    self.num_heads = heads

    # Sub-modules are created in the same order as before so parameter
    # registration (and therefore state_dict ordering) is unchanged.
    self.pe_embedder = EmbedND(
        dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
    )
    self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
    self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
    self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
    if model_args.guidance_embed:
        self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
    else:
        self.guidance_in = nn.Identity()
    self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)

    double_stream = [
        DoubleStreamBlock(
            self.hidden_size,
            self.num_heads,
            mlp_ratio=model_args.mlp_ratio,
            qkv_bias=model_args.qkv_bias,
        )
        for _ in range(model_args.depth)
    ]
    self.double_blocks = nn.ModuleList(double_stream)

    single_stream = [
        SingleStreamBlock(
            self.hidden_size, self.num_heads, mlp_ratio=model_args.mlp_ratio
        )
        for _ in range(model_args.depth_single_blocks)
    ]
    self.single_blocks = nn.ModuleList(single_stream)

    self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
def forward(
    self,
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    timesteps: Tensor,
    y: Tensor,
    guidance: Tensor | None = None,
) -> Tensor:
    """Run the transformer on image and text token sequences.

    Args:
        img: Image tokens; must be 3-dimensional.
        img_ids: Position ids for the image tokens.
        txt: Text tokens; must be 3-dimensional.
        txt_ids: Position ids for the text tokens.
        timesteps: Timesteps, embedded into the conditioning vector.
        y: Vector conditioning, projected by ``vector_in``.
        guidance: Guidance strength; required when
            ``model_args.guidance_embed`` is true.

    Returns:
        The final-layer output for the image tokens only.

    Raises:
        ValueError: If ``img`` or ``txt`` is not 3-dimensional, or if
            guidance embedding is enabled but ``guidance`` is ``None``.
    """
    if not (img.ndim == 3 and txt.ndim == 3):
        raise ValueError("Input img and txt tensors must have 3 dimensions.")

    # running on sequences img
    img_tokens = self.img_in(img)
    cond = self.time_in(timestep_embedding(timesteps, 256))
    if self.model_args.guidance_embed:
        if guidance is None:
            raise ValueError(
                "Didn't get guidance strength for guidance distilled model."
            )
        cond = cond + self.guidance_in(timestep_embedding(guidance, 256))
    cond = cond + self.vector_in(y)
    txt_tokens = self.txt_in(txt)

    # Positional embedding over the concatenated (text, image) ids.
    pe = self.pe_embedder(torch.cat((txt_ids, img_ids), dim=1))

    for block in self.double_blocks:
        img_tokens, txt_tokens = block(
            img=img_tokens, txt=txt_tokens, vec=cond, pe=pe
        )

    # Single-stream blocks see text and image as one sequence; the text
    # prefix is dropped again afterwards.
    num_txt_tokens = txt_tokens.shape[1]
    joint = torch.cat((txt_tokens, img_tokens), 1)
    for block in self.single_blocks:
        joint = block(joint, vec=cond, pe=pe)
    img_tokens = joint[:, num_txt_tokens:, ...]

    # (N, T, patch_size ** 2 * out_channels)
    return self.final_layer(img_tokens, cond)
+ +import sys + +from torchtitan.config_manager import JobConfig +from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader +from torchtitan.tools.profiling import ( + maybe_enable_memory_snapshot, + maybe_enable_profiling, +) + + +class TestFluxDataLoader: + def test_flux_dataloader(self): + dataset_name = "cc12m" + batch_size = 32 + world_size = 4 + rank = 0 + + num_steps = 10 + + path = "torchtitan.experiments.flux.flux_argparser" + sys.argv.append(f"--experimental.custom_args_module={path}") + config = JobConfig() + config.maybe_add_custom_args() + config.parse_args( + [ + # Profiling options + # "--profiling.enable_profiling", + # "--profiling.profile_freq", + # "5", + # "--profiling.enable_memory_snapshot", + # "--profiling.save_memory_snapshot_folder", + # "memory_snapshot_flux", + "--training.dataset", + dataset_name, + "--training.batch_size", + str(batch_size), + "--encoder.t5_encoder", + "google/t5-v1_1-small", + "--encoder.clip_encoder", + "openai/clip-vit-large-patch14", + "--encoder.max_t5_encoding_len", + "512", + ] + ) + + with maybe_enable_profiling( + config, global_step=0 + ) as torch_profiler, maybe_enable_memory_snapshot( + config, global_step=0 + ) as memory_profiler: + dl = self._build_dataloader( + config, + world_size, + rank, + ) + dl = iter(dl) + + for i in range(0, num_steps): + input_data, labels = next(dl) + print(f"Step {i} image size: {labels.shape}") + if torch_profiler: + torch_profiler.step() + if memory_profiler: + memory_profiler.step() + + print(len(input_data["clip_tokens"])) + for k, v in input_data.items(): + print(f"Step {i} {k} value: {type(v), v.shape}") + + assert len(input_data) == 2 # (clip_encodings, t5_encodings) + assert labels.shape == (batch_size, 3, 256, 256) + # assert input_data["clip_tokens"].shape[0] == batch_size + # assert input_data["t5_tokens"].shape == (batch_size, 512, 512) + + if torch_profiler: + torch_profiler.step() + if memory_profiler: + memory_profiler.step(exit_ctx=True) 
def time_shift(mu: float, sigma: float, t: Tensor):
    """Return ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)`` element-wise."""
    exp_mu = math.exp(mu)
    return exp_mu / (exp_mu + (1 / t - 1) ** sigma)


def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
) -> Callable[[float], float]:
    """Return the straight line through ``(x1, y1)`` and ``(x2, y2)``."""
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1

    def line(x: float) -> float:
        return slope * x + intercept

    return line


def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    """Build a descending timestep schedule from 1 to 0.

    Args:
        num_steps: Number of steps; ``num_steps + 1`` values are returned so
            the schedule includes the final zero.
        image_seq_len: Value at which the shift parameter ``mu`` is estimated.
        base_shift: ``mu`` at the default lower anchor of the linear estimate.
        max_shift: ``mu`` at the default upper anchor of the linear estimate.
        shift: Whether to warp the linear schedule with ``time_shift``.

    Returns:
        The timesteps as a list of floats.
    """
    # extra step for zero
    timesteps = torch.linspace(1, 0, num_steps + 1)

    if not shift:
        return timesteps.tolist()

    # shifting the schedule to favor high timesteps for higher signal images:
    # estimate mu based on linear estimation between two points
    estimate_mu = get_lin_function(y1=base_shift, y2=max_shift)
    shifted = time_shift(estimate_mu(image_seq_len), 1.0, timesteps)
    return shifted.tolist()
def _generate_images(
    self,
    device: torch.device,
    dtype: torch.dtype,
    model: FluxModel,
    decoder: AutoEncoder,
    # image params:
    img_width: int,
    img_height: int,
    # sampling params:
    denoising_steps: int,
    seed: int,
    clip_encodings: torch.Tensor,
    t5_encodings: torch.Tensor,
    guidance: float = 4.0,
):
    """Denoise random latents with ``model`` and decode them to images.

    Args:
        device: Device for the timestep and guidance vectors.
        dtype: Dtype for the timestep and guidance vectors.
        model: Flow model called once per denoising step.
        decoder: Autoencoder whose ``decode`` maps latents to images.
        img_width: Requested image width, forwarded to the noise generator.
        img_height: Requested image height, forwarded to the noise generator.
        denoising_steps: Number of integration steps.
        seed: Seed forwarded to the noise generator.
        clip_encodings: CLIP conditioning; its first dimension is the batch size.
        t5_encodings: T5 conditioning; dimension 1 is the text sequence length.
        guidance: Guidance strength passed to the model at every step.

    Returns:
        The decoder output for the final latents.
    """
    bsz = clip_encodings.shape[0]
    latents = generate_noise_latent(bsz, img_height, img_width, device, dtype, seed)
    _, _, latent_height, latent_width = latents.shape

    # create positional encodings
    POSITION_DIM = 3  # constant for Flux flow model
    latent_pos_enc = create_position_encoding_for_latents(
        bsz, latent_height, latent_width, POSITION_DIM
    ).to(latents)
    text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)

    # convert img-like latents into sequences of patches
    latents = pack_latents(latents)

    # create denoising schedule. The shift is a function of the image
    # *sequence length*, i.e. dimension 1 of the packed latents the model
    # consumes -- not of the latent channel count.
    timesteps = get_schedule(denoising_steps, latents.shape[1], shift=True)

    # Sampling is inference only; without no_grad every step would extend the
    # autograd graph through the previous latents.
    with torch.no_grad():
        # this is ignored for schnell
        guidance_vec = torch.full((bsz,), guidance, device=device, dtype=dtype)
        for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
            t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
            pred = model(
                img=latents,
                img_ids=latent_pos_enc,
                txt=t5_encodings,
                txt_ids=text_pos_enc,
                y=clip_encodings,
                timesteps=t_vec,
                guidance=guidance_vec,
            )

            # Euler step from t_curr to t_prev (t_prev < t_curr).
            latents = latents + (t_prev - t_curr) * pred

        # convert sequences of patches into img-like latents
        latents = unpack_latents(latents, latent_height, latent_width)

        img = decoder.decode(latents)
    return img
+save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "flux" +flavor = "flux-debug" +norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm +# test tokenizer.model, for debug purpose only +# tokenizer_path = "./tests/assets/test_tiktoken.model" +# converters = "float8" + + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +lr_min = 0.0 + +[training] +batch_size = 32 +seq_len = 512 +max_norm = 1.0 # grad norm clipping +steps = 10 +compile = false +dataset = "cc12m" +guidance = 3.5 +seed = 0 + +[encoder] +t5_encoder="google/t5-v1_1-small" +clip_encoder="openai/clip-vit-large-patch14" +max_t5_encoding_len=512 +auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 1 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 1 +context_parallel_degree = 1 + +[experimental] +custom_args_module = "torchtitan.experiments.flux.flux_argparser" diff --git a/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py b/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..7dbabd1317a5923545f24c9a77feca46f5a92130 --- /dev/null +++ b/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py @@ -0,0 +1,630 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
def compute_reference_forward(x, w, m_sizes):
    """
    Reference PyTorch implementation of M*G grouped GEMM forward pass.

    Rows of ``x`` are split into consecutive groups of the given sizes and
    each group is multiplied by ``w.T``. Rows not covered by any group are
    left as zeros in the output.

    Args:
        x (torch.Tensor): Input tensor of shape (M, K)
        w (torch.Tensor): Weight tensor of shape (N, K)
        m_sizes (torch.Tensor | Sequence[int]): Group sizes, one per group (G)

    Returns:
        torch.Tensor: Output tensor of shape (M, N)
    """
    result = torch.zeros((x.shape[0], w.shape[0]), dtype=x.dtype, device=x.device)

    # Pull all group sizes to the host in one transfer instead of one
    # blocking .item() call per group.
    if isinstance(m_sizes, torch.Tensor):
        group_sizes = m_sizes.tolist()
    else:
        group_sizes = list(m_sizes)

    w_t = w.T
    m_start = 0
    for m_size in group_sizes:
        if m_size <= 0:
            # Empty group: contributes no rows and does not advance the offset.
            continue
        m_end = m_start + m_size
        result[m_start:m_end] = torch.matmul(x[m_start:m_end], w_t)
        m_start = m_end

    return result
ylabel="TFLOPS", # We'll measure TFLOPS + plot_name="mg_grouped_gemm_comparison", + args={ + "M": 8192, # Batch dimension, fixed for all tests + "K": 7168, # Hidden dimension, fixed for all tests + "G": 8, # Number of groups + "dtype": torch.float16, + "device": "cuda", + }, + ) +) +def benchmark_forward(M, K, N, G, provider, dtype=torch.float16, device="cuda"): + """ + Benchmark the forward pass of the grouped GEMM implementation. + + Args: + M (int): Total batch size dimension + K (int): Hidden dimension + N (int): Output dimension + G (int): Number of groups + provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel') + dtype (torch.dtype): Data type to use + device (str): Device to use + + Returns: + float: Performance in TFLOPS + """ + # Create group sizes for M dimension (balanced across groups) + base_size = M // G + remainder = M % G + M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)] + m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32) + + print(f"N: {N}, M: {M}, K: {K}, G: {G}, dtype: {dtype}, device: {device}") + + # Create input and weight tensors + x = torch.randn(M, K, dtype=dtype, device=device) + w = torch.randn(N, K, dtype=dtype, device=device) + + # Pre-compute for PyTorch reference to ensure fair comparison + if provider == "pytorch_reference": + # Warmup + torch.cuda.synchronize() + compute_reference_forward(x, w, m_sizes) + torch.cuda.synchronize() + + # Benchmark + start_time = time.time() + for _ in range(10): # Average over 10 runs + compute_reference_forward(x, w, m_sizes) + torch.cuda.synchronize() + end_time = time.time() + else: # Optimized kernel + # Warmup + torch.cuda.synchronize() + grouped_gemm_forward(x, w, m_sizes) + torch.cuda.synchronize() + + # Benchmark + start_time = time.time() + for _ in range(10): # Average over 10 runs + grouped_gemm_forward(x, w, m_sizes) + torch.cuda.synchronize() + end_time = time.time() + + # Calculate FLOPs + # For GEMM: 2 * M * N * K FLOPs 
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["G"],  # We'll vary the number of groups
        x_vals=[1, 2, 4, 8, 16],  # Different numbers of groups to test
        line_arg="provider",  # We'll compare different providers
        line_vals=["pytorch_reference", "optimized_kernel"],
        line_names=["PyTorch Reference", "Optimized Kernel"],
        styles=[("blue", "-"), ("red", "-")],
        ylabel="TFLOPS",  # We'll measure TFLOPS
        plot_name="mg_grouped_gemm_group_scaling",
        args={
            "M": 8192,  # Batch dimension, fixed for all tests
            "K": 4096,  # Hidden dimension, fixed for all tests
            "N": 8192,  # Output dimension, fixed for all tests
            "dtype": torch.float16,
            "device": "cuda",
        },
    )
)
def benchmark_forward_groups(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
    """
    Benchmark how performance scales with number of groups.

    Args:
        M (int): Total batch size dimension
        K (int): Hidden dimension
        N (int): Output dimension
        G (int): Number of groups
        provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
        dtype (torch.dtype): Data type to use
        device (str): Device to use

    Returns:
        float: Performance in TFLOPS
    """
    num_runs = 10  # Timed iterations averaged into the reported figure

    # Create group sizes for M dimension (balanced across groups); the first
    # ``remainder`` groups receive one extra row.
    base_size, remainder = divmod(M, G)
    M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
    m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)

    # Create input and weight tensors
    x = torch.randn(M, K, dtype=dtype, device=device)
    w = torch.randn(N, K, dtype=dtype, device=device)

    # Any provider other than the reference is treated as the optimized kernel.
    forward = (
        compute_reference_forward
        if provider == "pytorch_reference"
        else grouped_gemm_forward
    )

    # Warmup, fenced by synchronizations so it does not leak into the timing.
    torch.cuda.synchronize()
    forward(x, w, m_sizes)
    torch.cuda.synchronize()

    # perf_counter is monotonic and high resolution, unlike wall-clock time.
    start_time = time.perf_counter()
    for _ in range(num_runs):
        forward(x, w, m_sizes)
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Calculate FLOPs and TFLOPS (a multiply-add counts as 2 FLOPs)
    flops = 2 * M * N * K
    avg_time = (end_time - start_time) / num_runs
    tflops = flops / avg_time / 1e12

    return tflops
8192, # Batch dimension, fixed for all tests + "K": 4096, # Hidden dimension, fixed for all tests + "N": 8192, # Output dimension, fixed for all tests + "G": 4, # Number of groups + "dtype": torch.float16, + "device": "cuda", + }, + ) +) +def benchmark_imbalance( + M, K, N, G, group_balance, provider, dtype=torch.float16, device="cuda" +): + """ + Benchmark how performance is affected by imbalanced group sizes. + + Args: + M (int): Total batch size dimension + K (int): Hidden dimension + N (int): Output dimension + G (int): Number of groups + group_balance (float): Balance factor from 0 to 1 (0 = balanced, 1 = max imbalance) + provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel') + dtype (torch.dtype): Data type to use + device (str): Device to use + + Returns: + float: Performance in TFLOPS + """ + # Create imbalanced group sizes for M dimension + if group_balance == 0: + # Balanced case + base_size = M // G + remainder = M % G + M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)] + else: + # Imbalanced case + # First group gets more elements, last group gets fewer + # The imbalance is controlled by the group_balance factor + remaining = M + M_sizes = [] + for g in range(G): + # Interpolate from balanced to imbalanced based on group_balance + # For balanced (group_balance=0), each group gets M/G + # For imbalanced (group_balance=1), first group gets much more than last group + balanced_size = remaining // (G - g) + + # Adjusting size based on position and imbalance factor + # First groups get more, last groups get less + if g < G // 2: + # First half of groups get more + adjustment = int(balanced_size * group_balance * (1 - g / (G - 1))) + size = balanced_size + adjustment + else: + # Second half of groups get less + adjustment = int(balanced_size * group_balance * ((g / (G - 1)) - 0.5)) + size = balanced_size - adjustment + + # Ensure we don't go below 1 or take more than remaining + size = max(1, min(size, remaining)) + 
def benchmark_model_configs():
    """
    Benchmark common model configurations used in DeepSeek-like models.

    For every (M, K, N, G) configuration the PyTorch reference and the
    optimized kernel are each warmed up once and then timed over a fixed
    number of runs, recording average latency, TFLOPS and peak CUDA memory.

    Returns:
        list[dict]: One entry per configuration with the keys ``config``,
        ``dimensions``, ``pt_time_ms``, ``opt_time_ms``, ``pt_tflops``,
        ``opt_tflops``, ``speedup``, ``pt_memory_mb``, ``opt_memory_mb`` and
        ``memory_savings`` (percent).
    """
    # Model configurations: (M, K, N, G)
    configs = [
        (8192, 7168, 4096, 4),  # Config 1
        (8192, 2048, 7168, 4),  # Config 2
        (4096, 7168, 4096, 8),  # Config 3
        (4096, 2048, 7168, 8),  # Config 4
    ]
    num_runs = 10  # Timed iterations averaged per implementation

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16

    def _measure(forward, label, x, w, m_sizes):
        """Warm up ``forward``, then return (avg seconds per run, peak MB)."""
        torch.cuda.synchronize()
        forward(x, w, m_sizes)  # Warmup
        torch.cuda.synchronize()

        logging.info(f"Benchmarking {label}...")
        torch.cuda.reset_peak_memory_stats()
        # perf_counter is monotonic and high resolution, unlike wall-clock time.
        start_time = time.perf_counter()
        for _ in range(num_runs):
            forward(x, w, m_sizes)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        peak_mb = torch.cuda.max_memory_allocated() / (1024**2)  # MB
        return elapsed / num_runs, peak_mb

    results = []

    for config_idx, (M, K, N, G) in enumerate(configs):
        logging.info(f"\n===== Benchmarking DeepSeek Config {config_idx + 1} =====")
        logging.info(f"M={M}, K={K}, N={N}, G={G}")

        # Create group sizes for M dimension
        base_size, remainder = divmod(M, G)
        M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
        m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)

        # Create tensors
        x = torch.randn(M, K, dtype=dtype, device=device)
        w = torch.randn(N, K, dtype=dtype, device=device)

        pt_time, pt_memory = _measure(
            compute_reference_forward, "PyTorch reference", x, w, m_sizes
        )
        opt_time, opt_memory = _measure(
            grouped_gemm_forward, "optimized kernel", x, w, m_sizes
        )

        # Calculate FLOPs and speedup
        flops = 2 * M * N * K
        pt_tflops = flops / pt_time / 1e12
        opt_tflops = flops / opt_time / 1e12
        speedup = pt_time / opt_time

        # Store results
        results.append(
            {
                "config": f"Config {config_idx + 1}",
                "dimensions": f"M={M}, K={K}, N={N}, G={G}",
                "pt_time_ms": pt_time * 1000,
                "opt_time_ms": opt_time * 1000,
                "pt_tflops": pt_tflops,
                "opt_tflops": opt_tflops,
                "speedup": speedup,
                "pt_memory_mb": pt_memory,
                "opt_memory_mb": opt_memory,
                "memory_savings": (
                    (pt_memory - opt_memory) / pt_memory * 100 if pt_memory > 0 else 0
                ),
            }
        )

        logging.info(
            f"PyTorch Reference: {pt_time * 1000:.2f} ms, {pt_tflops:.2f} TFLOPS, {pt_memory:.2f} MB"
        )
        logging.info(
            f"Optimized Kernel: {opt_time * 1000:.2f} ms, {opt_tflops:.2f} TFLOPS, {opt_memory:.2f} MB"
        )
        logging.info(
            f"Speedup: {speedup:.2f}x, Memory savings: {results[-1]['memory_savings']:.2f}%"
        )

    # Print summary table
    logging.info("\n===== Benchmark Results Summary =====")
    logging.info(
        f"{'Config':<10} | {'Time (ms)':<20} | {'TFLOPS':<20} | {'Speedup':<10} | {'Memory (MB)':<20} | {'Memory Saved':<12}"
    )
    logging.info(
        f"{'':<10} | {'PyTorch':<9} {'Kernel':<9} | {'PyTorch':<9} {'Kernel':<9} | {'':<10} | "
        f"{'PyTorch':<9} {'Kernel':<9} | {'':<12}"
    )
    logging.info("-" * 100)

    for result in results:
        logging.info(
            f"{result['config']:<10} | "
            f"{result['pt_time_ms']:<9.2f} {result['opt_time_ms']:<9.2f} | "
            f"{result['pt_tflops']:<9.2f} {result['opt_tflops']:<9.2f} | "
            f"{result['speedup']:<10.2f} | "
            f"{result['pt_memory_mb']:<9.2f} {result['opt_memory_mb']:<9.2f} | "
            f"{result['memory_savings']:<12.2f}%"
        )

    return results
def compare_mg_implementations():
    """
    Combine the M*G and N*G benchmark results for comparison.

    Reads ``mg_grouped_gemm_benchmark_results.csv`` and
    ``ng_grouped_gemm_benchmark_results.csv`` from the working directory and
    saves a two-panel bar chart (speedup and optimized-kernel TFLOPS per
    configuration) to ``mg_vs_ng_comparison.png``. This is best effort: any
    failure (missing pandas, missing files, missing columns) is logged and
    swallowed rather than raised.
    """
    # Only run this if both NG and MG benchmarks have been run
    try:
        import pandas as pd

        # Try to load previous benchmark results
        mg_results = pd.read_csv("mg_grouped_gemm_benchmark_results.csv")
        ng_results = pd.read_csv("ng_grouped_gemm_benchmark_results.csv")

        # Create comparison plot
        _, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Tick labels follow the order in which configs appear in the M*G file.
        configs = mg_results["config"].unique()

        def _mean_per_config(frame, column):
            """Mean of ``column`` per config, ordered to match ``configs``."""
            # groupby sorts its keys, so realign to the tick-label order;
            # configs absent from ``frame`` become NaN (no bar drawn).
            return frame.groupby("config")[column].mean().reindex(configs)

        # Plot speedup comparison
        mg_speedups = _mean_per_config(mg_results, "speedup")
        ng_speedups = _mean_per_config(ng_results, "speedup")

        x = np.arange(len(configs))
        width = 0.35

        axes[0].bar(x - width / 2, mg_speedups, width, label="M*G Grouping")
        axes[0].bar(x + width / 2, ng_speedups, width, label="N*G Grouping")
        axes[0].set_xlabel("Model Configuration")
        axes[0].set_ylabel("Speedup (x)")
        axes[0].set_title("Speedup Comparison: M*G vs N*G")
        axes[0].set_xticks(x)
        axes[0].set_xticklabels(configs)
        axes[0].legend()
        axes[0].grid(axis="y", linestyle="--", alpha=0.7)

        # Plot TFLOPS comparison for optimized kernels
        mg_tflops = _mean_per_config(
            mg_results[mg_results["implementation"] == "optimized"], "tflops"
        )
        ng_tflops = _mean_per_config(
            ng_results[ng_results["implementation"] == "optimized"], "tflops"
        )

        axes[1].bar(x - width / 2, mg_tflops, width, label="M*G Grouping")
        axes[1].bar(x + width / 2, ng_tflops, width, label="N*G Grouping")
        axes[1].set_xlabel("Model Configuration")
        axes[1].set_ylabel("TFLOPS")
        axes[1].set_title("Performance Comparison: M*G vs N*G")
        axes[1].set_xticks(x)
        axes[1].set_xticklabels(configs)
        axes[1].legend()
        axes[1].grid(axis="y", linestyle="--", alpha=0.7)

        plt.tight_layout()
        plt.savefig("mg_vs_ng_comparison.png")
        logging.info("Comparison plot saved to 'mg_vs_ng_comparison.png'")

    except Exception as e:
        # Deliberately broad: the comparison is optional and must not abort
        # the rest of the benchmark run.
        logging.error(f"Could not create comparison plot: {e}")
        logging.info(
            "Run both M*G and N*G benchmarks first to generate comparison plots"
        )
@unittest.skipUnless(torch.cuda.is_available(), "M*G grouped GEMM tests require CUDA")
class TestMG_GroupedGEMM(unittest.TestCase):
    """Compare ``grouped_gemm_forward`` against a per-group PyTorch matmul."""

    def setUp(self) -> None:
        # Fixed seed so the random operands are reproducible across runs.
        torch.manual_seed(2020)

    def _run_grouped_gemm_test(
        self,
        shape: Tuple[int, int, int, int],
        device: torch.device,
        dtype: torch.dtype = torch.bfloat16,
        atol: float = 1e-5,
        rtol: float = 1.6e-2,
    ) -> None:
        """Run one (G, M, N, K) case and assert the kernel matches the reference.

        Args:
            shape: ``(G, M, N, K)``: group count, rows per group, output
                columns per group, and the shared inner dimension.
            device: Device on which the operands are created.
            dtype: Operand dtype; the kernel result is cast to it before
                comparison.
            atol: Absolute tolerance passed to ``torch.testing.assert_close``.
            rtol: Relative tolerance passed to ``torch.testing.assert_close``.
        """
        G, M, N, K = shape
        # In M*G grouping, input is [M*G, K] and weights are [N*G, K]
        a = torch.randn(M * G, K, dtype=dtype, device=device)
        b = torch.randn(N * G, K, dtype=dtype, device=device)

        # Create equal-sized groups for simplicity
        m_sizes = torch.full((G,), M, device=device, dtype=torch.int32)

        result = grouped_gemm_forward(a, b, m_sizes)
        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(tuple(result.shape), (M * G, N))

        expected_result = torch.zeros(M * G, N, dtype=dtype, device=device)
        m_start = 0
        # Convert the sizes to Python ints once so the slice bounds are not
        # 0-dim device tensors.
        for g, m_size in enumerate(m_sizes.tolist()):
            m_end = m_start + m_size
            b_slice = b[N * g : N * (g + 1), :]
            expected_result[m_start:m_end, :] = a[m_start:m_end, :] @ b_slice.T
            m_start = m_end

        # Convert result to match input dtype if needed
        result = result.to(dtype)
        torch.testing.assert_close(result, expected_result, atol=atol, rtol=rtol)

    def test_MG_grouped_gemm_bf16(self) -> None:
        for G in (1, 4, 16):
            for M in (128, 512, 1024):
                # subTest keeps the remaining shapes running after a failure.
                with self.subTest(G=G, M=M):
                    print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
                    self._run_grouped_gemm_test(
                        (G, M, 1024, 1024),
                        torch.device("cuda"),
                        dtype=torch.bfloat16,
                        atol=1e-5,
                        rtol=1.6e-2,
                    )

    def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
        """Test with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
            (4, 2048, 7168, 2048),
            (8, 512, 4096, 7168),
            (8, 512, 7168, 2048),
        ]

        device = torch.device("cuda")

        for shape in deepseek_shapes:
            G, M, N, K = shape
            with self.subTest(G=G, M=M, N=N, K=K):
                print(f"Testing BF16 M*G Deepseek shape: G={G}, M={M}, N={N}, K={K}")
                self._run_grouped_gemm_test(
                    shape, device, dtype=torch.bfloat16, atol=1e-5, rtol=1.6e-2
                )
def parallelize_llama(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
) -> nn.Module:
    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism to the model.

    The steps run in a fixed order: TP (including the MoE-specific plan),
    activation checkpointing, per-block compile, then FSDP/HSDP or DDP.

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.

    Args:
        model: Model to parallelize.
        world_mesh: Device mesh; indexed here by the "tp", "dp_replicate" and
            "dp_shard_cp" dimension names.
        parallel_dims: Flags describing which parallelism dimensions are enabled.
        job_config: Job configuration (parallelism, training, float8 and
            activation-checkpoint settings are read).

    Returns:
        The same ``model`` object that was passed in.

    Raises:
        RuntimeError: If async TP is requested without ``training.compile``, or
            if DDP is requested on a mesh with more than one dimension.
        ValueError: If selective activation checkpointing is combined with
            FlexAttention.
    """

    if parallel_dims.tp_enabled:
        if (
            job_config.parallelism.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")

        enable_float8_linear = "float8" in job_config.model.converters
        float8_is_rowwise = job_config.float8.recipe_name in (
            "rowwise",
            "rowwise_with_gw_hp",
        )

        # For now, float8 all-gather with TP is only supported for tensorwise
        # float8 scaling recipes. For rowwise recipes, we use regular TP and
        # all-gather happens in high precision.
        enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise

        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=parallel_dims.loss_parallel_enabled,
            enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
            enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
        )

        # The MoE sub-modules get their own TP plan on the same "tp" mesh.
        apply_moe_tp(model, world_mesh["tp"])

    if job_config.activation_checkpoint.mode != "none":
        if (
            job_config.activation_checkpoint.mode == "selective"
            and job_config.model.use_flex_attn
        ):
            raise ValueError(
                "FlexAttention is not compatible with selective AC yet. "
                "See https://github.com/pytorch/pytorch/issues/147879"
            )
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-TransformerBlock compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

        # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
        torch._dynamo.config.capture_scalar_outputs = True

    if (
        parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
    ):  # apply FSDP or HSDP, potentially with Context Parallel
        # HSDP adds the replicate dimension in front of the shard dimension.
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        # Pure replication (no sharding / CP): fall back to DDP, 1-D mesh only.
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP has not supported > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
        )

    return model
transformer_block in model.layers.items(): + moe_layer_plan = { + # input / output sharding on the seqlen dim + # all-gather for input, reduce-scatter for output + "moe": PrepareModuleInputOutput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + use_local_input=True, + output_layouts=(Partial(),), + desired_output_layouts=(Shard(1),), + ), + # replicate computation for the router + "moe.router.gate": NoParallel(), + # input Replicate, output Partial + "moe.experts": TensorParallel(), + "moe.shared_expert": TensorParallel(), + } + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=moe_layer_plan, + ) diff --git a/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc b/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75650d462a816282cc77f1f0feb67300f3c708e6 Binary files /dev/null and b/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc differ diff --git a/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc b/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98abd5c8366e3acbf5fc8e031f0dcb5ba3225180 Binary files /dev/null and b/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc differ diff --git a/torchtitan/experiments/llama4/model/moe.py b/torchtitan/experiments/llama4/model/moe.py new file mode 100644 index 0000000000000000000000000000000000000000..0b925b36207875dedc13a16be10890c3671cdabb --- /dev/null +++ b/torchtitan/experiments/llama4/model/moe.py @@ -0,0 +1,228 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.nn.functional as F +from torch import nn + +from .args import TransformerModelArgs + + +class GroupedExperts(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + num_experts: int, + ): + super().__init__() + self.num_experts = num_experts + self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim)) + self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim)) + self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim)) + + def forward( + self, + x: torch.Tensor, + num_local_tokens_per_expert: torch.Tensor | None = None, + ) -> torch.Tensor: + if num_local_tokens_per_expert is not None: + # a tuple of tensors indexed by experts + # each with shape (tokens_per_expert(varying), dim) + x = torch.split( + x, + split_size_or_sections=num_local_tokens_per_expert.tolist(), + dim=0, + ) + out_experts_splits = [] + for expert_idx, x_expert in enumerate(x): + w1, w2, w3 = ( + self.w1[expert_idx], + self.w2[expert_idx], + self.w3[expert_idx], + ) + h = F.silu(torch.matmul(x_expert, w1)) + h = h * torch.matmul(x_expert, w3) + h = torch.matmul(h, w2) + # h shape (tokens_per_expert(varying), dim) + out_experts_splits.append(h) + out = torch.cat(out_experts_splits, dim=0) + + # TODO:optimize with GroupedGEMM + # https://github.com/pytorch/pytorch/pull/150374 + # _gouped_mm requires shapes to be multiple of 8 + # offsets = torch.cumsum(num_local_tokens_per_expert, dim=0, dtype=torch.int32) + # h = F.silu(torch._grouped_mm(x, self.w1.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)) + # h = h * torch._grouped_mm(x, self.w3.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16) + # out = torch._grouped_mm(h, self.w2.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16) + else: + # x shape (num_experts, tokens_per_expert, dim) + h = F.silu(torch.bmm(x, self.w1)) + h = h * torch.bmm(x, self.w3) + # out shape (num_experts, tokens_per_expert, dim) + out = torch.bmm(h, self.w2) + return out + + def 
init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) + nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std) + + +class TokenChoiceTopKRouter(nn.Module): + """This class implements token-choice routing. In token-choice top-K routing, each token is + routed to top K experts based on the router scores. + + Args: + gate (nn.Module): Gate module to calculate the scores, typically nn.Linear(dim, num_experts). + dim (int): Dimension of input tokens. + num_experts (int): Number of experts in each moe layer. + top_k (int): Number of experts each token will be routed to in token-choice routing. + use_sigmoid (bool): Whether to use sigmoid or softmax for router scores. Default is False. + """ + + def __init__( + self, + dim: int, + num_experts: int, + top_k: int, + use_sigmoid: bool = False, + ): + super().__init__() + self.gate = nn.Linear(dim, num_experts, bias=False) + self.num_experts = num_experts + self.top_k = top_k + self.use_sigmoid = use_sigmoid + + def forward( + self, x: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + x (torch.Tensor): Input tensor with shape ``(bs*slen, dim)``. + + Returns: + routed_input (torch.Tensor): + Tokens grouped together by experts indices with shape ``(bs*slen*top_k,)``. + token_indices (torch.Tensor): + Token indices for routed_input with shape ``(bs*slen*top_k,)``. + num_local_tokens_per_expert (torch.Tensor): + Number of tokens assigned to each expert with shape ``(num_experts,)``. 
# TODO: implement load balancing auxiliary loss for token-choice routing
class MoE(nn.Module):
    """Mixture-of-experts layer with token-choice top-k routing.

    Tokens are scored by the router, gathered per expert, run through the
    grouped experts and scatter-added back to their original positions. When
    ``model_args.use_shared_expert`` is set, a single shared expert processes
    every token and its output is the base the routed outputs are added to.

    Args:
        model_args (TransformerModelArgs): Reads ``dim``,
            ``ffn_dim_multiplier``, ``num_experts``, ``top_k``,
            ``use_shared_expert``, ``auto_scale_hidden_dim`` and
            ``multiple_of``.
    """

    def __init__(self, model_args: TransformerModelArgs):
        super().__init__()
        dim = model_args.dim
        hidden_dim = 4 * model_args.dim
        ffn_dim_multiplier = model_args.ffn_dim_multiplier
        hidden_dim = int(2 * hidden_dim / 3)
        if ffn_dim_multiplier is not None:
            hidden_dim = int(ffn_dim_multiplier * hidden_dim)

        num_experts = model_args.num_experts

        if model_args.auto_scale_hidden_dim:
            # Divide the hidden size by the number of experts each token
            # passes through (top_k routed experts plus the shared one).
            hidden_dim_denom = model_args.top_k + int(model_args.use_shared_expert)
            hidden_dim = int(hidden_dim / hidden_dim_denom)
        # Round up to the next multiple of ``multiple_of``.
        hidden_dim += -hidden_dim % model_args.multiple_of

        self.experts = GroupedExperts(
            dim=dim, hidden_dim=hidden_dim, num_experts=num_experts
        )
        self.router = TokenChoiceTopKRouter(
            dim=dim, num_experts=num_experts, top_k=model_args.top_k
        )
        self.shared_expert = (
            GroupedExperts(dim=dim, hidden_dim=hidden_dim, num_experts=1)
            if model_args.use_shared_expert
            else None
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.

        Returns:
            out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
        """
        bs, slen, dim = x.shape
        # reshape (not view) so that non-contiguous inputs are accepted on
        # every path, including the gather below.
        x_flat = x.reshape(bs * slen, dim)

        # top_scores and token_indices shape (bs*slen*top_k,)
        # num_local_tokens_per_expert shape (num_experts,)
        (
            top_scores,
            token_indices,
            num_local_tokens_per_expert,
        ) = self.router(x_flat)

        # shape (bs*slen*top_k, dim)
        token_indices = token_indices.reshape(-1, 1).expand(-1, dim)

        # shape (bs*slen*top_k, dim)
        routed_input = torch.gather(
            x_flat,
            dim=0,
            index=token_indices,
        )
        # Router scores are applied to the expert inputs, before the experts run.
        routed_input = routed_input * top_scores.reshape(-1, 1)

        # shape (bs*slen*top_k, dim)
        routed_output = self.experts(routed_input, num_local_tokens_per_expert)

        # shared expert
        if self.shared_expert is not None:
            out = self.shared_expert(x.reshape(1, bs * slen, dim)).reshape(
                bs * slen, dim
            )
        else:
            out = torch.zeros_like(x_flat)

        # Add every routed output back onto the row of the token it came from.
        out = out.scatter_add(dim=0, index=token_indices, src=routed_output)
        out = out.reshape(bs, slen, dim)
        return out

    def init_weights(self, init_std: float):
        """Initialize the experts, the router and (if present) the shared expert."""
        self.experts.init_weights(init_std)
        self.router.init_weights(init_std)
        if self.shared_expert is not None:
            self.shared_expert.init_weights(init_std)
#!/usr/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Launch convert_hf_to_dcp_with_gpus.py under torchrun on a single node.
# Any command-line arguments are forwarded to the Python script as config
# overrides.

set -ex

# use envs as local overrides for convenience
# e.g.
# LOG_RANK=0,1 NGPU=4 ./convert_hf_to_dcp_with_gpus.sh
NGPU=${NGPU:-"8"}
LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
CONFIG_FILE=${CONFIG_FILE:-"../train_configs/llama4_17bx16e.toml"}

# Expansions are quoted so paths with spaces survive, and "$@" forwards each
# override as its own argument instead of re-splitting a flattened string.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nproc_per_node="${NGPU}" --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter "${LOG_RANK}" --role rank --tee 3 \
convert_hf_to_dcp_with_gpus.py --job.config_file "${CONFIG_FILE}" "$@"

import math
import os
import time
from dataclasses import dataclass
from typing import Any, Optional

import torch
import torch.distributed as dist
from torch.distributed.tensor import DeviceMesh, distribute_tensor, DTensor, Shard
from torch.distributed.tensor._utils import compute_local_shape_and_global_offset
from torchtitan.components.checkpoint import MODEL
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import init_logger, logger
from torchtitan.train import Trainer

# Sharding dims for MP checkpoints

# Name fragments that get_shard_dim() maps to shard dimension 0.
column_parallel = [
    "tok_embeddings",
    "wq",
    "wk",
    "wv",
    "wqkv",
    "w_in_shared_FD",
    "w_out_eF_D",
    "w_swiglu_FD",
    "output",
    "_linear",
    "c_fc",
    "vision_projection",
]

# Name fragments that get_shard_dim() maps to shard dimension 1.
# They are only consulted when no column_parallel fragment matched.
row_parallel = [
    "wo",
    "w_out_shared_DF",
    "w_in_eD_F",
    "moe_w_swiglu_eD_F",
    "c_proj",
]


def convert_to_titan_fqns(fqn: str) -> list[str]:
    """Map a stored checkpoint key to the TorchTitan key(s) it feeds.

    A fused ``wqkv`` key (other than its ``layer_norm_weight``) expands to
    three keys with ``wqkv`` replaced by ``wq``, ``wk`` and ``wv`` in that
    order. Every other key maps to itself.

    Args:
        fqn: Key as stored in the checkpoint.

    Returns:
        List with either three keys (fused qkv) or the single input key.
    """
    # From the stored checkpoint keys to TorchTitan keys.
    if "wqkv" in fqn and "layer_norm_weight" not in fqn:
        return [fqn.replace("wqkv", k) for k in ("wq", "wk", "wv")]
    return [fqn]


def get_shard_dim(fqn: str) -> Optional[int]:
    """Return the dimension along which the stored shards of ``fqn`` are split.

    Args:
        fqn: Key as stored in the checkpoint.

    Returns:
        ``0`` or ``1`` for a sharded tensor, ``None`` when the name matches
        no sharding rule (the caller then uses a single shard as-is).
    """
    if "bias" in fqn:
        # Some bias params are still sharded
        if "resblocks" in fqn and any(
            k in fqn for k in ("wq", "wk", "wv", "c_fc")
        ):
            return 0
        return None
    # column_parallel is checked first, so it wins when both lists match.
    if any(x in fqn for x in column_parallel):
        return 0
    if any(x in fqn for x in row_parallel):
        return 1
    return None


def split_fused_qkv(
    shards: list[torch.Tensor],
    split_sizes: tuple[int, ...] = (640, 128, 128),
) -> tuple[torch.Tensor, ...]:
    """Split each fused shard along dim 0 and re-join the pieces per section.

    Every shard is split into ``len(split_sizes)`` sections; the i-th
    sections of all shards are then concatenated along dim 0, in shard order.

    Args:
        shards: One fused tensor per shard file.
        split_sizes: Per-shard size of each section along dim 0. The default
            ``(640, 128, 128)`` yields ``(q, k, v)``.

    Returns:
        One concatenated tensor per entry of ``split_sizes``.
    """
    pieces = [torch.split(shard, list(split_sizes)) for shard in shards]
    return tuple(
        torch.cat([piece[i] for piece in pieces], dim=0)
        for i in range(len(split_sizes))
    )


@dataclass
class _Assignment:
    """Work description for one loader in one loading round."""

    loader_id: int  # index of the loader this assignment belongs to
    filename: str  # path of the shard file associated with that loader
    fqns: tuple[str, ...]  # stored checkpoint keys handled in this round
    shapes: tuple[torch.Size, ...]  # shape recorded for each key in ``fqns``
    dtypes: tuple[torch.dtype, ...]  # dtype recorded for each key in ``fqns``
+ + +@dataclass +class _AssignmentRound: + loader_assignments: dict[int, _Assignment] # List of assignments for each loader + + +class CheckpointConverter: + TOTAL_SHARDS = 8 + + def __init__( + self, + process_group: dist.ProcessGroup, + path: str, + loader_every_n_ranks: int = 8, + ) -> None: + self.path = path + self.pg = process_group + self.my_rank = dist.get_rank(self.pg) + self.loader_every_n_ranks = loader_every_n_ranks + self.loader_id = self.my_rank // loader_every_n_ranks + self.should_load = ( + self.my_rank % loader_every_n_ranks == 0 + and self.loader_id < CheckpointConverter.TOTAL_SHARDS + ) + self.total_loader = CheckpointConverter.TOTAL_SHARDS + self.titan_fqn_to_stored_fqn: dict[str, str] = {} + self.stored_fqn_to_titan_fqn: dict[str, list[str]] = {} + self.total_send_bytes = 0 + self.total_recv_bytes = 0 + + def convert(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + begin = time.time() + self._load_metadata() + self._create_fqn_mappings(state_dict) + rounds = self._get_load_assignments(state_dict) + + for assignments in rounds: + loader_assignments = assignments.loader_assignments + loaded_state_dict = None + # Let each loader to load its own data and move to its GPU. + for i in range(self.total_loader): + # This loader doesn't have any loading assignment for this round. + if i not in loader_assignments: + continue + # This rank is not the loader + if i != self.loader_id or not self.should_load: + continue + loaded_state_dict = self._load_round(loader_assignments[i]) + + results = [] + for i in range(self.total_loader): + if i not in loader_assignments: + continue + + if i == self.loader_id and self.should_load: + # This rank is the loader. It needs to send the loaded data to + # the other ranks. 
+ assert loaded_state_dict is not None + results.append( + self._reshard_send(loader_assignments[i], loaded_state_dict) + ) + else: + results.append( + self._reshard_receive(loader_assignments[i], state_dict) + ) + + self._reshard(results, state_dict) + + torch.cuda.synchronize() + logger.info(f"Checkpoint conversion took {time.time() - begin:.2f} seconds.") + logger.info(f"Total send bytes: {self.total_send_bytes / 1e9:.2f} GB") + logger.info(f"Total recv bytes: {self.total_recv_bytes / 1e9:.2f} GB") + return state_dict + + def _get_file_path(self, loader_id: int) -> str: + return os.path.join(self.path, f"consolidated.0{loader_id}.pth") + + def _load_metadata(self) -> None: + if not self.should_load: + self.read_dict = {} + return + self.read_dict = torch.load( + self._get_file_path(self.loader_id), + mmap=True, + weights_only=False, + ) + + def _create_fqn_mappings(self, state_dict: dict[str, torch.Tensor]) -> None: + if not self.read_dict: + return + + # Create the mapping from the stored checkpoint keys to TorchTitan keys. 
+ for fqn in list(self.read_dict.keys()): + titan_fqns = convert_to_titan_fqns(fqn) + # We don't know how to process _extra_state + if "_extra_state" in fqn: + self.read_dict.pop(fqn) + continue + + if titan_fqns[0] not in state_dict: + for titan_fqn in titan_fqns: + assert titan_fqns[0] not in state_dict + self.read_dict.pop(fqn) + continue + self.stored_fqn_to_titan_fqn[fqn] = titan_fqns + for titan_fqn in titan_fqns: + self.titan_fqn_to_stored_fqn[titan_fqn] = fqn + + assert set(state_dict.keys()) == set(self.titan_fqn_to_stored_fqn.keys()), ( + set(state_dict.keys()) - set(self.titan_fqn_to_stored_fqn.keys()), + set(self.titan_fqn_to_stored_fqn.keys()) - set(state_dict.keys()), + ) + + def _get_load_assignments( + self, state_dict: dict[str, torch.Tensor] + ) -> list[_AssignmentRound]: + if self.my_rank == 0: + rounds: list[_AssignmentRound] = [] + size = 0 + fqns = [] + shapes = [] + dtypes = [] + + # All loader must load all the FQNs because the checkpoint is purely TP sharded. + all_keys = list(self.read_dict.keys()) + for fqn in all_keys: + fqns.append(fqn) + shapes.append(self.read_dict[fqn].shape) + dtypes.append(self.read_dict[fqn].dtype) + size += self.read_dict[fqn].numel() * self.read_dict[fqn].element_size() + if size < 1e9 and fqn != all_keys[-1]: + continue + + logger.info(f"Adding {fqns} to round {len(rounds)}") + round_assignment = _AssignmentRound(loader_assignments={}) + for loader_id in range(self.total_loader): + path = self._get_file_path(loader_id) + round_assignment.loader_assignments[loader_id] = _Assignment( + filename=path, + fqns=tuple(fqns), + shapes=tuple(shapes), + dtypes=tuple(dtypes), + loader_id=loader_id, + ) + rounds.append(round_assignment) + size = 0 + fqns.clear() + shapes.clear() + dtypes.clear() + + object_list: list[Any] = [ + rounds, + self.titan_fqn_to_stored_fqn, + self.stored_fqn_to_titan_fqn, + ] + else: + object_list = [None, None, None] + + dist.broadcast_object_list(object_list, src=0, group=self.pg) + rounds = 
object_list[0] + self.titan_fqn_to_stored_fqn = object_list[1] + self.stored_fqn_to_titan_fqn = object_list[2] + return rounds + + def _load_round(self, assignment: _Assignment) -> dict[str, torch.Tensor]: + ret = {} + assert self.read_dict + for fqn in assignment.fqns: + ret[fqn] = self.read_dict[fqn].to(device="cuda") + return ret + + def _reshard_send( + self, + assignment: _Assignment, + loaded_state_dict: dict[str, torch.Tensor], + ) -> dict[str, torch.Tensor]: + flatten_tensors = [t.flatten() for t in loaded_state_dict.values()] + flatten_tensor = torch.concat(flatten_tensors) + assert self.loader_id == assignment.loader_id + rank = self.loader_id * self.loader_every_n_ranks + assert rank == self.my_rank + logger.info(f"Sending {assignment.filename} from {rank} {self.loader_id}") + logger.info(f"Sending {assignment.fqns}") + dist.broadcast(flatten_tensor, src=rank, group=self.pg) + self.total_send_bytes += flatten_tensor.numel() * flatten_tensor.element_size() + return loaded_state_dict + + def _reshard_receive( + self, assignment: _Assignment, state_dict: dict[str, torch.Tensor] + ) -> dict[str, torch.Tensor]: + flatten_tensor = torch.empty( + sum(math.prod(s) for s, d in zip(assignment.shapes, assignment.dtypes)), + dtype=assignment.dtypes[0], + device="cuda", + ) + rank = assignment.loader_id * self.loader_every_n_ranks + dist.broadcast(flatten_tensor, src=rank, group=self.pg) + self.total_recv_bytes += flatten_tensor.numel() * flatten_tensor.element_size() + + ret: dict[str, torch.Tensor] = {} + loc = 0 + for fqn, shape, dtype in zip( + assignment.fqns, assignment.shapes, assignment.dtypes + ): + n_ele = math.prod(shape) + ret[fqn] = flatten_tensor[loc : loc + n_ele].view(shape) + loc += n_ele + return ret + + def _reshard( + self, + results: list[dict[str, torch.Tensor]], + state_dict: dict[str, torch.Tensor], + ) -> None: + def _inplace_copy(fqn: str, full_tensors: tuple[torch.Tensor, ...]): + titan_fqns = self.stored_fqn_to_titan_fqn[fqn] + assert 
len(titan_fqns) == len(full_tensors) + for titan_fqn, full_tensor in zip(titan_fqns, full_tensors): + dtensor = state_dict[titan_fqn] + logger.info(f"{titan_fqn} {full_tensor.sum()}") + assert isinstance(dtensor, DTensor) + shape, offset = compute_local_shape_and_global_offset( + full_tensor.shape, dtensor.device_mesh, dtensor.placements + ) + slices = [ + slice(cur_offset, cur_offset + cur_shape) + for cur_shape, cur_offset in zip(shape, offset) + ] + logger.info( + f"Copying {titan_fqn} with {slices=} {dtensor._local_tensor.shape=} " + f"{shape=} {offset=} {self.my_rank=} {dtensor.shape=} {full_tensor.shape=} " + f"{dtensor.placements=} {dtensor.device_mesh=} " + ) + dtensor.to_local().copy_(full_tensor[slices]) + + def _concat_shards(fqn, shards: list[torch.Tensor]) -> tuple[torch.Tensor, ...]: + if "wqkv" in fqn: + if "layer_norm" in fqn: + return (shards[0],) + return split_fused_qkv(shards) + + shard_dim = get_shard_dim(fqn) + if shard_dim is None: + return (shards[0],) + return (torch.cat(shards, dim=shard_dim),) + + fqns = list(results[0].keys()) + for result in results: + assert list(result.keys()) == fqns + + for fqn in fqns: + full_tensors = _concat_shards(fqn, [result[fqn] for result in results]) + _inplace_copy(fqn, full_tensors) + + +def _create_verified_state_dict( + pg: dist.ProcessGroup, mesh: DeviceMesh +) -> dict[str, torch.Tensor]: + placements = [Shard(0)] + state_dict = { + "tok_embeddings.weight": torch.rand( + 25256 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.attention.wqkv.layer_norm_weight": torch.rand( + 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.attention.wq.weight": torch.rand( + 640 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.attention.wk.weight": torch.rand( + 128 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.attention.wv.weight": torch.rand( + 128 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.attention.wo.weight": torch.rand( + 5120, 640 
* 8, device="cuda", dtype=torch.bfloat16 + ), + # "layers.47.feed_forward.router_DE": torch.rand(5120, 128, device="cuda", dtype=torch.bfloat16), + # "layers.47.feed_forward.running_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16), + # "layers.47.feed_forward.global_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16), + "layers.47.feed_forward.w_in_shared_FD.weight": torch.rand( + 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.w_out_shared_DF.weight": torch.rand( + 5120, 1024 * 8, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.w_swiglu_FD.weight": torch.rand( + 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.norm.weight": torch.rand( + 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.experts.moe_w_in_eD_F": torch.rand( + 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.experts.moe_w_out_eF_D": torch.rand( + 131072 * 8, 5120, device="cuda", dtype=torch.bfloat16 + ), + "layers.47.feed_forward.experts.moe_w_swiglu_eD_F": torch.rand( + 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16 + ), + } + return {k: distribute_tensor(v, mesh, placements) for k, v in state_dict.items()} + + +def _verify_state_dict( + state_dict: dict[str, torch.Tensor], path: str, rank: int +) -> None: + stored_state_dicts = [ + torch.load( + os.path.join(path, f"consolidated.0{i}.pth"), + map_location="cpu", + weights_only=False, + mmap=True, + ) + for i in range(8) + ] + + def read_and_verify_tensor(fqn: str, dtensor: DTensor) -> None: + logger.info(f"Verifying {fqn} {dtensor.shape=} {dtensor.placements=} ") + shards = [stored_state_dicts[i][fqn] for i in range(8)] + full_tensor = dtensor.full_tensor() + logger.info(f"Gather {fqn} {full_tensor.shape} completely.") + + if rank > 0: + return + + if len(shards[0].shape) == 1: + assert full_tensor.shape == shards[0].shape, fqn + assert 
torch.allclose(shards[0].to(device="cuda"), full_tensor), fqn + return + elif shards[0].shape[0] == full_tensor.shape[0]: + concat_shards = torch.cat(shards, dim=1) + logger.info(f"Load {fqn} completely.") + elif shards[0].shape[1] == full_tensor.shape[1]: + concat_shards = torch.cat(shards, dim=0) + logger.info(f"Load {fqn} completely.") + + concat_shards = concat_shards.to(device="cuda") + logger.info(f"Move to GPU {fqn} completely.") + + assert concat_shards.shape == full_tensor.shape, fqn + assert concat_shards.dtype == full_tensor.dtype, fqn + assert concat_shards.device == full_tensor.device, fqn + assert torch.allclose(concat_shards, full_tensor), fqn + + for k, v in state_dict.items(): + if "wq" in k and "wqkv" not in k: + pass + elif "wk" in k: + pass + elif "wv" in k: + pass + else: + assert v is not None, k + read_and_verify_tensor(k, v) + + +if __name__ == "__main__": + init_logger() + config = JobConfig() + config.parser.add_argument( + "--checkpoint.convert_path", + type=str, + default="", + help="""Specify the path of the target checkpoint to convert.""", + ) + config.parser.add_argument( + "--checkpoint.convert_load_every_n_ranks", + type=int, + default=8, + help=""" + Specify the interval at which ranks are assigned to load checkpoints. + + For example, if this number is 4, then ranks 0, 4, 8, ... will load the + checkpoint. Each loader is responsible for loading one file. If there + are more loaders than files, only the first few loaders will be assigned + to load the checkpoint. The default value is 8. + """, + ) + config.parser.add_argument( + "--checkpoint.fake_model", + action="store_true", + help="""If true, the model will be fake.""", + ) + config.parse_args() + assert config.checkpoint.convert_path != "" + + trainer: Optional[Trainer] = None + + try: + trainer = Trainer(config) + if os.path.exists(trainer.checkpointer.folder): + raise RuntimeError( + "The checkpoint folder already exists. Abort to avoid overwriting " + f"the checkpoint. 
{trainer.checkpointer.folder=}" + ) + if config.checkpoint.fake_model: + state_dict = _create_verified_state_dict( + trainer.world_mesh.get_group(), trainer.world_mesh + ) + else: + state_dict = trainer.checkpointer.states[MODEL].state_dict() + + size = 0 + for v in state_dict.values(): + size += v.numel() * v.element_size() + logger.info(f"Total size of the model: {size / 1e9:.2f} GB") + + # Do not support PP yet, we will need to iterate over the PP dimension and + # extract the corresponding state_dict and device_mesh. + if "freq_cis" in state_dict: + state_dict.pop("freqs_cis") + + state_dict = CheckpointConverter( + process_group=trainer.world_mesh.get_group(), + path=config.checkpoint.convert_path, + loader_every_n_ranks=config.checkpoint.convert_load_every_n_ranks, + ).convert(state_dict) + + class DummyModel: + def __init__(self, state_dict: dict[str, torch.Tensor]) -> None: + self._state_dict = state_dict + + def state_dict(self) -> dict[str, torch.Tensor]: + return self._state_dict + + if config.checkpoint.fake_model: + begin = time.time() + _verify_state_dict( + state_dict, + config.checkpoint.convert_path, + trainer.world_mesh.get_rank(), + ) + dist.barrier() + logger.info(f"Verifies state_dict {time.time() - begin}.") + else: + # oh, this is pretty bad, when can we get rid of the freqs_cis issue? 
+ state_dict["freqs_cis"] = None + trainer.checkpointer.states[MODEL] = DummyModel(state_dict) + trainer.checkpointer.model_weights_only = True + trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype + trainer.checkpointer.save(curr_step=0, force=True) + time.sleep(2) + finally: + pass diff --git a/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh new file mode 100644 index 0000000000000000000000000000000000000000..f3fd310934b1181ed83fa9fc4463f0c2336b46fc --- /dev/null +++ b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh @@ -0,0 +1,25 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +# use envs as local overrides for convenience +# e.g. +# LOG_RANK=0,1 NGPU=4 ./convert_meta_to_dcp_with_gpus.sh +NGPU=${NGPU:-"8"} +LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7} +CONFIG_FILE=${CONFIG_FILE:-"../train_configs/llama4_17bx16e.toml"} + +overrides="" +if [ $# -ne 0 ]; then + overrides="$*" +fi + +PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \ +torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ +--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ +convert_meta_to_dcp_with_gpus_meta.py --job.config_file ${CONFIG_FILE} $overrides diff --git a/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc b/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcece22f4ab407a7702d83af4b95ac824c05cc45 Binary files /dev/null and b/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc differ diff --git a/torchtitan/experiments/multimodal/tests/__init__.py b/torchtitan/experiments/multimodal/tests/__init__.py 
new file mode 100644 index 0000000000000000000000000000000000000000..2e41cd717f6a439a9c08d76a9d0e4a54e190fc5a --- /dev/null +++ b/torchtitan/experiments/multimodal/tests/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchtitan/experiments/multimodal/tests/test_multimodal_model.py b/torchtitan/experiments/multimodal/tests/test_multimodal_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b5acc51bb3d186674267a4fc47d9075f04122a60 --- /dev/null +++ b/torchtitan/experiments/multimodal/tests/test_multimodal_model.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import pytest +import torch + +from torchtitan.experiments.llama_multimodal import ( + ModelArgs, + MultimodalDecoder, + VisionEncoder, +) + +from .test_utils import fixed_init_model, fixed_init_tensor + + +@pytest.fixture +def encoder_config(): + return ModelArgs( + encoder_embed_dim=32, + encoder_num_layers=2, + encoder_num_heads=4, + tile_size=49, + patch_size=9, + max_num_tiles=4, + in_channels=3, + return_intermediates=[0, 1], + num_layers_projection=2, + decoder_embed_dim=128, + ) + + +@pytest.fixture +def decoder_config(): + return ModelArgs( + decoder_embed_dim=512, + vocab_size=10000, + fusion_interval=2, + num_special_tokens=3, + decoder_num_layers=6, + decoder_num_heads=8, + decoder_num_kv_heads=4, + max_seq_len=512, + rope_theta=50000.0, + ) + + +class TestMultimodalModelVisionEncoder: + @pytest.fixture(autouse=True) + def setup_class(self, encoder_config): + self.model_args = encoder_config + self.batch_size = 1 + self.num_imgs = 2 + self.num_tiles = 4 + self.aspect_ratio = 
torch.tensor([[1, 3], [2, 2]]).reshape( + self.batch_size, self.num_imgs, 2 + ) + image = torch.rand( + ( + self.batch_size, + self.num_imgs, + self.num_tiles, + self.model_args.in_channels, + self.model_args.tile_size, + self.model_args.tile_size, + ) + ) + self.image = fixed_init_tensor(image.shape, min_val=-1, max_val=1) + + def test_llama_mm_vision_encoder(self): + model = VisionEncoder(self.model_args) + fixed_init_model(model, min_val=-1, max_val=1) + output = model(self.image, self.aspect_ratio) + expected_shape = ( + self.batch_size, + self.num_imgs * self.num_tiles * (model.vit.patches_per_tile + 1), + self.model_args.decoder_embed_dim, + ) + assert ( + output.shape == expected_shape + ), f"Expected shape {expected_shape}, but got {output.shape}" + + # TODO: Need to ensure numerical stability before doing convergence test. + # output.mean() = 3.994, we need to debug why it is not close to 5.28800, which is + # the test value from the original torch tune test + # assert torch.allclose( + # output.mean(), torch.tensor(5.28800), atol=1e-3, rtol=1e-3 + # ) + + +class TestMultimodalModelDecoder: + @pytest.fixture(autouse=True) + def setup_class(self, decoder_config): + self.model_args = decoder_config + self.batch_size = 1 + self.decoder_embed_dim = self.model_args.decoder_embed_dim + self.vocab_size = self.model_args.vocab_size + self.seq_len = 128 + self.input = { + "tokens": torch.arange(self.batch_size * self.seq_len).reshape( + self.batch_size, self.seq_len + ), + "encoder_input": fixed_init_tensor( + (self.batch_size, self.seq_len, self.decoder_embed_dim), + min_val=-1, + max_val=1, + ), + "encoder_mask": None, + } + + @torch.no_grad() + def test_llama_mm_decoder(self): + model = MultimodalDecoder(self.model_args) + fixed_init_model(model, min_val=-1, max_val=1) + output = model(**self.input) + expected_shape = (self.batch_size, self.seq_len, self.vocab_size) + assert ( + output.shape == expected_shape + ), f"Expected shape {expected_shape}, but got 
{output.shape}" + + # TODO: Need to ensure numerical stability before doing convergence test. + # output.mean() = -0.0134, we need to debug why it is not close to -9.47548e-5, which is + # the test value from the original torch tune test + # assert torch.allclose( + # output.mean(), torch.tensor(-9.47548e-5), atol=1e-3, rtol=1e-3 + # ) diff --git a/torchtitan/experiments/multimodal/tests/test_utils.py b/torchtitan/experiments/multimodal/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3817db8699966a8d848ad744ccd6b6dabb3836 --- /dev/null +++ b/torchtitan/experiments/multimodal/tests/test_utils.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math + +from typing import Optional, Union + +import torch +from torch import nn + + +def fixed_init_tensor( + shape: torch.Size, + min_val: Union[float, int] = 0.0, + max_val: Union[float, int] = 1.0, + nonlinear: bool = False, + dtype: torch.dtype = torch.float, +): + """ + Utility for generating deterministic tensors of a given shape. In general stuff + like torch.ones, torch.eye, etc can result in trivial outputs. This utility + generates a range tensor [min_val, max_val) of a specified dtype, applies + a sine function if nonlinear=True, then reshapes to the appropriate shape. 
+ """ + n_elements = math.prod(shape) + step_size = (max_val - min_val) / n_elements + x = torch.arange(min_val, max_val, step_size, dtype=dtype) + x = x.reshape(shape) + if nonlinear: + return torch.sin(x) + return x + + +@torch.no_grad +def fixed_init_model( + model: nn.Module, + min_val: Union[float, int] = 0.0, + max_val: Union[float, int] = 1.0, + nonlinear: bool = False, + dtype: Optional[torch.dtype] = None, +): + """ + This utility initializes all parameters of a model deterministically using the + function fixed_init_tensor above. See that docstring for details of each parameter. + """ + for _, param in model.named_parameters(): + param.copy_( + fixed_init_tensor( + param.shape, + min_val=min_val, + max_val=max_val, + nonlinear=nonlinear, + dtype=param.dtype if dtype is None else dtype, + ) + ) diff --git a/torchtitan/experiments/multimodal/transform.py b/torchtitan/experiments/multimodal/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb0f989acd0b818f20116a60813c26e68438cec --- /dev/null +++ b/torchtitan/experiments/multimodal/transform.py @@ -0,0 +1,185 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, List, Mapping, Optional, Tuple + +import torch + +import torchvision +from torchvision.transforms.v2 import functional as F + +from utils import ( + find_supported_resolutions, + get_canvas_best_fit, + resize_with_pad, + tile_crop, +) + +from torchtitan.tools.logging import logger + + +class CLIPTransform: + """ + This class accepts images of any size and dynamically resizes, pads, normalizes and tiles it + based on the image aspect ratio and the number of image tiles we allow. + + The algorithm will NOT distort the image to fit a certain aspect ratio, because + that leads to a significant degradation in image quality. 
+ + The user can choose if they want to allow upscaling by using the flag ``resize_to_max_canvas``. + + For example, if an input image is of size 300x800, and we want to allow + a maximum of 16 image tiles, with side 224px, then: + + If ``resize_to_max_canvas=False``, then: + best_resolution = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling + image is NOT resized + image is padded (300, 800) -> 448,896 + Image is tiled 2x4, for a final output shape of (8, 3, 224, 224) + + If ``resize_to_max_canvas=True``, then: + best_resolution = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles + image is resized without distortion (300,800) -> (448, 1194) #448 is the limiting side for the resize + image is padded (448, 1194) -> (448, 1344) + Image is tiled 2x6, for a final output shape of (10, 3, 224, 224) + + Args: + image_mean (Optional[List[float]]): Mean values of each channel, used for normalization. + Should be the same used for the pre-trained model. If None, no normalization is performed. Default None. + image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization. + Should be the same used for the pre-trained model. If None, no normalization is performed. Default None. + possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width). + where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``. + If None, this will be calculated using max_num_tiles and tile_size. Default None. + tile_size (int): Size of the tiles to divide the image into. Default 224. + max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given. + Maximum number of tiles to break an image into. + This will be used to generate possible_resolutions, + e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224. + Default 4. 
+ dtype (torch.dtype): Data type of the output image. Default torch.bfloat16. + resample (str): Resampling method used when resizing images. Supports any enum of + ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic". + Default 'bilinear'. + resize_to_max_canvas (bool): "If True, the image will be upscaled without distortion to fit the largest possible + resolution from possible_resolutions. + If False, it will pick the resolution that minimizes downscaling, including no downscaling at all. + In this case, the image will only be upscaled if it's size < tile_size. Default False. + + Examples: + >>> image_transform = CLIPImageTransform( + ... image_mean=None, + ... image_std=None, + ... tile_size=224, + ... possible_resolutions=None, + ... max_num_tiles=4, + ... resample="bilinear", + ... resize_to_max_canvas=True, + ...) + >>> # create random image + >>> image = (np.random.rand(100,200,3) * 255).astype(np.uint8) + >>> image = PIL.Image.fromarray(image) + >>> output = image_transform(image) + >>> output['image'].shape # [num_tiles, num_channels, tile_size, tile_size] + torch.Size([2, 3, 224, 224]) + >>> output['ar'] # image best fits the canvas 224x448 + torch.tensor([1,2]) + """ + + def __init__( + self, + *, + image_mean: Optional[List[float]] = None, + image_std: Optional[List[float]] = None, + possible_resolutions: Optional[List[Tuple[int, int]]] = None, + tile_size: int = 224, + max_num_tiles: Optional[int] = 4, + dtype: torch.dtype = torch.bfloat16, + resample: str = "bilinear", + resize_to_max_canvas: bool = False, + ) -> None: + + # get_canvas_best_fit + assert ( + possible_resolutions is not None or max_num_tiles is not None + ), f"Either possible_resolutions or max_num_tiles must be given. 
Got {possible_resolutions} and {max_num_tiles}" + + # If possible_resolutions are not given, then calculate possible ones based on max_num_tiles + if not possible_resolutions and max_num_tiles: + possible_resolutions = find_supported_resolutions( + max_num_tiles=max_num_tiles, tile_size=tile_size + ) + else: + possible_resolutions = possible_resolutions + + self.possible_resolutions = torch.tensor(possible_resolutions).reshape(-1, 2) + logger.debug( + f"Found possible_resolutions: {self.possible_resolutions}. Will fit the images into the canvas with best fit." + ) + + self.resize_to_max_canvas = resize_to_max_canvas + + # normalize + assert (image_mean is None) == ( + image_std is None + ), f"Need to provide both or none of image_mean and image_std. Got {image_mean=} and {image_std=}" + self.mean = image_mean + self.std = image_std + + # resize_with_pad + self.max_size = None if resize_to_max_canvas else tile_size + self.dtype = dtype + self.resample = torchvision.transforms.InterpolationMode[resample.upper()] + + # tile_crop + self.tile_size = tile_size + + def __call__(self, image: torch.Tensor) -> Mapping[str, Any]: + """ + Apply image decoding and transformations to the "image" field in the sample. + + Args: + sample (Mapping[str, Any]): A sample with an "image" field containing + a List[Message] to tokenize + + Returns: + Mapping[str, Any]: The sample with an updated "image" filed and added + "aspect_ratio" field. + """ + assert isinstance(image, torch.Tensor), "Input image must be a torch.Tensor." 
+ + image = F.to_image(image) + image = F.grayscale_to_rgb_image(image) + image = F.to_dtype(image, dtype=self.dtype, scale=True) + + # Find the best canvas to fit the image without distortion + best_resolution = get_canvas_best_fit( + image=image, + possible_resolutions=self.possible_resolutions, + resize_to_max_canvas=self.resize_to_max_canvas, + ) + + # resize without distortion + pad to fit best_resolution + image = resize_with_pad( + image=image, + target_size=best_resolution, + resample=self.resample, + max_size=self.max_size, + ) + + # Normalize + if self.mean: + image = F.normalize(image, mean=self.mean, std=self.std) + + # Divide the image into equally sized tiles + image = tile_crop(image=image, tile_size=self.tile_size) + + aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size + + return { + "image": image, + "aspect_ratio": aspect_ratio, + } diff --git a/torchtitan/experiments/multimodal/utils.py b/torchtitan/experiments/multimodal/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c927772a5ef95ba65123c9387de4ead1e732490f --- /dev/null +++ b/torchtitan/experiments/multimodal/utils.py @@ -0,0 +1,437 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math + +from collections import defaultdict + +from pathlib import Path +from typing import List, Optional, Set, Tuple, Union +from urllib import request + +import torch +import torchvision +from torchvision.transforms.v2 import functional as F + +# NOTE Copied from torchtune.modules.transforms.vision_utils.tile_crop.py +def tile_crop(image: torch.Tensor, tile_size: int) -> torch.Tensor: + """ + Divides a tensor into equally sized tiles. The tensor should be divisible by tile_size. + + Args: + image (torch.Tensor): Input image to crop into tiles. + tile_size (int): Size of each tile. 
# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
def resize_with_pad(
    image: torch.Tensor,
    target_size: Tuple[int, int],
    resample: torchvision.transforms.InterpolationMode,
    max_size: Optional[int] = None,
) -> torch.Tensor:
    """
    Resize an image without distortion, then pad it to ``target_size``.

    ``max_size`` can be used to limit upscaling when ``target_size`` is larger
    than the image.

    Args:
        image (torch.Tensor): The input image tensor in the format [..., H, W].
        target_size (Tuple[int, int]): The desired resolution to fit the image
            into, in the format [height, width].
        resample (torchvision.transforms.InterpolationMode): Resampling method
            used when resizing images. Supports InterpolationMode.NEAREST,
            InterpolationMode.NEAREST_EXACT, InterpolationMode.BILINEAR and
            InterpolationMode.BICUBIC.
        max_size (Optional[int]): The maximum size to upscale the image to.
            If None, will upscale up to target_size.

    Returns:
        torch.Tensor: The resized and padded image tensor in the format [..., H, W].

    Examples:

        Example 1: The image will be upscaled from (300, 800) to (448, 1194),
        since 448 is the limiting side, and then padded to (448, 1344).

            >>> max_size = None
            >>> image = torch.rand([3, 300, 800])
            >>> target_size = (448, 1344)
            >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
            >>> output = resize_with_pad(image, target_size, resample, max_size)

        Example 2: The image will stay as is, since 800 > 600, and then be
        padded from (300, 800) to (448, 1344).

            >>> max_size = 600
            >>> image = torch.rand([3, 300, 800])
            >>> target_size = (448, 1344)
            >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
            >>> output = resize_with_pad(image, target_size, resample, max_size)

        Example 3: The image will be downscaled from (500, 1000) to (224, 448),
        and padded from (224, 448) to (448, 448).

            >>> max_size = 600
            >>> image = torch.rand([3, 500, 1000])
            >>> target_size = (448, 448)
            >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
            >>> output = resize_with_pad(image, target_size, resample, max_size)
    """
    image_height, image_width = image.shape[-2:]

    if max_size is None:
        # No cap requested: the image may grow all the way to target_size.
        resize_bounds = target_size
    else:
        # Cap upscaling at max_size per side, but never exceed target_size.
        resize_bounds = (
            min(max(image_height, max_size), target_size[0]),
            min(max(image_width, max_size), target_size[1]),
        )

    fitted_size = _get_max_res_without_distortion(
        image_size=(image_height, image_width),
        target_size=resize_bounds,
    )

    resized = F.resize(
        inpt=image,
        size=list(fitted_size),
        interpolation=resample,
        antialias=True,
    )

    # Padding always goes up to the full target_size, not the capped bounds.
    return _pad_image_top_left(image=resized, target_size=target_size)
# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
def _pad_image_top_left(
    image: torch.Tensor,
    target_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Anchor the image at the top-left of a ``target_size`` canvas, padding the
    right and bottom edges.

    Args:
        image (torch.Tensor): The input image tensor in the format [..., H, W].
        target_size (Tuple[int, int]): The desired resolution to fit the image
            into, in the format [height, width].

    Returns:
        torch.Tensor: The padded image tensor in the format [..., H, W].
    """
    height, width = image.shape[-2:]
    target_height, target_width = target_size

    # Padding order is [left, top, right, bottom]; left/top stay at 0 so the
    # image remains anchored at the top-left corner.
    # NOTE(review): when target_size is smaller than the image the right/bottom
    # amounts are negative, which is presumably what crops the image — confirm
    # against torchvision's F.pad.
    return F.pad(
        inpt=image,
        padding=[0, 0, target_width - width, target_height - height],
    )
+ """ + + image_size = image.shape[-2:] + + height, width = image_size + target_height, target_width = target_size + + pad_x = target_width - width + pad_y = target_height - height + + padding = [0, 0, pad_x, pad_y] + return F.pad(inpt=image, padding=padding) + + +# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py +def _get_max_res_without_distortion( + image_size: Tuple[int, int], + target_size: Tuple[int, int], +) -> Tuple[int, int]: + """ + Determines the maximum resolution to which an image can be resized to without distorting its + aspect ratio, based on the target resolution. + + For example, if image_size = (200,400) and target_size = (600,800), + scale_h = 600/200 = 3 + scale_w = 800/400 = 2 + So the maximum that we can upscale without distortion is min(scale_h, scale_w) = 2 + + Since scale_w is the limiting side, then new_w = target_w, and new_h = old_h*scale_w + + Args: + image_size (Tuple[int, int]): The original resolution of the image. + target_size (Tuple[int, int]): The desired resolution to fit the image into. + Returns: + Tuple[int, int]: The optimal dimensions to which the image should be resized. + Examples: + >>> _get_max_res_without_distortion([200, 300], target_size = (450, 200)) + (133, 200) + >>> _get_max_res_without_distortion([800, 600], target_size = (450, 1300)) + (450, 337) + """ + + original_height, original_width = image_size + target_height, target_width = target_size + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.floor(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.floor(original_width * scale_h), target_width) + + return new_height, new_width + + +# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py +def _get_factors(n: int) -> Set[int]: + """ + Calculate all factors of a given number, i.e. 
a divisor that leaves no remainder. + + Args: + n (int): The number to find factors for. + + Returns: + set: A set containing all factors of the number. + + Examples: + >>> _get_factors(n=12) + {1, 2, 3, 4, 6, 12} + """ + factors_set = set() + + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + factors_set.add(i) + factors_set.add(n // i) + return factors_set + + +# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py +def get_canvas_best_fit( + image: torch.Tensor, possible_resolutions: torch.Tensor, resize_to_max_canvas: bool +) -> Tuple[int, int]: + """ + Determines the best canvas possible from a list of possible resolutions to + resize an image to, without distortion. + + For each possible resolution, calculates the scaling factors for + width and height, and selects the smallest one, which is the limiting side. + E.g. if to match a canvas shape you have to upscale an image's height by 2x, and width by 1.5x, + then the maximum upscaling without distortion is min(2, 1.5) = 1.5. + + If there are multiple canvases that satisfy the conditions, + we pick the one with the lowest area to minimize padding. + + Args: + image (torch.Tensor): The image we want to fit into a canvas. + possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each + row represents a possible canvas. + resize_to_max_canvas (bool): If True, pick the canvas that allows maximum scaling. + If False, pick the canvas that minimizes downscaling, including no downscaling at all. + + Returns: + Tuple[int, int]: The best resolution to fit the image into. + + Examples: + >>> image = torch.rand(3, 200, 300) + >>> possible_resolutions = torch.tensor([ + ... [224, 672], + ... [672, 224], + ... [224, 448], + ... [448, 224], + ... [224, 224] + ... 
# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
def get_canvas_best_fit(
    image: torch.Tensor, possible_resolutions: torch.Tensor, resize_to_max_canvas: bool
) -> Tuple[int, int]:
    """
    Pick the canvas, among ``possible_resolutions``, that an image should be
    resized into without distortion.

    For every candidate canvas the width and height scaling factors are
    computed and the smaller one (the limiting side) is kept. E.g. if matching
    a canvas requires upscaling the height by 2x and the width by 1.5x, the
    maximum upscaling without distortion is min(2, 1.5) = 1.5.

    If several canvases share the selected scale, the one with the smallest
    area is returned to minimize padding.

    Args:
        image (torch.Tensor): The image we want to fit into a canvas.
        possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each
            row represents a possible canvas as (height, width).
        resize_to_max_canvas (bool): If True, pick the canvas that allows maximum
            scaling. If False, pick the canvas that minimizes downscaling,
            including no downscaling at all.

    Returns:
        Tuple[int, int]: The best resolution to fit the image into.

    Examples:
        >>> image = torch.rand(3, 200, 300)
        >>> possible_resolutions = torch.tensor([
        ...     [224, 672],
        ...     [672, 224],
        ...     [224, 448],
        ...     [448, 224],
        ...     [224, 224]
        ... ])
        >>> get_canvas_best_fit(image, possible_resolutions, resize_to_max_canvas=False)
        (224, 448)

        In the example above, the scaling factors for each resolution are

        >>> scale_height = torch.tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200])
        >>> scale_width = torch.tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467])
        >>> scales = torch.tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467])

        Two options have a scale >= 1; since resize_to_max_canvas is False,
        the smallest of them is selected

        >>> upscaling_options = torch.tensor([1.1200, 1.1200])
        >>> selected_scale = torch.tensor(1.1200)

        Both options share that scale, so the one with the smallest area wins

        >>> areas = torch.tensor([150528, 100352])  # for [224, 672] and [224, 448]
        >>> optimal_canvas = torch.tensor([224, 448])
    """
    original_height, original_width = image.shape[-2:]

    # Per-canvas scale needed along each axis.
    scale_h = possible_resolutions[:, 0] / original_height
    scale_w = possible_resolutions[:, 1] / original_width

    # The limiting side decides how far we can scale without distortion.
    scales = torch.where(scale_w > scale_h, scale_h, scale_w)

    can_upscale = scales >= 1
    if can_upscale.any():
        pick = torch.max if resize_to_max_canvas else torch.min
        selected_scale = pick(scales[can_upscale])
    else:
        # Every canvas requires shrinking: take the mildest downscale.
        selected_scale = torch.max(scales[scales < 1])

    # All canvases that allow exactly the selected scale.
    matching = possible_resolutions[scales == selected_scale]

    if len(matching) > 1:
        # Tie-break on area to keep padding to a minimum.
        areas = matching[:, 0] * matching[:, 1]
        best = matching[torch.argmin(areas)]
    else:
        best = matching[0]

    return tuple(best.tolist())
# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
def find_supported_resolutions(
    max_num_tiles: int, tile_size: int
) -> List[Tuple[int, int]]:
    """
    Enumerate every resolution, in multiples of ``tile_size``, that uses at most
    ``max_num_tiles`` tiles. Useful when dividing an image into tiles.

    For example, with at most 2 tiles per image the supported resolutions are
    (1x1, 1x2, 2x1) * tile_size.

    Args:
        max_num_tiles (int): Maximum number of tiles.
        tile_size (int): Size of the side of the tile.

    Returns:
        List[Tuple[int, int]]: List of possible resolutions as tuples (height, width).
            Resolutions are grouped by aspect ratio, in the order the aspect
            ratios are first encountered while counting tiles down from
            ``max_num_tiles``.

    Examples:

        >>> max_num_tiles = 4
        >>> tile_size = 224
        >>> find_supported_resolutions(max_num_tiles, tile_size)
        [(224, 896), (448, 448), (224, 224), (896, 224), (224, 672), (672, 224), (224, 448), (448, 224)]
    """
    # {aspect_ratio: [(rows, cols), ...]}
    # e.g. {0.25: [(1, 4)], 1.0: [(2, 2), (1, 1)], 4.0: [(4, 1)]}
    grids_by_aspect_ratio = defaultdict(list)
    for num_tiles in range(max_num_tiles, 0, -1):
        for rows in sorted(_get_factors(num_tiles)):
            cols = num_tiles // rows
            grids_by_aspect_ratio[rows / cols].append((rows, cols))

    # Scale each tile grid up to pixel resolution.
    return [
        (rows * tile_size, cols * tile_size)
        for grids in grids_by_aspect_ratio.values()
        for rows, cols in grids
    ]
# NOTE Copied from torchtune.data._utils.py
def load_image(image_loc: Union[Path, str]) -> torch.Tensor:
    """
    Convenience method to load an image in torch.Tensor format from a local file path or remote source.

    Args:
        image_loc (Union[Path, str]): Local file path or remote source pointing to the image
            which will be loaded as an RGB tensor.

    Note:
        If loading an image from a remote source, the function expects the URL provided in ``image_loc``
        to start with "http" or "https" e.g. "https://www.wikipedia.org/en/bird.jpg".

    Raises:
        ValueError: If the image cannot be loaded from remote source, **or**
            if the image cannot be opened as a :class:`~torch.Tensor`.

    Examples:
        >>> # Load from remote source
        >>> image = load_image("https://www.wikipedia.org/en/bird.jpg")

        >>> # Load from local file path
        >>> image = load_image(Path("/home/user/bird.jpg"))

    Returns:
        torch.Tensor: The loaded image.
    """

    # Strings starting with "http" are treated as remote; everything else is
    # handed to the decoder as a local location.
    if isinstance(image_loc, str) and image_loc.startswith("http"):
        try:
            # Use the response as a context manager so the underlying
            # connection is always released, even if read() or decoding fails.
            with request.urlopen(image_loc) as response:
                image_bytes = response.read()
            image = torchvision.io.decode_image(
                torch.frombuffer(image_bytes, dtype=torch.uint8),
                mode="RGB",
            )
        except Exception as e:
            raise ValueError("Failed to load remote image as torch.Tensor") from e

    # Open the local image as a Tensor image
    else:
        try:
            image = torchvision.io.decode_image(image_loc, mode="RGB")
        except Exception as e:
            raise ValueError("Failed to load local image as torch.Tensor") from e

    return image
class SimpleFSDPTransformer(Transformer):
    """
    Llama3 ``Transformer`` subclass used by the SimpleFSDP experiment.

    Compared to the base class it does two things: it initializes the weights
    eagerly at construction time, and it runs every ``init_weights`` call
    inside the ``disable_data_parallel()`` context.
    """

    def __init__(self, model_args: TransformerModelArgs):
        super().__init__(model_args)
        # Initialize weights immediately after the modules are built.
        self.init_weights()

    def init_weights(self, *args, **kwargs):
        # NOTE(review): presumably data-parallel handling of parameters has to
        # be bypassed while they are (re)initialized — confirm against
        # simple_fsdp.disable_data_parallel.
        with disable_data_parallel():
            super().init_weights(*args, **kwargs)
def parallelize_llama(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """
    Apply activation checkpointing, SimpleFSDP data parallelism and
    torch.compile to the model, in that order.

    Tensor parallelism is not applied here yet (see the TODO below).

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.

    Args:
        model (nn.Module): The model to parallelize.
        world_mesh (DeviceMesh): Mesh that is indexed by the data-parallel
            dimension names to obtain the DP sub-mesh.
        parallel_dims (ParallelDims): Flags telling which parallelisms are enabled.
        job_config (JobConfig): Job configuration (activation checkpointing,
            mixed precision and compile settings are read from it).

    Returns:
        The parallelized (and possibly compiled) model.
    """
    # TODO(ruisizhang123): Add support for TP (on-going). The planned path is
    # the same as in torchtitan.models.llama3.parallelize_llama: reject async TP
    # without --training.compile, enable float8 tensorwise TP only for
    # non-rowwise float8 recipes, then call apply_tp on world_mesh["tp"].

    ac_config = job_config.activation_checkpoint
    if ac_config.mode != "none":
        apply_ac(model, ac_config)

    # apply data parallel
    uses_data_parallel = (
        parallel_dims.dp_replicate_enabled
        or parallel_dims.dp_shard_enabled
        or parallel_dims.cp_enabled
    )
    if uses_data_parallel:
        if not parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names, dp_mode = ("dp_shard_cp",), "fully_shard"
        elif parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
            dp_mode = "hybrid_shard"
        else:
            dp_mesh_dim_names, dp_mode = ("dp_replicate",), "replicate"

        training = job_config.training
        model = data_parallel(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            mode=dp_mode,
            ac_mode=ac_config.mode,
            mp_policy=MixedPrecisionPolicy(
                param_dtype=TORCH_DTYPE_MAP[training.mixed_precision_param],
                reduce_dtype=TORCH_DTYPE_MAP[training.mixed_precision_reduce],
            ),
        )
        logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)

    if job_config.training.compile:
        # Whole-model compile, with inductor's peak-memory reordering turned off.
        torch._inductor.config.reorder_for_peak_memory = False
        model = torch.compile(model, fullgraph=True)

    return model
a/torchtitan/models/__pycache__/attention.cpython-312.pyc b/torchtitan/models/__pycache__/attention.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bd8501a02d87fc1646b31bd8df09b34531abf20 Binary files /dev/null and b/torchtitan/models/__pycache__/attention.cpython-312.pyc differ diff --git a/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc b/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95e80b52715c0488bc26290ce159be14aef87949 Binary files /dev/null and b/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc differ diff --git a/torchtitan/models/llama3/parallelize_llama.py b/torchtitan/models/llama3/parallelize_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..ed2e6f0c78eb4acb3a4d561aa0717758fdf3b1c1 --- /dev/null +++ b/torchtitan/models/llama3/parallelize_llama.py @@ -0,0 +1,398 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file applies the PT-D parallelisms (except pipeline parallelism) and various +# training techniques (e.g. activation checkpointing and compile) to the Llama model. 
def parallelize_llama(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism to the model.

    The steps run in a fixed order: TP, then activation checkpointing, then
    per-block compile, then FSDP/HSDP (or DDP when only replication is enabled).

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.

    Args:
        model (nn.Module): The model to parallelize; it is modified in place.
        world_mesh (DeviceMesh): Mesh indexed by dimension name ("tp",
            "dp_replicate", "dp_shard_cp") to obtain the sub-meshes.
        parallel_dims (ParallelDims): Flags telling which parallelisms are enabled.
        job_config (JobConfig): Job configuration.

    Returns:
        The same ``model`` object after all transformations were applied.

    Raises:
        RuntimeError: If async TP is requested without ``training.compile``, or
            if DDP is requested on a mesh with more than one dimension.
        ValueError: If FlexAttention is combined with selective AC or with CP.
    """

    if parallel_dims.tp_enabled:
        if (
            job_config.parallelism.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")

        enable_float8_linear = "float8" in job_config.model.converters
        float8_is_rowwise = job_config.float8.recipe_name in (
            "rowwise",
            "rowwise_with_gw_hp",
        )

        # For now, float8 all-gather with TP is only supported for tensorwise
        # float8 scaling recipes. For rowwise recipes, we use regular TP and
        # all-gather happens in high precision.
        enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise

        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=parallel_dims.loss_parallel_enabled,
            enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
            enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
        )

    # Reject FlexAttention combinations that are not supported yet, before any
    # further wrapping is applied.
    if job_config.model.use_flex_attn:
        if job_config.activation_checkpoint.mode == "selective":
            raise ValueError(
                "FlexAttention is not compatible with selective AC yet. "
                "See https://github.com/pytorch/pytorch/issues/147879"
            )

        if parallel_dims.cp_enabled:
            raise ValueError(
                "FlexAttention is not compatible with CP yet. "
                "We are still working on this."
            )

    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-TransformerBlock compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

    if (
        parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
    ):  # apply FSDP or HSDP, potentially with Context Parallel
        # With replication enabled the mesh is 2D (HSDP); otherwise it is the
        # single "dp_shard_cp" dimension (FSDP).
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        # Replication only: fall back to DDP, which is limited to a 1D mesh.
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP has not supported > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
        )

    return model
def apply_tp(
    model: nn.Module,
    tp_mesh: DeviceMesh,
    loss_parallel: bool,
    enable_float8_tensorwise_tp: bool,
    enable_async_tp: bool,
):
    """
    Apply tensor parallelism (with sequence parallelism) to the model.

    Args:
        model (nn.Module): Model exposing ``tok_embeddings``, ``norm``,
            ``output`` and a ``layers`` container of transformer blocks.
        tp_mesh (DeviceMesh): The tensor-parallel device mesh.
        loss_parallel (bool): If True, keep the output sharded on the last dim
            instead of replicating it.
        enable_float8_tensorwise_tp (bool): If True, use the float8 parallel
            styles from torchao for the transformer block linears.
        enable_async_tp (bool): If True, turn on inductor micro-pipelined TP and
            symmetric memory for the TP process group.
    """
    # Root-level plan:
    # 1. Parallelize the embedding and shard its outputs (which are the first
    #    transformer block's inputs)
    # 2. Parallelize the root norm layer over the sequence dim
    # 3. Parallelize the final linear output layer
    root_plan = {
        "tok_embeddings": RowwiseParallel(
            input_layouts=Replicate(),
            output_layouts=Shard(1),
        ),
        "norm": SequenceParallel(),
        "output": ColwiseParallel(
            input_layouts=Shard(1),
            output_layouts=Shard(-1) if loss_parallel else Replicate(),
            use_local_output=not loss_parallel,
        ),
    }
    parallelize_module(model, tp_mesh, root_plan)

    # Parallel styles used for transformer block linear weights and their
    # inputs may be different for float8 linears with tensorwise scaling.
    if enable_float8_tensorwise_tp:
        # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
        from torchao.float8.float8_tensor_parallel import (
            Float8ColwiseParallel as colwise_parallel,
            Float8RowwiseParallel as rowwise_parallel,
            PrepareFloat8ModuleInput as prepare_module_input,
        )
    else:
        colwise_parallel = ColwiseParallel
        rowwise_parallel = RowwiseParallel
        prepare_module_input = PrepareModuleInput

    # Apply tensor + sequence parallelism to every transformer block
    # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
    #       by folding (and unfolding) the batch dimension and the sequence dimension.
    #       Examples can be found at https://github.com/pytorch/torchtitan/pull/437
    for transformer_block in model.layers.values():
        # A fresh plan (with fresh style objects) is built for each block.
        parallelize_module(
            module=transformer_block,
            device_mesh=tp_mesh,
            parallelize_plan={
                "attention_norm": SequenceParallel(),
                "attention": prepare_module_input(
                    input_layouts=(Shard(1), None),
                    desired_input_layouts=(Replicate(), None),
                ),
                "attention.wq": colwise_parallel(),
                "attention.wk": colwise_parallel(),
                "attention.wv": colwise_parallel(),
                "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
                "ffn_norm": SequenceParallel(),
                "feed_forward": prepare_module_input(
                    input_layouts=(Shard(1),),
                    desired_input_layouts=(Replicate(),),
                ),
                "feed_forward.w1": colwise_parallel(),
                "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)),
                "feed_forward.w3": colwise_parallel(),
            },
        )

    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

        torch._inductor.config._micro_pipeline_tp = True
        enable_symm_mem_for_group(tp_mesh.get_group().group_name)

    float8_tag = "Float8 tensorwise " if enable_float8_tensorwise_tp else ""
    async_tag = "Async " if enable_async_tp else ""
    logger.info(f"Applied {float8_tag}{async_tag}Tensor Parallelism to the model")
# for selective op activation checkpointing
# Ops in this set have their outputs saved (not recomputed) by the op-level
# selective AC policy below, except for every second mm.
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten._scaled_dot_product_efficient_attention.default,
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    # for low precision training, it's useful to always save
    # the result of max, since the absolute maximum is
    # used to compute the scaling factor for quantization.
    torch.ops.aten.max.default,
}


def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
    """
    Wrap a single transformer block with activation checkpointing.

    Args:
        module (nn.Module): The transformer block to wrap.
        ac_config: Activation checkpointing config. ``mode`` must be "full" or
            "selective"; for "selective", ``selective_ac_option`` must be "op"
            or a string of digits giving the layer frequency.

    Returns:
        The checkpoint-wrapped module, or ``module`` itself when layer-level
        selective AC decides to leave this block unwrapped.

    Raises:
        ValueError: If ``mode`` or ``selective_ac_option`` is not supported.
    """
    valid_ac_modes = ("full", "selective")
    if ac_config.mode not in valid_ac_modes:
        raise ValueError(
            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
        )

    if ac_config.mode == "full":
        return ptd_checkpoint_wrapper(module, preserve_rng_state=False)

    assert ac_config.mode == "selective", f"{ac_config.mode}"
    use_op_sac = ac_config.selective_ac_option == "op"
    use_layer_sac = ac_config.selective_ac_option.isdigit()
    if not use_op_sac and not use_layer_sac:
        raise ValueError(
            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
            f"Valid options: 'op' or a positive int representing layer frequency"
        )
    if use_op_sac:
        from torch.utils.checkpoint import (
            CheckpointPolicy,
            create_selective_checkpoint_contexts,
        )

        def _get_custom_policy(meta):
            # `meta` holds the mm counters; forward and recompute passes are
            # counted under separate keys.
            def _custom_policy(ctx, func, *args, **kwargs):
                mode = "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
                    meta[mm_count_key] += 1
                # Saves output of all compute ops, except every second mm
                to_save = func in _save_list and not (
                    func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
                )
                return (
                    CheckpointPolicy.MUST_SAVE
                    if to_save
                    else CheckpointPolicy.PREFER_RECOMPUTE
                )

            return _custom_policy

        def selective_checkpointing_context_fn():
            # A fresh counter dict is created every time the context is built.
            meta = defaultdict(int)
            return create_selective_checkpoint_contexts(_get_custom_policy(meta))

        return ptd_checkpoint_wrapper(
            module,
            context_fn=selective_checkpointing_context_fn,
            preserve_rng_state=False,
        )
    elif use_layer_sac:
        # Checkpoint every `ac_freq` of the modules passed to this function
        ac_freq = int(ac_config.selective_ac_option)
        # The call counter lives as an attribute on the wrapper function
        # itself, so it is shared by every call to this function in the process.
        ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
        ptd_checkpoint_wrapper._count += 1
        # A frequency of 0 wraps every block.
        if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
            return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
        else:
            return module
def apply_ac(model: nn.Module, ac_config):
    """
    Apply activation checkpointing to every child of ``model.layers``.

    Each block is passed through ``_apply_ac_to_transformer_block`` and the
    result is registered back on ``model.layers`` under the same name.

    Args:
        model (nn.Module): Model whose ``layers`` container holds the
            transformer blocks.
        ac_config: Activation checkpointing config forwarded to the per-block
            helper.
    """
    layers = model.layers
    for name, block in layers.named_children():
        layers.register_module(name, _apply_ac_to_transformer_block(block, ac_config))

    logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")
def apply_compile(model: nn.Module):
    """
    Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
    repeated structure. Alternatively one can compile the whole model (after applying DP).

    Args:
        model (nn.Module): Model whose ``layers`` children are replaced in place
            by their compiled versions.
    """
    layers = model.layers
    for name, block in layers.named_children():
        layers.register_module(name, torch.compile(block, fullgraph=True))

    logger.info("Compiling each TransformerBlock with torch.compile")


def apply_fsdp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    param_dtype: torch.dtype,
    reduce_dtype: torch.dtype,
    pp_enabled: bool,
    cpu_offload: bool = False,
    reshard_after_forward_policy: str = "default",
):
    """
    Apply data parallelism (via FSDP2) to the model.

    Every transformer block in ``model.layers`` is sharded individually, then
    the root model is sharded.

    Args:
        model (nn.Module): The model to apply data parallelism to.
        dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
        param_dtype (torch.dtype): The data type to use for model parameters.
        reduce_dtype (torch.dtype): The data type to use for reduction operations.
        pp_enabled (bool): Whether pipeline parallelism is enabled.
        cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
        reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass.
            Defaults to "default". Other options: "never", "always".
            - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
            - "always" will enable `reshard_after_forward` for all forward passes.
            - "never" will disable `reshard_after_forward` for all forward passes.

    Raises:
        ValueError: If ``reshard_after_forward_policy`` is not one of the
            options above (detected when the first block is processed).
    """
    fsdp_config = {
        "mesh": dp_mesh,
        "mp_policy": MixedPrecisionPolicy(
            param_dtype=param_dtype, reduce_dtype=reduce_dtype
        ),
    }
    if cpu_offload:
        fsdp_config["offload_policy"] = CPUOffloadPolicy()

    def _reshard_after_forward(layer_id) -> bool:
        # Resolve the policy for one transformer block.
        if reshard_after_forward_policy == "always":
            return True
        if reshard_after_forward_policy == "never":
            return False
        if reshard_after_forward_policy != "default":
            raise ValueError(
                f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
            )
        if pp_enabled:
            # For PP, do not reshard after forward to avoid per-microbatch
            # all-gathers, which can be expensive and non-overlapped
            return False
        # As an optimization, do not reshard after forward for the last
        # transformer block since FSDP would prefetch it immediately
        return int(layer_id) < len(model.layers) - 1

    for layer_id, transformer_block in model.layers.items():
        fully_shard(
            transformer_block,
            **fsdp_config,
            reshard_after_forward=_reshard_after_forward(layer_id),
        )
    fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)
def apply_ddp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    enable_compile: bool,
    enable_compiled_autograd: bool,
):
    """
    Apply DDP to the model through the composable ``replicate`` API.

    Args:
        model (nn.Module): The model to replicate.
        dp_mesh (DeviceMesh): The data-parallel device mesh.
        enable_compile (bool): Whether torch.compile is in use. When True,
            dynamo's ``optimize_ddp`` setting is chosen to match.
        enable_compiled_autograd (bool): Whether compiled autograd is in use;
            only consulted when ``enable_compile`` is True.
    """
    if enable_compile:
        torch._dynamo.config.optimize_ddp = (
            "python_reducer_without_compiled_forward"
            if enable_compiled_autograd
            else "ddp_optimizer"
        )

    replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)

    logger.info("Applied DDP to the model")